From 8f6d68365e0dccba13c588dd4180ea18fc9cda09 Mon Sep 17 00:00:00 2001
From: David Robillard
Date: Mon, 28 Jun 2021 20:59:28 -0400
Subject: Factor out NTriples reader

---
 src/read_ntriples.h | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 src/read_ntriples.h

(limited to 'src/read_ntriples.h')

diff --git a/src/read_ntriples.h b/src/read_ntriples.h
new file mode 100644
index 00000000..d3a74924
--- /dev/null
+++ b/src/read_ntriples.h
@@ -0,0 +1,231 @@
+/*
+  Copyright 2011-2021 David Robillard
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef SERD_READ_NTRIPLES_H
+#define SERD_READ_NTRIPLES_H
+
+#include "serd/serd.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+// Utilities
+
+/**
+   Return true iff `c` is in the inclusive range [`min`, `max`].
+*/
+static inline bool
+codepoint_in_range(const uint32_t c, const uint32_t min, const uint32_t max)
+{
+  return c >= min && c <= max;
+}
+
+/**
+   Return true iff `c` is a basic prefixed name character.
+
+   RDF 1.1 NTriples: [157s] PN_CHARS_BASE
+*/
+static inline bool
+is_PN_CHARS_BASE(const uint32_t c)
+{
+  return (codepoint_in_range(c, 'A', 'Z') || codepoint_in_range(c, 'a', 'z') ||
+          codepoint_in_range(c, 0x000C0u, 0x000D6u) ||
+          codepoint_in_range(c, 0x000D8u, 0x000F6u) ||
+          codepoint_in_range(c, 0x000F8u, 0x002FFu) ||
+          codepoint_in_range(c, 0x00370u, 0x0037Du) ||
+          codepoint_in_range(c, 0x0037Fu, 0x01FFFu) ||
+          codepoint_in_range(c, 0x0200Cu, 0x0200Du) ||
+          codepoint_in_range(c, 0x02070u, 0x0218Fu) ||
+          codepoint_in_range(c, 0x02C00u, 0x02FEFu) ||
+          codepoint_in_range(c, 0x03001u, 0x0D7FFu) ||
+          codepoint_in_range(c, 0x0F900u, 0x0FDCFu) ||
+          codepoint_in_range(c, 0x0FDF0u, 0x0FFFDu) ||
+          codepoint_in_range(c, 0x10000u, 0xEFFFFu));
+}
+
+/**
+   Read one (possibly multi-byte) character.
+
+   The caller must have already eaten the first byte, `c`.
+*/
+SerdStatus
+read_character(SerdReader* reader, SerdNode* dest, uint8_t c);
+
+// Terminals
+
+/**
+   Read a language tag starting after the '@'.
+
+   RDF 1.1 NTriples: [144s] LANGTAG
+*/
+SerdStatus
+read_LANGTAG(SerdReader* reader);
+
+/**
+   Read an end of line.
+
+   RDF 1.1 NTriples: [7] EOL
+*/
+SerdStatus
+read_EOL(SerdReader* reader);
+
+/**
+   Read an absolute IRI.
+
+   This is a stricter subset of [8] IRIREF in the NTriples grammar, since a
+   scheme is required. Handling this in the parser results in better error
+   messages.
+*/
+SerdStatus
+read_IRI(SerdReader* reader, SerdNode** dest);
+
+/**
+   Read an IRI reference suffix into an existing node.
+
+   RDF 1.1 NTriples: [8] IRIREF
+*/
+SerdStatus
+read_IRIREF_suffix(SerdReader* reader, SerdNode* node);
+
+/**
+   Read a string that is single-quoted with the given character.
+
+   RDF 1.1 NTriples: [9] STRING_LITERAL_QUOTE
+   RDF 1.1 Turtle: [23] STRING_LITERAL_SINGLE_QUOTE
+*/
+SerdStatus
+read_STRING_LITERAL(SerdReader* reader, SerdNode* ref, uint8_t q);
+
+/**
+   Read a blank node label that comes after "_:".
+
+   RDF 1.1 NTriples: [141s] BLANK_NODE_LABEL
+*/
+SerdStatus
+read_BLANK_NODE_LABEL(SerdReader* reader, SerdNode** dest, bool* ate_dot);
+
+/**
+   Read an escape like "u201C", starting after the initial backslash.
+
+   RDF 1.1 NTriples: [10] UCHAR
+*/
+SerdStatus
+read_UCHAR(SerdReader* reader, SerdNode* node, uint32_t* code_point);
+
+/**
+   Read an escape like "n", starting after the initial backslash.
+
+   RDF 1.1 NTriples: [153s] ECHAR
+*/
+SerdStatus
+read_ECHAR(SerdReader* reader, SerdNode* dest);
+
+/**
+   Read a basic prefixed name character.
+
+   RDF 1.1 NTriples: [157s] PN_CHARS_BASE
+*/
+SerdStatus
+read_PN_CHARS_BASE(SerdReader* reader, SerdNode* dest);
+
+/**
+   Read an initial prefixed name character.
+
+   RDF 1.1 NTriples: [158s] PN_CHARS_U
+*/
+SerdStatus
+read_PN_CHARS_U(SerdReader* reader, SerdNode* dest);
+
+/**
+   Read any prefixed name character.
+
+   RDF 1.1 NTriples: [160s] PN_CHARS
+*/
+SerdStatus
+read_PN_CHARS(SerdReader* reader, SerdNode* dest);
+
+/**
+   Read a single hexadecimal digit.
+
+   RDF 1.1 NTriples: [162s] HEX
+*/
+uint8_t
+read_HEX(SerdReader* reader);
+
+/**
+   Read a variable name, starting after the '?' or '$'.
+
+   This is an extension that serd uses in certain contexts to support patterns.
+
+   Restricted version of SPARQL 1.1: [166] VARNAME
+*/
+SerdStatus
+read_VARNAME(SerdReader* reader, SerdNode** dest);
+
+// Nonterminals
+
+/**
+   Read a comment that starts with '#' and ends with the line.
+
+   Not described by a rule in the grammar since RDF 1.1.
+*/
+SerdStatus
+read_comment(SerdReader* reader);
+
+/**
+   Read a subject (IRI or blank).
+
+   RDF 1.1 NTriples: [3] subject
+*/
+SerdStatus
+read_nt_subject(SerdReader* reader, SerdNode** dest);
+
+/**
+   Read a predicate (IRI).
+
+   RDF 1.1 NTriples: [4] predicate
+*/
+SerdStatus
+read_nt_predicate(SerdReader* reader, SerdNode** dest);
+
+/**
+   Read an object (IRI or blank or literal).
+
+   RDF 1.1 NTriples: [5] object
+*/
+SerdStatus
+read_nt_object(SerdReader* reader, SerdNode** dest, bool* ate_dot);
+
+/**
+   Read a variable that starts with '?' or '$'.
+
+   This is an extension that serd uses in certain contexts to support
+   patterns.
+
+   Restricted version of SPARQL 1.1: [108] Var
+*/
+SerdStatus
+read_Var(SerdReader* reader, SerdNode** dest);
+
+/**
+   Read a complete NTriples document.
+
+   RDF 1.1 NTriples: [1] ntriplesDoc
+*/
+SerdStatus
+read_ntriplesDoc(SerdReader* reader);
+
+#endif // SERD_READ_NTRIPLES_H
-- 
cgit v1.2.1