diff options
author | David Robillard <d@drobilla.net> | 2021-06-28 20:59:28 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2022-01-14 19:37:51 -0500 |
commit | 8f6d68365e0dccba13c588dd4180ea18fc9cda09 (patch) | |
tree | c0a4101054593a15cc1afa00e2a0f202be1a7b75 | |
parent | 45902fbbaa11e8c38944b38182afb92bc0641ec9 (diff) | |
download | serd-8f6d68365e0dccba13c588dd4180ea18fc9cda09.tar.gz serd-8f6d68365e0dccba13c588dd4180ea18fc9cda09.tar.bz2 serd-8f6d68365e0dccba13c588dd4180ea18fc9cda09.zip |
Factor out NTriples reader
-rw-r--r-- | meson.build | 1 | ||||
-rw-r--r-- | src/n3.c | 563 | ||||
-rw-r--r-- | src/read_ntriples.c | 790 | ||||
-rw-r--r-- | src/read_ntriples.h | 223 | ||||
-rw-r--r-- | src/reader.c | 37 | ||||
-rw-r--r-- | src/reader.h | 18 | ||||
-rw-r--r-- | src/string_utils.h | 28 | ||||
-rw-r--r-- | test/bad/bad-blank-node-label.nt | 1 | ||||
-rw-r--r-- | test/bad/bad-trailing-garbage.nt | 1 | ||||
-rw-r--r-- | test/bad/manifest.ttl | 12 | ||||
-rw-r--r-- | test/good/manifest.ttl | 7 | ||||
-rw-r--r-- | test/lax/manifest.ttl | 7 | ||||
-rw-r--r-- | test/lax/test-out-of-range-unicode.nt (renamed from test/good/test-out-of-range-unicode.nt) | 0 | ||||
-rw-r--r-- | test/lax/test-out-of-range-unicode.ttl (renamed from test/good/test-out-of-range-unicode.ttl) | 0 | ||||
-rw-r--r-- | test/test_overflow.c | 3 | ||||
-rw-r--r-- | test/test_reader.c | 55 |
16 files changed, 1196 insertions, 550 deletions
diff --git a/meson.build b/meson.build index 00f2f7f6..07786b4b 100644 --- a/meson.build +++ b/meson.build @@ -91,6 +91,7 @@ sources = [ 'src/n3.c', 'src/node.c', 'src/nodes.c', + 'src/read_ntriples.c', 'src/read_utf8.c', 'src/reader.c', 'src/sink.c', @@ -18,12 +18,11 @@ #include "env.h" #include "namespaces.h" #include "node.h" -#include "read_utf8.h" +#include "read_ntriples.h" #include "reader.h" #include "stack.h" #include "string_utils.h" #include "try.h" -#include "uri_utils.h" #include "serd/serd.h" @@ -31,19 +30,8 @@ #include <stdbool.h> #include <stdint.h> #include <stdio.h> -#include <stdlib.h> #include <string.h> -#if defined(__clang__) && __clang_major__ >= 10 -# define SERD_FALLTHROUGH __attribute__((fallthrough)) -_Pragma("clang diagnostic push") -_Pragma("clang diagnostic ignored \"-Wmissing-declarations\"") -#elif defined(__GNUC__) && __GNUC__ >= 7 -# define SERD_FALLTHROUGH __attribute__((fallthrough)) -#else -# define SERD_FALLTHROUGH -#endif - static bool fancy_syntax(const SerdReader* const reader) { @@ -56,189 +44,31 @@ read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest); static SerdStatus read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); -static uint8_t -read_HEX(SerdReader* const reader) -{ - const int c = peek_byte(reader); - if (is_xdigit(c)) { - return (uint8_t)eat_byte_safe(reader, c); - } - - r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid hexadecimal digit `%c'", c); - return 0; -} - -// Read UCHAR escape, initial \ is already eaten by caller +// whitespace ::= #x9 | #xA | #xD | #x20 | comment static SerdStatus -read_UCHAR(SerdReader* const reader, - SerdNode* const dest, - uint32_t* const char_code) +read_whitespace(SerdReader* const reader) { - const int b = peek_byte(reader); - unsigned length = 0; - switch (b) { - case 'U': - length = 8; - break; - case 'u': - length = 4; - break; - default: - return SERD_ERR_BAD_SYNTAX; - } - eat_byte_safe(reader, b); - - uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - for (unsigned i = 0; i < length; ++i) { - if (!(buf[i] = read_HEX(reader))) { - return SERD_ERR_BAD_SYNTAX; - } - } - - char* endptr = NULL; - const uint32_t code = (uint32_t)strtoul((const char*)buf, &endptr, 16); - assert(endptr == (char*)buf + length); - - unsigned size = 0; - if (code < 0x00000080) { - size = 1; - } else if (code < 0x00000800) { - size = 2; - } else if (code < 0x00010000) { - size = 3; - } else if (code < 0x00110000) { - size = 4; - } else { - r_err( - reader, SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range", code); - - *char_code = 0xFFFD; - const SerdStatus st = push_bytes(reader, dest, replacement_char, 3); - return st ? st : SERD_SUCCESS; - } - - // Build output in buf - // (Note # of bytes = # of leading 1 bits in first byte) - uint32_t c = code; - switch (size) { - case 4: - buf[3] = (uint8_t)(0x80u | (c & 0x3Fu)); - c >>= 6; - c |= (16 << 12); // set bit 4 - SERD_FALLTHROUGH; - case 3: - buf[2] = (uint8_t)(0x80u | (c & 0x3Fu)); - c >>= 6; - c |= (32 << 6); // set bit 5 - SERD_FALLTHROUGH; - case 2: - buf[1] = (uint8_t)(0x80u | (c & 0x3Fu)); - c >>= 6; - c |= 0xC0; // set bits 6 and 7 - SERD_FALLTHROUGH; - case 1: - buf[0] = (uint8_t)c; - SERD_FALLTHROUGH; - default: - break; - } - - *char_code = code; - return push_bytes(reader, dest, buf, size); -} - -// Read ECHAR escape, initial \ is already eaten by caller -static SerdStatus -read_ECHAR(SerdReader* const reader, SerdNode* const dest) -{ - const int c = peek_byte(reader); - switch (c) { - case 't': - eat_byte_safe(reader, 't'); - return push_byte(reader, dest, '\t'); - case 'b': - eat_byte_safe(reader, 'b'); - return push_byte(reader, dest, '\b'); - case 'n': - dest->flags |= SERD_HAS_NEWLINE; - eat_byte_safe(reader, 'n'); - return push_byte(reader, dest, '\n'); - case 'r': - dest->flags |= SERD_HAS_NEWLINE; - eat_byte_safe(reader, 'r'); - return push_byte(reader, dest, '\r'); - case 'f': - eat_byte_safe(reader, 'f'); - return push_byte(reader, dest, '\f'); - case '\\': - case '"': - case '\'': - return push_byte(reader, dest, eat_byte_safe(reader, c)); - default: - return SERD_ERR_BAD_SYNTAX; - } -} - -// Read one character (possibly multi-byte) -// The first byte, c, has already been eaten by caller -static SerdStatus -read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c) -{ - if (!(c & 0x80)) { - switch (c) { - case 0xA: - case 0xD: - dest->flags |= SERD_HAS_NEWLINE; - break; - case '"': - case '\'': - dest->flags |= SERD_HAS_QUOTE; - break; - default: - break; - } - - return push_byte(reader, dest, c); - } - - return read_utf8_continuation(reader, dest, c); -} - -// [10] comment ::= '#' ( [^#xA #xD] )* -static void -read_comment(SerdReader* const reader) -{ - eat_byte_safe(reader, '#'); - int c = 0; - while (((c = peek_byte(reader)) != 0xA) && c != 0xD && c != EOF && c) { - eat_byte_safe(reader, c); - } -} - -// [24] ws ::= #x9 | #xA | #xD | #x20 | comment -static bool -read_ws(SerdReader* const reader) -{ - const int c = peek_byte(reader); - switch (c) { - case 0x9: - case 0xA: - case 0xD: - case 0x20: - eat_byte_safe(reader, c); - return true; + switch (peek_byte(reader)) { + case '\t': + case '\n': + case '\r': + case ' ': + eat_byte_safe(reader, peek_byte(reader)); + return SERD_SUCCESS; case '#': read_comment(reader); - return true; + return SERD_SUCCESS; default: - return false; + break; } + + return SERD_FAILURE; } static bool read_ws_star(SerdReader* const reader) { - while (read_ws(reader)) { + while (!read_whitespace(reader)) { } return true; @@ -301,44 +131,6 @@ read_STRING_LITERAL_LONG(SerdReader* const reader, return tolerate_status(reader, st) ? SERD_SUCCESS : st; } -// STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE -// Initial quote is already eaten by caller -static SerdStatus -read_STRING_LITERAL(SerdReader* const reader, - SerdNode* const ref, - const uint8_t q) -{ - SerdStatus st = SERD_SUCCESS; - - while (tolerate_status(reader, st)) { - const int c = peek_byte(reader); - uint32_t code = 0; - switch (c) { - case EOF: - return r_err(reader, SERD_ERR_BAD_SYNTAX, "end of file in short string"); - case '\n': - case '\r': - return r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string"); - case '\\': - eat_byte_safe(reader, c); - if ((st = read_ECHAR(reader, ref)) && - (st = read_UCHAR(reader, ref, &code))) { - return r_err(reader, st, "invalid escape `\\%c'", peek_byte(reader)); - } - break; - default: - if (c == q) { - eat_byte_safe(reader, q); - return SERD_SUCCESS; - } else { - st = read_character(reader, ref, (uint8_t)eat_byte_safe(reader, c)); - } - } - } - - return tolerate_status(reader, st) ? SERD_SUCCESS : st; -} - static SerdStatus read_String(SerdReader* const reader, SerdNode* const node) { @@ -373,65 +165,6 @@ read_String(SerdReader* const reader, SerdNode* const node) return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1); } -static bool -is_PN_CHARS_BASE(const uint32_t c) -{ - return ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00F6) || - (c >= 0x00F8 && c <= 0x02FF) || (c >= 0x0370 && c <= 0x037D) || - (c >= 0x037F && c <= 0x1FFF) || (c >= 0x200C && c <= 0x200D) || - (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) || - (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) || - (c >= 0xFDF0 && c <= 0xFFFD) || (c >= 0x10000 && c <= 0xEFFFF)); -} - -static SerdStatus -read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest) -{ - uint32_t code = 0; - const int c = peek_byte(reader); - SerdStatus st = SERD_SUCCESS; - if (is_alpha(c)) { - st = push_byte(reader, dest, eat_byte_safe(reader, c)); - } else if (c == EOF || !(c & 0x80)) { - return SERD_FAILURE; - } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { - return st; - } else if (!is_PN_CHARS_BASE(code)) { - r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name", code); - if (reader->strict) { - return SERD_ERR_BAD_SYNTAX; - } - } - return st; -} - -static bool -is_PN_CHARS(const uint32_t c) -{ - return (is_PN_CHARS_BASE(c) || c == 0xB7 || (c >= 0x0300 && c <= 0x036F) || - (c >= 0x203F && c <= 0x2040)); -} - -static SerdStatus -read_PN_CHARS(SerdReader* const reader, SerdNode* const dest) -{ - uint32_t code = 0; - const int c = peek_byte(reader); - SerdStatus st = SERD_SUCCESS; - if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { - st = push_byte(reader, dest, eat_byte_safe(reader, c)); - } else if (c == EOF || !(c & 0x80)) { - return SERD_FAILURE; - } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { - return st; - } else if (!is_PN_CHARS(code)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name", code); - } - return st; -} - static SerdStatus read_PERCENT(SerdReader* const reader, SerdNode* const dest) { @@ -592,67 +325,6 @@ read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest) return st; } -static SerdStatus -read_LANGTAG(SerdReader* const reader) -{ - int c = peek_byte(reader); - if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'", c); - } - - SerdNode* node = push_node(reader, SERD_LITERAL, "", 0); - if (!node) { - return SERD_ERR_OVERFLOW; - } - - SerdStatus st = SERD_SUCCESS; - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); - while ((c = peek_byte(reader)) && is_alpha(c)) { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); - } - while (peek_byte(reader) == '-') { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-'))); - while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); - } - } - return SERD_SUCCESS; -} - -static SerdStatus -read_IRIREF_scheme(SerdReader* const reader, SerdNode* const dest) -{ - int c = peek_byte(reader); - if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad IRI scheme start `%c'", c); - } - - SerdStatus st = SERD_SUCCESS; - while ((c = peek_byte(reader)) != EOF) { - if (c == '>') { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "missing IRI scheme"); - } - - if (!is_uri_scheme_char(c)) { - return r_err(reader, - SERD_ERR_BAD_SYNTAX, - "bad IRI scheme char U+%04X (%c)", - (unsigned)c, - (char)c); - } - - if ((st = push_byte(reader, dest, eat_byte_safe(reader, c)))) { - return st; - } - - if (c == ':') { - return SERD_SUCCESS; // End of scheme - } - } - - return SERD_FAILURE; -} - typedef struct { SerdReader* reader; SerdNode* node; @@ -714,6 +386,10 @@ resolve_IRIREF(SerdReader* const reader, static SerdStatus read_IRIREF(SerdReader* const reader, SerdNode** const dest) { + if (!fancy_syntax(reader)) { + return read_IRI(reader, dest); + } + SerdStatus st = SERD_SUCCESS; if ((st = eat_byte_check(reader, '<'))) { return st; @@ -725,68 +401,14 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) const size_t string_start_offset = reader->stack.size; - if (!fancy_syntax(reader) && (st = read_IRIREF_scheme(reader, *dest))) { - return r_err(reader, st, "expected IRI scheme"); - } - - uint32_t code = 0; - while (st <= SERD_FAILURE) { - const int c = eat_byte_safe(reader, peek_byte(reader)); - switch (c) { - case '"': - case '<': - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c); - case '>': - return (reader->flags & SERD_READ_RELATIVE) - ? SERD_SUCCESS - : resolve_IRIREF(reader, *dest, string_start_offset); - case '\\': - if (read_UCHAR(reader, *dest, &code)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape"); - } - switch (code) { - case 0: - case ' ': - case '<': - case '>': - return r_err(reader, - SERD_ERR_BAD_SYNTAX, - "invalid escaped IRI character U+%04X", - code); - default: - break; - } - break; - case '^': - case '`': - case '{': - case '|': - case '}': - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c); - default: - if (c <= 0x20) { - st = r_err(reader, - SERD_ERR_BAD_SYNTAX, - "invalid IRI character (escape %%%02X)", - (unsigned)c); - if (reader->strict) { - break; - } - - if (!(st = push_byte(reader, *dest, c))) { - st = SERD_FAILURE; - } - } else if (!(c & 0x80)) { - st = push_byte(reader, *dest, c); - } else { - st = read_utf8_continuation(reader, *dest, (uint8_t)c); - } - } + st = read_IRIREF_suffix(reader, *dest); + if (!tolerate_status(reader, st)) { + return st; } - return tolerate_status(reader, st) ? SERD_SUCCESS : st; + return (reader->flags & SERD_READ_RELATIVE) + ? SERD_SUCCESS + : resolve_IRIREF(reader, *dest, string_start_offset); } static SerdStatus @@ -970,44 +592,6 @@ read_literal(SerdReader* const reader, } static SerdStatus -read_VARNAME(SerdReader* const reader, SerdNode** const dest) -{ - // Simplified from SPARQL: VARNAME ::= (PN_CHARS_U | [0-9])+ - SerdNode* n = *dest; - SerdStatus st = SERD_SUCCESS; - int c = 0; - peek_byte(reader); - while ((c = peek_byte(reader))) { - if (is_digit(c) || c == '_') { - st = push_byte(reader, n, eat_byte_safe(reader, c)); - } else if ((st = read_PN_CHARS(reader, n))) { - st = st > SERD_FAILURE ? st : SERD_SUCCESS; - break; - } - } - - return st; -} - -static SerdStatus -read_Var(SerdReader* const reader, SerdNode** const dest) -{ - if (!(reader->flags & SERD_READ_VARIABLES)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "syntax does not support variables"); - } - - if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) { - return SERD_ERR_OVERFLOW; - } - - assert(peek_byte(reader) == '$' || peek_byte(reader) == '?'); - serd_byte_source_advance(reader->source); - - return read_VARNAME(reader, dest); -} - -static SerdStatus read_verb(SerdReader* reader, SerdNode** dest) { const size_t orig_stack_size = reader->stack.size; @@ -1055,83 +639,6 @@ read_verb(SerdReader* reader, SerdNode** dest) return SERD_SUCCESS; } -static bool -avoid_blank_clashes(const SerdReader* const reader) -{ - return fancy_syntax(reader) && !(reader->flags & SERD_READ_EXACT_BLANKS); -} - -static SerdStatus -adjust_blank_id(SerdReader* const reader, char* const buf) -{ - if (avoid_blank_clashes(reader) && is_digit(buf[reader->bprefix_len + 1])) { - const char tag = buf[reader->bprefix_len]; - if (tag == 'b') { - buf[reader->bprefix_len] = 'B'; // Prevent clash - reader->seen_genid = true; - } else if (tag == 'B' && reader->seen_genid) { - return r_err(reader, - SERD_ERR_ID_CLASH, - "found both `b' and `B' blank IDs, prefix required"); - } - } - - return SERD_SUCCESS; -} - -static SerdStatus -read_BLANK_NODE_LABEL(SerdReader* const reader, - SerdNode** const dest, - bool* const ate_dot) -{ - SerdStatus st = SERD_SUCCESS; - - eat_byte_safe(reader, '_'); - TRY(st, eat_byte_check(reader, ':')); - - if (!(*dest = push_node(reader, - SERD_BLANK, - reader->bprefix ? reader->bprefix : "", - reader->bprefix_len))) { - return SERD_ERR_OVERFLOW; - } - - // Read first: (PN_CHARS | '_' | [0-9]) - SerdNode* const n = *dest; - int c = peek_byte(reader); - if (is_digit(c) || c == '_') { - TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); - } else if ((st = read_PN_CHARS(reader, n))) { - return r_err(reader, st, "invalid name start"); - } - - // Read middle: (PN_CHARS | '.')* - while ((c = peek_byte(reader))) { - if (c == '.') { - TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); - } else if ((st = read_PN_CHARS(reader, n))) { - break; - } - } - - if (st > SERD_FAILURE) { - return st; - } - - // Deal with annoying edge case of having eaten the trailing dot - char* const buf = serd_node_buffer(n); - if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, n)) { - --n->length; - serd_stack_pop(&reader->stack, 1); - *ate_dot = true; - } - - // Adjust ID to avoid clashes with generated IDs if necessary - st = adjust_blank_id(reader, buf); - - return tolerate_status(reader, st) ? SERD_SUCCESS : st; -} - static SerdStatus read_anon(SerdReader* const reader, ReadContext ctx, @@ -1312,10 +819,6 @@ read_objectList(SerdReader* const reader, ReadContext ctx, bool* const ate_dot) { SerdStatus st = SERD_SUCCESS; TRY(st, read_object(reader, &ctx, true, ate_dot)); - if (!fancy_syntax(reader) && peek_delim(reader, ',')) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "syntax does not support abbreviation"); - } while (st <= SERD_FAILURE && !*ate_dot && eat_delim(reader, ',')) { st = read_object(reader, &ctx, true, ate_dot); @@ -1698,10 +1201,6 @@ read_n3_statement(SerdReader* const reader) case EOF: return SERD_FAILURE; case '@': - if (!fancy_syntax(reader)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "syntax does not support directives"); - } TRY(st, read_directive(reader)); read_ws_star(reader); break; @@ -1761,14 +1260,6 @@ read_n3_statement(SerdReader* const reader) return st; } -static void -skip_until(SerdReader* const reader, const uint8_t byte) -{ - for (int c = 0; (c = peek_byte(reader)) && c != EOF && c != byte;) { - eat_byte_safe(reader, c); - } -} - SerdStatus read_turtleTrigDoc(SerdReader* const reader) { @@ -1855,7 +1346,3 @@ read_nquadsDoc(SerdReader* const reader) } return st; } - -#if defined(__clang__) && __clang_major__ >= 10 -_Pragma("clang diagnostic pop") -#endif diff --git a/src/read_ntriples.c b/src/read_ntriples.c new file mode 100644 index 00000000..a0982cfa --- /dev/null +++ b/src/read_ntriples.c @@ -0,0 +1,790 @@ +/* + Copyright 2011-2021 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "read_ntriples.h" + +#include "byte_source.h" +#include "caret.h" +#include "node.h" +#include "read_utf8.h" +#include "reader.h" +#include "stack.h" +#include "statement.h" +#include "string_utils.h" +#include "try.h" +#include "uri_utils.h" + +#include "serd/serd.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +// Terminals + +/// [144s] LANGTAG +SerdStatus +read_LANGTAG(SerdReader* const reader) +{ + if (!is_alpha(peek_byte(reader))) { + return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected A-Z or a-z"); + } + + SerdNode* const node = push_node(reader, SERD_LITERAL, "", 0); + if (!node) { + return SERD_ERR_OVERFLOW; + } + + // First component must be all letters + SerdStatus st = SERD_SUCCESS; + TRY(st, push_byte(reader, node, eat_byte(reader))); + while (is_alpha(peek_byte(reader))) { + TRY(st, push_byte(reader, node, eat_byte(reader))); + } + + // Following components can have letters and digits + while (peek_byte(reader) == '-') { + TRY(st, push_byte(reader, node, eat_byte(reader))); + while (is_alpha(peek_byte(reader)) || is_digit(peek_byte(reader))) { + TRY(st, push_byte(reader, node, eat_byte(reader))); + } + } + + return SERD_SUCCESS; +} + +static bool +is_EOL(const int c) +{ + return c == '\n' || c == '\r'; +} + +/// [7] EOL +SerdStatus +read_EOL(SerdReader* const reader) +{ + if (!is_EOL(peek_byte(reader))) { + return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected a line ending"); + } + + while (is_EOL(peek_byte(reader))) { + eat_byte(reader); + } + + return SERD_SUCCESS; +} + +static SerdStatus +read_IRI_scheme(SerdReader* const reader, SerdNode* const dest) +{ + int c = peek_byte(reader); + if (!is_alpha(c)) { + return r_err(reader, + SERD_ERR_BAD_SYNTAX, + "'%c' is not a valid first IRI character", + c); + } + + SerdStatus st = SERD_SUCCESS; + while (!st && (c = peek_byte(reader)) != EOF) { + if (c == ':') { + return SERD_SUCCESS; // End of scheme + } + + st = is_uri_scheme_char(c) + ? push_byte(reader, dest, eat_byte_safe(reader, c)) + : r_err(reader, + SERD_ERR_BAD_SYNTAX, + "U+%04X is not a valid IRI scheme character", + (unsigned)c); + } + + return st ? st : SERD_ERR_BAD_SYNTAX; +} + +SerdStatus +read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) +{ + SerdStatus st = SERD_SUCCESS; + uint32_t code = 0u; + + while (st <= SERD_FAILURE) { + const int c = eat_byte(reader); + switch (c) { + case EOF: + return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected end of file"); + + case ' ': + case '"': + case '<': + case '^': + case '`': + case '{': + case '|': + case '}': + return r_err( + reader, SERD_ERR_BAD_SYNTAX, "'%c' is not a valid IRI character", c); + + case '>': + return SERD_SUCCESS; + + case '\\': + if ((st = read_UCHAR(reader, node, &code))) { + return st; + } + + if (!code || code == ' ' || code == '<' || code == '>') { + return r_err(reader, + SERD_ERR_BAD_SYNTAX, + "U+%04X is not a valid IRI character", + code); + } + + break; + + default: + if (c <= 0x20) { + st = r_err(reader, + SERD_ERR_BAD_SYNTAX, + "control character U+%04X is not a valid IRI character", + (uint32_t)c); + + if (reader->strict) { + return st; + } + } + + st = ((uint8_t)c & 0x80) + ? read_utf8_continuation(reader, node, (uint8_t)c) + : push_byte(reader, node, c); + } + } + + return tolerate_status(reader, st) ? SERD_SUCCESS : st; +} + +SerdStatus +read_IRI(SerdReader* const reader, SerdNode** const dest) +{ + SerdStatus st = SERD_SUCCESS; + if ((st = eat_byte_check(reader, '<'))) { + return st; + } + + if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + return SERD_ERR_OVERFLOW; + } + + if ((st = read_IRI_scheme(reader, *dest))) { + return r_err(reader, st, "expected IRI scheme"); + } + + return read_IRIREF_suffix(reader, *dest); +} + +SerdStatus +read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c) +{ + if (!(c & 0x80)) { + switch (c) { + case 0xA: + case 0xD: + dest->flags |= SERD_HAS_NEWLINE; + break; + case '"': + case '\'': + dest->flags |= SERD_HAS_QUOTE; + break; + default: + break; + } + + return push_byte(reader, dest, c); + } + + return read_utf8_continuation(reader, dest, c); +} + +/// [9] STRING_LITERAL_QUOTE +/// [23] STRING_LITERAL_SINGLE_QUOTE +SerdStatus +read_STRING_LITERAL(SerdReader* const reader, + SerdNode* const ref, + const uint8_t q) +{ + SerdStatus st = SERD_SUCCESS; + + while (tolerate_status(reader, st)) { + const int c = peek_byte(reader); + uint32_t code = 0; + switch (c) { + case EOF: + return r_err(reader, SERD_ERR_BAD_SYNTAX, "end of file in short string"); + case '\n': + case '\r': + return r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string"); + case '\\': + eat_byte_safe(reader, c); + if ((st = read_ECHAR(reader, ref)) && + (st = read_UCHAR(reader, ref, &code))) { + return r_err(reader, st, "invalid escape `\\%c'", peek_byte(reader)); + } + break; + default: + eat_byte_safe(reader, c); + if (c == q) { + return SERD_SUCCESS; + } + + st = read_character(reader, ref, (uint8_t)c); + } + } + + if (st && reader->strict) { + r_err(reader, st, "failed to read literal (%s)", serd_strerror(st)); + } + + return st; +} + +static SerdStatus +adjust_blank_id(SerdReader* const reader, char* const buf) +{ + if (!(reader->flags & SERD_READ_EXACT_BLANKS) && + is_digit(buf[reader->bprefix_len + 1])) { + const char tag = buf[reader->bprefix_len]; + if (tag == 'b') { + buf[reader->bprefix_len] = 'B'; // Prevent clash + reader->seen_genid = true; + } else if (tag == 'B' && reader->seen_genid) { + return r_err(reader, + SERD_ERR_ID_CLASH, + "found both `b' and `B' blank IDs, prefix required"); + } + } + + return SERD_SUCCESS; +} + +/// [141s] BLANK_NODE_LABEL +SerdStatus +read_BLANK_NODE_LABEL(SerdReader* const reader, + SerdNode** const dest, + bool* const ate_dot) +{ + SerdStatus st = SERD_SUCCESS; + + eat_byte_safe(reader, '_'); + if ((st = eat_byte_check(reader, ':'))) { + return st; + } + + if (!(*dest = push_node(reader, + SERD_BLANK, + reader->bprefix ? reader->bprefix : "", + reader->bprefix_len))) { + return SERD_ERR_OVERFLOW; + } + + // Read first: (PN_CHARS_U | [0-9]) + SerdNode* const n = *dest; + int c = peek_byte(reader); + if (is_digit(c)) { + TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); + } else { + TRY(st, read_PN_CHARS_U(reader, *dest)); + } + + // Read middle: (PN_CHARS | '.')* + while ((c = peek_byte(reader))) { + if (c == '.') { + TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); + } else if ((st = read_PN_CHARS(reader, n))) { + break; + } + } + + if (st > SERD_FAILURE) { + return st; + } + + // Deal with annoying edge case of having eaten the trailing dot + char* const buf = serd_node_buffer(n); + if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, n)) { + --n->length; + serd_stack_pop(&reader->stack, 1); + *ate_dot = true; + } + + // Adjust ID to avoid clashes with generated IDs if necessary + return tolerate_status(reader, st) ? adjust_blank_id(reader, buf) : st; +} + +static unsigned +utf8_from_codepoint(uint8_t* const out, const uint32_t code) +{ + const unsigned size = utf8_num_bytes_for_codepoint(code); + uint32_t c = code; + + assert(size <= 4u); + + if (size == 4u) { + out[3] = (uint8_t)(0x80u | (c & 0x3Fu)); + c >>= 6; + c |= 0x10000; + } + + if (size >= 3u) { + out[2] = (uint8_t)(0x80u | (c & 0x3Fu)); + c >>= 6; + c |= 0x800; + } + + if (size >= 2u) { + out[1] = (uint8_t)(0x80u | (c & 0x3Fu)); + c >>= 6; + c |= 0xC0; + } + + if (size >= 1u) { + out[0] = (uint8_t)c; + } + + return size; +} + +/// [10] UCHAR +SerdStatus +read_UCHAR(SerdReader* const reader, + SerdNode* const node, + uint32_t* const code_point) +{ + // Consume first character to determine which type of escape this is + const int b = peek_byte(reader); + unsigned length = 0u; + switch (b) { + case 'U': + length = 8; + break; + case 'u': + length = 4; + break; + default: + return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected 'U' or 'u'"); + } + eat_byte_safe(reader, b); + + // Read character code point in hex + uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; + for (unsigned i = 0; i < length; ++i) { + if (!(buf[i] = read_HEX(reader))) { + return SERD_ERR_BAD_SYNTAX; + } + } + + // Parse code point from buf, then reuse buf to write the UTF-8 + char* endptr = NULL; + const uint32_t code = (uint32_t)strtoul((const char*)buf, &endptr, 16); + const unsigned size = utf8_from_codepoint(buf, code); + + if (!size) { + *code_point = 0xFFFD; + return (reader->strict + ? r_err(reader, SERD_ERR_BAD_SYNTAX, "U+%X is out of range", code) + : push_bytes(reader, node, replacement_char, 3)); + } + + *code_point = code; + return push_bytes(reader, node, buf, size); +} + +/// [153s] ECHAR +SerdStatus +read_ECHAR(SerdReader* const reader, SerdNode* const dest) +{ + const int c = peek_byte(reader); + switch (c) { + case 't': + eat_byte_safe(reader, 't'); + return push_byte(reader, dest, '\t'); + case 'b': + eat_byte_safe(reader, 'b'); + return push_byte(reader, dest, '\b'); + case 'n': + dest->flags |= SERD_HAS_NEWLINE; + eat_byte_safe(reader, 'n'); + return push_byte(reader, dest, '\n'); + case 'r': + dest->flags |= SERD_HAS_NEWLINE; + eat_byte_safe(reader, 'r'); + return push_byte(reader, dest, '\r'); + case 'f': + eat_byte_safe(reader, 'f'); + return push_byte(reader, dest, '\f'); + case '\\': + case '"': + case '\'': + return push_byte(reader, dest, eat_byte_safe(reader, c)); + default: + return SERD_ERR_BAD_SYNTAX; + } +} + +/// [157s] PN_CHARS_BASE +SerdStatus +read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest) +{ + uint32_t code = 0; + const int c = peek_byte(reader); + SerdStatus st = SERD_SUCCESS; + + if (is_alpha(c)) { + return push_byte(reader, dest, eat_byte_safe(reader, c)); + } + + if (c == EOF || !(c & 0x80)) { + return SERD_FAILURE; + } + + if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { + return st; + } + + if (!is_PN_CHARS_BASE(code)) { + r_err(reader, + SERD_ERR_BAD_SYNTAX, + "U+%04X is not a valid name character", + code); + if (reader->strict) { + return SERD_ERR_BAD_SYNTAX; + } + } + + return st; +} + +/// [158s] PN_CHARS_U +SerdStatus +read_PN_CHARS_U(SerdReader* const reader, SerdNode* const dest) +{ + const int c = peek_byte(reader); + + switch (c) { + case ':': + case '_': + return push_byte(reader, dest, eat_byte_safe(reader, c)); + default: + break; + } + + return read_PN_CHARS_BASE(reader, dest); +} + +// [160s] PN_CHARS +SerdStatus +read_PN_CHARS(SerdReader* const reader, SerdNode* const dest) +{ + const int c = peek_byte(reader); + SerdStatus st = SERD_SUCCESS; + + if (c == EOF) { + return SERD_ERR_NO_DATA; + } + + if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { + return push_byte(reader, dest, eat_byte_safe(reader, c)); + } + + if (!(c & 0x80)) { + return SERD_FAILURE; + } + + uint32_t code = 0u; + if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { + return st; + } + + if (!is_PN_CHARS_BASE(code) && code != 0xB7 && + !(code >= 0x0300 && code <= 0x036F) && + !(code >= 0x203F && code <= 0x2040)) { + return r_err(reader, + SERD_ERR_BAD_SYNTAX, + "U+%04X is not a valid name character", + code); + } + + return st; +} + +/// [162s] HEX +uint8_t +read_HEX(SerdReader* const reader) +{ + const int c = peek_byte(reader); + if (is_xdigit(c)) { + return (uint8_t)eat_byte_safe(reader, c); + } + + r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid hexadecimal digit `%c'", c); + return 0; +} + +SerdStatus +read_VARNAME(SerdReader* const reader, SerdNode** const dest) +{ + // Simplified from SPARQL: VARNAME ::= (PN_CHARS_U | [0-9])+ + SerdNode* n = *dest; + SerdStatus st = SERD_SUCCESS; + int c = 0; + peek_byte(reader); + while ((c = peek_byte(reader))) { + if (is_digit(c) || c == '_') { + st = push_byte(reader, n, eat_byte_safe(reader, c)); + } else if ((st = read_PN_CHARS(reader, n))) { + st = st > SERD_FAILURE ? st : SERD_SUCCESS; + break; + } + } + + return st; +} + +SerdStatus +read_Var(SerdReader* const reader, SerdNode** const dest) +{ + assert(peek_byte(reader) == '$' || peek_byte(reader) == '?'); + + if (!(reader->flags & SERD_READ_VARIABLES)) { + return r_err( + reader, SERD_ERR_BAD_SYNTAX, "syntax does not support variables"); + } + + if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) { + return SERD_ERR_OVERFLOW; + } + + serd_byte_source_advance(reader->source); + + return read_VARNAME(reader, dest); +} + +// Nonterminals + +// comment ::= '#' ( [^#xA #xD] )* +SerdStatus +read_comment(SerdReader* const reader) +{ + eat_byte_safe(reader, '#'); + + for (int c = peek_byte(reader); c && c != '\n' && c != '\r' && c != EOF;) { + eat_byte_safe(reader, c); + c = peek_byte(reader); + } + + return SERD_SUCCESS; +} + +/// [6] literal +static SerdStatus +read_literal(SerdReader* const reader, SerdNode** const dest) +{ + SerdStatus st = SERD_SUCCESS; + + if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + return SERD_ERR_OVERFLOW; + } + + eat_byte_safe(reader, '"'); + if ((st = read_STRING_LITERAL(reader, *dest, '"'))) { + return st; + } + + SerdNode* datatype = NULL; + switch (peek_byte(reader)) { + case '@': + eat_byte_safe(reader, '@'); + (*dest)->flags |= SERD_HAS_LANGUAGE; + TRY(st, read_LANGTAG(reader)); + break; + case '^': + eat_byte_safe(reader, '^'); + if (!(st = eat_byte_check(reader, '^'))) { + (*dest)->flags |= SERD_HAS_DATATYPE; + TRY(st, read_IRI(reader, &datatype)); + } + break; + } + + return st; +} + +/// [3] subject +SerdStatus +read_nt_subject(SerdReader* const reader, SerdNode** const dest) +{ + bool ate_dot = false; + + switch (peek_byte(reader)) { + case '<': + return read_IRI(reader, dest); + + case '?': + return read_Var(reader, dest); + + case '_': + return read_BLANK_NODE_LABEL(reader, dest, &ate_dot); + + default: + break; + } + + return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected '<' or '_'"); +} + +/// [4] predicate +SerdStatus +read_nt_predicate(SerdReader* const reader, SerdNode** const dest) +{ + return (peek_byte(reader) == '?') ? read_Var(reader, dest) + : read_IRI(reader, dest); +} + +/// [4] object +SerdStatus +read_nt_object(SerdReader* const reader, + SerdNode** const dest, + bool* const ate_dot) +{ + *ate_dot = false; + + switch (peek_byte(reader)) { + case '"': + return read_literal(reader, dest); + + case '<': + return read_IRI(reader, dest); + + case '?': + return read_Var(reader, dest); + + case '_': + return read_BLANK_NODE_LABEL(reader, dest, ate_dot); + + default: + return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected '<', '_', or '\"'"); + } +} + +/// [2] triple +static SerdStatus +read_triple(SerdReader* const reader) +{ + SerdStatementFlags flags = 0; + ReadContext ctx = {0, 0, 0, 0, &flags}; + SerdStatus st = SERD_SUCCESS; + bool ate_dot = false; + + // Read subject and predicate + if ((st = read_nt_subject(reader, &ctx.subject)) || + (st = skip_horizontal_whitespace(reader)) || + (st = read_nt_predicate(reader, &ctx.predicate)) || + (st = skip_horizontal_whitespace(reader))) { + return st; + } + + // Preserve the caret for error reporting and read object + SerdCaret orig_caret = reader->source->caret; + if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) || + (st = skip_horizontal_whitespace(reader))) { + return st; + } + + if (!ate_dot && (st = eat_byte_check(reader, '.'))) { + return st; + } + + if (ctx.object) { + serd_node_zero_pad(ctx.object); + } + + const SerdStatement statement = { + {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret}; + + return serd_sink_write_statement(reader->sink, *ctx.flags, &statement); +} + +static SerdStatus +read_line(SerdReader* const reader) +{ + SerdStatus st = SERD_SUCCESS; + + skip_horizontal_whitespace(reader); + + switch (peek_byte(reader)) { + case EOF: + return SERD_FAILURE; + + case '\n': + case '\r': + return read_EOL(reader); + + case '#': + st = read_comment(reader); + break; + + default: + if (!(st = read_triple(reader))) { + skip_horizontal_whitespace(reader); + if (peek_byte(reader) == '#') { + st = read_comment(reader); + } + } + break; + } + + return (st || peek_byte(reader) == EOF) ? st : read_EOL(reader); +} + +/// [1] ntriplesDoc +SerdStatus +read_ntriplesDoc(SerdReader* const reader) +{ + // Record the initial stack size and read the first line + const size_t orig_stack_size = reader->stack.size; + SerdStatus st = read_line(reader); + + // Return early if we failed to read anything at all + serd_stack_pop_to(&reader->stack, orig_stack_size); + if (st == SERD_FAILURE || !tolerate_status(reader, st)) { + return st; + } + + // Continue reading lines for as long as possible + for (st = SERD_SUCCESS; !st;) { + st = read_line(reader); + serd_stack_pop_to(&reader->stack, orig_stack_size); + + if (st > SERD_FAILURE && !reader->strict && tolerate_status(reader, st)) { + skip_until(reader, '\n'); + st = SERD_SUCCESS; + } + } + + // If we made it this far, we succeeded at reading at least one line + return st > SERD_FAILURE ? st : SERD_SUCCESS; +} diff --git a/src/read_ntriples.h b/src/read_ntriples.h new file mode 100644 index 00000000..d3a74924 --- /dev/null +++ b/src/read_ntriples.h @@ -0,0 +1,223 @@ +/* + Copyright 2011-2021 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef SERD_READ_NTRIPLES_H +#define SERD_READ_NTRIPLES_H + +#include "serd/serd.h" + +#include <stdbool.h> +#include <stdint.h> + +// Utilities + +static inline bool +codepoint_in_range(const uint32_t c, const uint32_t min, const uint32_t max) +{ + return c >= min && c <= max; +} + +static inline bool +is_PN_CHARS_BASE(const uint32_t c) +{ + return (codepoint_in_range(c, 'A', 'Z') || codepoint_in_range(c, 'a', 'z') || + codepoint_in_range(c, 0x000C0u, 0x000D6u) || + codepoint_in_range(c, 0x000D8u, 0x000F6u) || + codepoint_in_range(c, 0x000F8u, 0x002FFu) || + codepoint_in_range(c, 0x00370u, 0x0037Du) || + codepoint_in_range(c, 0x0037Fu, 0x01FFFu) || + codepoint_in_range(c, 0x0200Cu, 0x0200Du) || + codepoint_in_range(c, 0x02070u, 0x0218Fu) || + codepoint_in_range(c, 0x02C00u, 0x02FEFu) || + codepoint_in_range(c, 0x03001u, 0x0D7FFu) || + codepoint_in_range(c, 0x0F900u, 0x0FDCFu) || + codepoint_in_range(c, 0x0FDF0u, 0x0FFFDu) || + codepoint_in_range(c, 0x10000u, 0xEFFFFu)); +} + +/** + Read one (possibly multi-byte) character (possibly multi-byte). + + The caller must have already eaten the first byte, `c`. +*/ +SerdStatus +read_character(SerdReader* reader, SerdNode* dest, uint8_t c); + +// Terminals + +/** + Read a language tag starting after the '@'. + + RDF 1.1 NTriples: [144s] LANGTAG +*/ +SerdStatus +read_LANGTAG(SerdReader* reader); + +/** + Read an end of line. + + RDF 1.1 NTriples: [7] EOL +*/ +SerdStatus +read_EOL(SerdReader* reader); + +/** + Read an absolute IRI. + + This is a stricter subset of [8] IRIREF in the NTriples grammar, since a + scheme is required. Handling this in the parser results in better error + messages. +*/ +SerdStatus +read_IRI(SerdReader* reader, SerdNode** dest); + +/** + Read an IRI reference suffix into an existing node. + + RDF 1.1 NTriples: [8] IRIREF +*/ +SerdStatus +read_IRIREF_suffix(SerdReader* reader, SerdNode* node); + +/** + Read a string that is single-quoted with the given character. + + RDF 1.1 NTriples: [9] STRING_LITERAL_QUOTE + RDF 1.1 Turtle: [23] STRING_LITERAL_SINGLE_QUOTE +*/ +SerdStatus +read_STRING_LITERAL(SerdReader* reader, SerdNode* ref, uint8_t q); + +/** + Read a blank node label that comes after "_:". + + RDF 1.1 NTriples: [141s] BLANK_NODE_LABEL +*/ +SerdStatus +read_BLANK_NODE_LABEL(SerdReader* reader, SerdNode** dest, bool* ate_dot); + +/** + Read an escape like "u201C", starting after the initial backslash. + + RDF 1.1 NTriples: [10] UCHAR +*/ +SerdStatus +read_UCHAR(SerdReader* reader, SerdNode* node, uint32_t* code_point); + +/** + Read an escape like "n", starting after the initial backslash. + + RDF 1.1 NTriples: [153s] ECHAR +*/ +SerdStatus +read_ECHAR(SerdReader* reader, SerdNode* dest); + +/** + Read a basic prefixed name character. + + RDF 1.1 NTriples: [157s] PN_CHARS_BASE +*/ +SerdStatus +read_PN_CHARS_BASE(SerdReader* reader, SerdNode* dest); + +/** + Read an initial prefixed name character. + + RDF 1.1 NTriples: [158s] PN_CHARS_U +*/ +SerdStatus +read_PN_CHARS_U(SerdReader* reader, SerdNode* dest); + +/** + Read any prefixed name character. + + RDF 1.1 NTriples: [160s] PN_CHARS +*/ +SerdStatus +read_PN_CHARS(SerdReader* reader, SerdNode* dest); + +/** + Read a single hexadecimal digit. + + RDF 1.1 NTriples: [162s] HEX +*/ +uint8_t +read_HEX(SerdReader* reader); + +/** + Read a variable name, starting after the '?' or '$'. + + This is an extension that serd uses in certain contexts to support patterns. + + Restricted version of SPARQL 1.1: [166] VARNAME +*/ +SerdStatus +read_VARNAME(SerdReader* reader, SerdNode** dest); + +// Nonterminals + +/** + Read a comment that starts with '#' and ends with the line. + + Not described by a rule in the grammar since RDF 1.1. +*/ +SerdStatus +read_comment(SerdReader* reader); + +/** + Read a subject (IRI or blank). + + RDF 1.1 NTriples: [3] subject +*/ +SerdStatus +read_nt_subject(SerdReader* reader, SerdNode** dest); + +/** + Read a predicate (IRI). + + RDF 1.1 NTriples: [4] predicate +*/ +SerdStatus +read_nt_predicate(SerdReader* reader, SerdNode** dest); + +/** + Read an object (IRI or blank or literal). + + RDF 1.1 NTriples: [5] object +*/ +SerdStatus +read_nt_object(SerdReader* reader, SerdNode** dest, bool* ate_dot); + +/** + Read a variable that starts with '?' or '$'. + + This is an extension that serd uses in certain contexts to support + patterns. + + Restricted version of SPARQL 1.1: [108] Var +*/ +SerdStatus +read_Var(SerdReader* reader, SerdNode** dest); + +/** + Read a complete NTriples document. + + RDF 1.1 NTriples: [1] ntriplesDoc +*/ +SerdStatus +read_ntriplesDoc(SerdReader* reader); + +#endif // SERD_READ_NTRIPLES_H diff --git a/src/reader.c b/src/reader.c index f2d929ac..5cd4f955 100644 --- a/src/reader.c +++ b/src/reader.c @@ -19,6 +19,7 @@ #include "byte_source.h" #include "namespaces.h" #include "node.h" +#include "read_ntriples.h" #include "stack.h" #include "statement.h" #include "system.h" @@ -44,6 +45,24 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) return st; } +SerdStatus +skip_horizontal_whitespace(SerdReader* const reader) +{ + while (peek_byte(reader) == '\t' || peek_byte(reader) == ' ') { + eat_byte(reader); + } + + return SERD_SUCCESS; +} + +void +skip_until(SerdReader* const reader, const uint8_t byte) +{ + for (int c = 0; (c = peek_byte(reader)) && c != EOF && c != byte;) { + eat_byte_safe(reader, c); + } +} + void set_blank_id(SerdReader* const reader, SerdNode* const node, @@ -169,15 +188,27 @@ serd_reader_read_document(SerdReader* const reader) return SERD_ERR_BAD_CALL; } - if (!reader->source->prepared) { + if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source->prepared) { SerdStatus st = serd_reader_prepare(reader); if (st) { return st; } } - return ((reader->syntax == SERD_NQUADS) ? read_nquadsDoc(reader) - : read_turtleTrigDoc(reader)); + switch (reader->syntax) { + case SERD_SYNTAX_EMPTY: + break; + case SERD_TURTLE: + return read_turtleTrigDoc(reader); + case SERD_NTRIPLES: + return read_ntriplesDoc(reader); + case SERD_NQUADS: + return read_nquadsDoc(reader); + case SERD_TRIG: + return read_turtleTrigDoc(reader); + } + + return SERD_SUCCESS; } SerdReader* diff --git a/src/reader.h b/src/reader.h index acef8ce2..d9347dee 100644 --- a/src/reader.h +++ b/src/reader.h @@ -57,6 +57,12 @@ struct SerdReaderImpl { bool seen_genid; }; +SerdStatus +skip_horizontal_whitespace(SerdReader* reader); + +void +skip_until(SerdReader* reader, uint8_t byte); + SERD_LOG_FUNC(3, 4) SerdStatus r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); @@ -109,6 +115,18 @@ peek_byte(SerdReader* reader) } static inline int +eat_byte(SerdReader* reader) +{ + const int c = peek_byte(reader); + + if (c != EOF) { + serd_byte_source_advance(reader->source); + } + + return c; +} + +static inline int eat_byte_safe(SerdReader* reader, const int byte) { (void)byte; diff --git a/src/string_utils.h b/src/string_utils.h index 5eeabc6b..a411b90d 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -152,20 +152,44 @@ utf8_num_bytes(const uint8_t leading) return lengths[leading >> 3u]; } +static inline unsigned +utf8_num_bytes_for_codepoint(const uint32_t code) +{ + if (code < 0x00000080) { + return 1u; + } + + if (code < 0x00000800) { + return 2u; + } + + if (code < 0x00010000) { + return 3u; + } + + if (code < 0x00110000) { + return 4u; + } + + return 0u; // Out of range +} + /// Return the code point of a UTF-8 character with known length static inline uint32_t -parse_counted_utf8_char(const uint8_t* utf8, size_t size) +parse_counted_utf8_char(const uint8_t* const utf8, const size_t size) { uint32_t c = utf8[0] & ((1u << (8u - size)) - 1u); + for (size_t i = 1; i < size; ++i) { c = (c << 6) | (utf8[i] & 0x3Fu); } + return c; } /// Parse a UTF-8 character, set *size to the length, and return the code point static inline uint32_t -parse_utf8_char(const uint8_t* utf8, size_t* size) +parse_utf8_char(const uint8_t* const utf8, size_t* const size) { switch (*size = utf8_num_bytes(utf8[0])) { case 1: diff --git a/test/bad/bad-blank-node-label.nt b/test/bad/bad-blank-node-label.nt new file mode 100644 index 00000000..8f37fe13 --- /dev/null +++ b/test/bad/bad-blank-node-label.nt @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> _nocolon .
\ No newline at end of file diff --git a/test/bad/bad-trailing-garbage.nt b/test/bad/bad-trailing-garbage.nt new file mode 100644 index 00000000..b4e2f7cd --- /dev/null +++ b/test/bad/bad-trailing-garbage.nt @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> <http://example.org/o> . <http://example.org/error>
\ No newline at end of file diff --git a/test/bad/manifest.ttl b/test/bad/manifest.ttl index d024dccc..28841e82 100644 --- a/test/bad/manifest.ttl +++ b/test/bad/manifest.ttl @@ -24,6 +24,7 @@ <#bad-14> <#bad-base> <#bad-blank> + <#bad-blank-node-label> <#bad-bom> <#bad-char-in-local> <#bad-char-in-prefix> @@ -70,6 +71,7 @@ <#bad-semicolon-after-subject> <#bad-string> <#bad-subject> + <#bad-trailing-garbage> <#bad-uri-escape> <#bad-var> <#bad-verb> @@ -166,6 +168,11 @@ mf:name "bad-blank" ; mf:action <bad-blank.ttl> . +<#bad-blank-node-label> + rdf:type rdft:TestNTriplesNegativeSyntax ; + mf:name "bad-blank-node-label" ; + mf:action <bad-blank-node-label.nt> . + <#bad-bom> rdf:type rdft:TestTurtleNegativeSyntax ; mf:name "bad-bom" ; @@ -396,6 +403,11 @@ mf:name "bad-subject" ; mf:action <bad-subject.ttl> . +<#bad-trailing-garbage> + rdf:type rdft:TestNTriplesNegativeSyntax ; + mf:name "bad-trailing-garbage" ; + mf:action <bad-trailing-garbage.nt> . + <#bad-uri-escape> rdf:type rdft:TestTurtleNegativeSyntax ; mf:name "bad-uri-escape" ; diff --git a/test/good/manifest.ttl b/test/good/manifest.ttl index 2bf36caf..fe3e195e 100644 --- a/test/good/manifest.ttl +++ b/test/good/manifest.ttl @@ -43,7 +43,6 @@ <#test-no-spaces> <#test-non-curie-uri> <#test-num> - <#test-out-of-range-unicode> <#test-prefix> <#test-pretty> <#test-rel> @@ -257,12 +256,6 @@ mf:action <test-num.ttl> ; mf:result <test-num.nt> . -<#test-out-of-range-unicode> - rdf:type rdft:TestTurtleEval ; - mf:name "test-out-of-range-unicode" ; - mf:action <test-out-of-range-unicode.ttl> ; - mf:result <test-out-of-range-unicode.nt> . - <#test-prefix> rdf:type rdft:TestTurtleEval ; mf:name "test-prefix" ; diff --git a/test/lax/manifest.ttl b/test/lax/manifest.ttl index b63da6d4..0d370f42 100644 --- a/test/lax/manifest.ttl +++ b/test/lax/manifest.ttl @@ -16,6 +16,7 @@ <#test-bad-utf8-nt> <#test-bad-utf8-nq> <#test-lone-list> + <#test-out-of-range-unicode> ) . <#test-bad-string-ttl> @@ -71,3 +72,9 @@ mf:name "test-lone-list" ; mf:action <test-lone-list.ttl> ; mf:result <test-lone-list.nt> . + +<#test-out-of-range-unicode> + rdf:type rdft:TestTurtleNegativeSyntax ; + mf:name "test-out-of-range-unicode" ; + mf:action <test-out-of-range-unicode.ttl> ; + mf:result <test-out-of-range-unicode.nt> . diff --git a/test/good/test-out-of-range-unicode.nt b/test/lax/test-out-of-range-unicode.nt index 5def9e31..5def9e31 100644 --- a/test/good/test-out-of-range-unicode.nt +++ b/test/lax/test-out-of-range-unicode.nt diff --git a/test/good/test-out-of-range-unicode.ttl b/test/lax/test-out-of-range-unicode.ttl index 7e64785a..7e64785a 100644 --- a/test/good/test-out-of-range-unicode.ttl +++ b/test/lax/test-out-of-range-unicode.ttl diff --git a/test/test_overflow.c b/test/test_overflow.c index d94d7e90..2a77135c 100644 --- a/test/test_overflow.c +++ b/test/test_overflow.c @@ -74,6 +74,9 @@ test_ntriples_overflow(void) { static const char* const test_strings[] = { "<http://example.org/s> <http://example.org/p> <http://example.org/o> .", + "<http://example.org/s> <http://example.org/p> \"literal\" .", + "<http://example.org/s> <http://example.org/p> _:blank .", + "<http://example.org/s> <http://example.org/p> \"\"@en .", NULL, }; diff --git a/test/test_reader.c b/test/test_reader.c index 5cc8d634..3651e014 100644 --- a/test/test_reader.c +++ b/test/test_reader.c @@ -19,6 +19,7 @@ #include "serd/serd.h" #include <assert.h> +#include <stdbool.h> #include <stdio.h> #include <string.h> @@ -314,6 +315,59 @@ test_read_chunks(void) serd_world_free(world); } +static size_t +empty_test_read(void* buf, size_t size, size_t nmemb, void* stream) +{ + (void)buf; + (void)size; + (void)nmemb; + (void)stream; + + assert(false); + + return 0; +} + +static int +empty_test_error(void* stream) +{ + (void)stream; + return 0; +} + +/// Test that reading SERD_SYNTAX_EMPTY "succeeds" without reading any input +static void +test_read_empty(void) +{ + SerdWorld* const world = serd_world_new(); + size_t n_statements = 0; + FILE* const f = tmpfile(); + + SerdSink* const sink = serd_sink_new(&n_statements, count_statements, NULL); + assert(sink); + + SerdEnv* const env = serd_env_new(SERD_EMPTY_STRING()); + SerdReader* const reader = + serd_reader_new(world, SERD_SYNTAX_EMPTY, 0, env, sink, 4096); + + assert(reader); + + SerdByteSource* byte_source = serd_byte_source_new_function( + empty_test_read, empty_test_error, NULL, f, NULL, 1); + + SerdStatus st = serd_reader_start(reader, byte_source); + assert(!st); + + assert(serd_reader_read_document(reader) == SERD_SUCCESS); + assert(n_statements == 0); + + serd_byte_source_free(byte_source); + serd_reader_free(reader); + serd_env_free(env); + serd_sink_free(sink); + serd_world_free(world); +} + int main(void) { @@ -322,5 +376,6 @@ main(void) test_read_eof_by_page(); test_read_eof_by_byte(); test_read_chunks(); + test_read_empty(); return 0; } |