diff options
Diffstat (limited to 'src/n3.c')
-rw-r--r-- | src/n3.c | 563 |
1 files changed, 25 insertions, 538 deletions
@@ -18,12 +18,11 @@ #include "env.h" #include "namespaces.h" #include "node.h" -#include "read_utf8.h" +#include "read_ntriples.h" #include "reader.h" #include "stack.h" #include "string_utils.h" #include "try.h" -#include "uri_utils.h" #include "serd/serd.h" @@ -31,19 +30,8 @@ #include <stdbool.h> #include <stdint.h> #include <stdio.h> -#include <stdlib.h> #include <string.h> -#if defined(__clang__) && __clang_major__ >= 10 -# define SERD_FALLTHROUGH __attribute__((fallthrough)) -_Pragma("clang diagnostic push") -_Pragma("clang diagnostic ignored \"-Wmissing-declarations\"") -#elif defined(__GNUC__) && __GNUC__ >= 7 -# define SERD_FALLTHROUGH __attribute__((fallthrough)) -#else -# define SERD_FALLTHROUGH -#endif - static bool fancy_syntax(const SerdReader* const reader) { @@ -56,189 +44,31 @@ read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest); static SerdStatus read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); -static uint8_t -read_HEX(SerdReader* const reader) -{ - const int c = peek_byte(reader); - if (is_xdigit(c)) { - return (uint8_t)eat_byte_safe(reader, c); - } - - r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid hexadecimal digit `%c'", c); - return 0; -} - -// Read UCHAR escape, initial \ is already eaten by caller +// whitespace ::= #x9 | #xA | #xD | #x20 | comment static SerdStatus -read_UCHAR(SerdReader* const reader, - SerdNode* const dest, - uint32_t* const char_code) +read_whitespace(SerdReader* const reader) { - const int b = peek_byte(reader); - unsigned length = 0; - switch (b) { - case 'U': - length = 8; - break; - case 'u': - length = 4; - break; - default: - return SERD_ERR_BAD_SYNTAX; - } - eat_byte_safe(reader, b); - - uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - for (unsigned i = 0; i < length; ++i) { - if (!(buf[i] = read_HEX(reader))) { - return SERD_ERR_BAD_SYNTAX; - } - } - - char* endptr = NULL; - const uint32_t code = (uint32_t)strtoul((const char*)buf, &endptr, 16); - assert(endptr == (char*)buf + length); - - unsigned size = 0; - if (code < 0x00000080) { - size = 1; - } else if (code < 0x00000800) { - size = 2; - } else if (code < 0x00010000) { - size = 3; - } else if (code < 0x00110000) { - size = 4; - } else { - r_err( - reader, SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range", code); - - *char_code = 0xFFFD; - const SerdStatus st = push_bytes(reader, dest, replacement_char, 3); - return st ? st : SERD_SUCCESS; - } - - // Build output in buf - // (Note # of bytes = # of leading 1 bits in first byte) - uint32_t c = code; - switch (size) { - case 4: - buf[3] = (uint8_t)(0x80u | (c & 0x3Fu)); - c >>= 6; - c |= (16 << 12); // set bit 4 - SERD_FALLTHROUGH; - case 3: - buf[2] = (uint8_t)(0x80u | (c & 0x3Fu)); - c >>= 6; - c |= (32 << 6); // set bit 5 - SERD_FALLTHROUGH; - case 2: - buf[1] = (uint8_t)(0x80u | (c & 0x3Fu)); - c >>= 6; - c |= 0xC0; // set bits 6 and 7 - SERD_FALLTHROUGH; - case 1: - buf[0] = (uint8_t)c; - SERD_FALLTHROUGH; - default: - break; - } - - *char_code = code; - return push_bytes(reader, dest, buf, size); -} - -// Read ECHAR escape, initial \ is already eaten by caller -static SerdStatus -read_ECHAR(SerdReader* const reader, SerdNode* const dest) -{ - const int c = peek_byte(reader); - switch (c) { - case 't': - eat_byte_safe(reader, 't'); - return push_byte(reader, dest, '\t'); - case 'b': - eat_byte_safe(reader, 'b'); - return push_byte(reader, dest, '\b'); - case 'n': - dest->flags |= SERD_HAS_NEWLINE; - eat_byte_safe(reader, 'n'); - return push_byte(reader, dest, '\n'); - case 'r': - dest->flags |= SERD_HAS_NEWLINE; - eat_byte_safe(reader, 'r'); - return push_byte(reader, dest, '\r'); - case 'f': - eat_byte_safe(reader, 'f'); - return push_byte(reader, dest, '\f'); - case '\\': - case '"': - case '\'': - return push_byte(reader, dest, eat_byte_safe(reader, c)); - default: - return SERD_ERR_BAD_SYNTAX; - } -} - -// Read one character (possibly multi-byte) -// The first byte, c, has already been eaten by caller -static SerdStatus -read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c) -{ - if (!(c & 0x80)) { - switch (c) { - case 0xA: - case 0xD: - dest->flags |= SERD_HAS_NEWLINE; - break; - case '"': - case '\'': - dest->flags |= SERD_HAS_QUOTE; - break; - default: - break; - } - - return push_byte(reader, dest, c); - } - - return read_utf8_continuation(reader, dest, c); -} - -// [10] comment ::= '#' ( [^#xA #xD] )* -static void -read_comment(SerdReader* const reader) -{ - eat_byte_safe(reader, '#'); - int c = 0; - while (((c = peek_byte(reader)) != 0xA) && c != 0xD && c != EOF && c) { - eat_byte_safe(reader, c); - } -} - -// [24] ws ::= #x9 | #xA | #xD | #x20 | comment -static bool -read_ws(SerdReader* const reader) -{ - const int c = peek_byte(reader); - switch (c) { - case 0x9: - case 0xA: - case 0xD: - case 0x20: - eat_byte_safe(reader, c); - return true; + switch (peek_byte(reader)) { + case '\t': + case '\n': + case '\r': + case ' ': + eat_byte_safe(reader, peek_byte(reader)); + return SERD_SUCCESS; case '#': read_comment(reader); - return true; + return SERD_SUCCESS; default: - return false; + break; } + + return SERD_FAILURE; } static bool read_ws_star(SerdReader* const reader) { - while (read_ws(reader)) { + while (!read_whitespace(reader)) { } return true; @@ -301,44 +131,6 @@ read_STRING_LITERAL_LONG(SerdReader* const reader, return tolerate_status(reader, st) ? SERD_SUCCESS : st; } -// STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE -// Initial quote is already eaten by caller -static SerdStatus -read_STRING_LITERAL(SerdReader* const reader, - SerdNode* const ref, - const uint8_t q) -{ - SerdStatus st = SERD_SUCCESS; - - while (tolerate_status(reader, st)) { - const int c = peek_byte(reader); - uint32_t code = 0; - switch (c) { - case EOF: - return r_err(reader, SERD_ERR_BAD_SYNTAX, "end of file in short string"); - case '\n': - case '\r': - return r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string"); - case '\\': - eat_byte_safe(reader, c); - if ((st = read_ECHAR(reader, ref)) && - (st = read_UCHAR(reader, ref, &code))) { - return r_err(reader, st, "invalid escape `\\%c'", peek_byte(reader)); - } - break; - default: - if (c == q) { - eat_byte_safe(reader, q); - return SERD_SUCCESS; - } else { - st = read_character(reader, ref, (uint8_t)eat_byte_safe(reader, c)); - } - } - } - - return tolerate_status(reader, st) ? SERD_SUCCESS : st; -} - static SerdStatus read_String(SerdReader* const reader, SerdNode* const node) { @@ -373,65 +165,6 @@ read_String(SerdReader* const reader, SerdNode* const node) return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1); } -static bool -is_PN_CHARS_BASE(const uint32_t c) -{ - return ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00F6) || - (c >= 0x00F8 && c <= 0x02FF) || (c >= 0x0370 && c <= 0x037D) || - (c >= 0x037F && c <= 0x1FFF) || (c >= 0x200C && c <= 0x200D) || - (c >= 0x2070 && c <= 0x218F) || (c >= 0x2C00 && c <= 0x2FEF) || - (c >= 0x3001 && c <= 0xD7FF) || (c >= 0xF900 && c <= 0xFDCF) || - (c >= 0xFDF0 && c <= 0xFFFD) || (c >= 0x10000 && c <= 0xEFFFF)); -} - -static SerdStatus -read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest) -{ - uint32_t code = 0; - const int c = peek_byte(reader); - SerdStatus st = SERD_SUCCESS; - if (is_alpha(c)) { - st = push_byte(reader, dest, eat_byte_safe(reader, c)); - } else if (c == EOF || !(c & 0x80)) { - return SERD_FAILURE; - } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { - return st; - } else if (!is_PN_CHARS_BASE(code)) { - r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name", code); - if (reader->strict) { - return SERD_ERR_BAD_SYNTAX; - } - } - return st; -} - -static bool -is_PN_CHARS(const uint32_t c) -{ - return (is_PN_CHARS_BASE(c) || c == 0xB7 || (c >= 0x0300 && c <= 0x036F) || - (c >= 0x203F && c <= 0x2040)); -} - -static SerdStatus -read_PN_CHARS(SerdReader* const reader, SerdNode* const dest) -{ - uint32_t code = 0; - const int c = peek_byte(reader); - SerdStatus st = SERD_SUCCESS; - if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { - st = push_byte(reader, dest, eat_byte_safe(reader, c)); - } else if (c == EOF || !(c & 0x80)) { - return SERD_FAILURE; - } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { - return st; - } else if (!is_PN_CHARS(code)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name", code); - } - return st; -} - static SerdStatus read_PERCENT(SerdReader* const reader, SerdNode* const dest) { @@ -592,67 +325,6 @@ read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest) return st; } -static SerdStatus -read_LANGTAG(SerdReader* const reader) -{ - int c = peek_byte(reader); - if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'", c); - } - - SerdNode* node = push_node(reader, SERD_LITERAL, "", 0); - if (!node) { - return SERD_ERR_OVERFLOW; - } - - SerdStatus st = SERD_SUCCESS; - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); - while ((c = peek_byte(reader)) && is_alpha(c)) { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); - } - while (peek_byte(reader) == '-') { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-'))); - while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); - } - } - return SERD_SUCCESS; -} - -static SerdStatus -read_IRIREF_scheme(SerdReader* const reader, SerdNode* const dest) -{ - int c = peek_byte(reader); - if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad IRI scheme start `%c'", c); - } - - SerdStatus st = SERD_SUCCESS; - while ((c = peek_byte(reader)) != EOF) { - if (c == '>') { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "missing IRI scheme"); - } - - if (!is_uri_scheme_char(c)) { - return r_err(reader, - SERD_ERR_BAD_SYNTAX, - "bad IRI scheme char U+%04X (%c)", - (unsigned)c, - (char)c); - } - - if ((st = push_byte(reader, dest, eat_byte_safe(reader, c)))) { - return st; - } - - if (c == ':') { - return SERD_SUCCESS; // End of scheme - } - } - - return SERD_FAILURE; -} - typedef struct { SerdReader* reader; SerdNode* node; @@ -714,6 +386,10 @@ resolve_IRIREF(SerdReader* const reader, static SerdStatus read_IRIREF(SerdReader* const reader, SerdNode** const dest) { + if (!fancy_syntax(reader)) { + return read_IRI(reader, dest); + } + SerdStatus st = SERD_SUCCESS; if ((st = eat_byte_check(reader, '<'))) { return st; @@ -725,68 +401,14 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) const size_t string_start_offset = reader->stack.size; - if (!fancy_syntax(reader) && (st = read_IRIREF_scheme(reader, *dest))) { - return r_err(reader, st, "expected IRI scheme"); - } - - uint32_t code = 0; - while (st <= SERD_FAILURE) { - const int c = eat_byte_safe(reader, peek_byte(reader)); - switch (c) { - case '"': - case '<': - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c); - case '>': - return (reader->flags & SERD_READ_RELATIVE) - ? SERD_SUCCESS - : resolve_IRIREF(reader, *dest, string_start_offset); - case '\\': - if (read_UCHAR(reader, *dest, &code)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape"); - } - switch (code) { - case 0: - case ' ': - case '<': - case '>': - return r_err(reader, - SERD_ERR_BAD_SYNTAX, - "invalid escaped IRI character U+%04X", - code); - default: - break; - } - break; - case '^': - case '`': - case '{': - case '|': - case '}': - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c); - default: - if (c <= 0x20) { - st = r_err(reader, - SERD_ERR_BAD_SYNTAX, - "invalid IRI character (escape %%%02X)", - (unsigned)c); - if (reader->strict) { - break; - } - - if (!(st = push_byte(reader, *dest, c))) { - st = SERD_FAILURE; - } - } else if (!(c & 0x80)) { - st = push_byte(reader, *dest, c); - } else { - st = read_utf8_continuation(reader, *dest, (uint8_t)c); - } - } + st = read_IRIREF_suffix(reader, *dest); + if (!tolerate_status(reader, st)) { + return st; } - return tolerate_status(reader, st) ? SERD_SUCCESS : st; + return (reader->flags & SERD_READ_RELATIVE) + ? SERD_SUCCESS + : resolve_IRIREF(reader, *dest, string_start_offset); } static SerdStatus @@ -970,44 +592,6 @@ read_literal(SerdReader* const reader, } static SerdStatus -read_VARNAME(SerdReader* const reader, SerdNode** const dest) -{ - // Simplified from SPARQL: VARNAME ::= (PN_CHARS_U | [0-9])+ - SerdNode* n = *dest; - SerdStatus st = SERD_SUCCESS; - int c = 0; - peek_byte(reader); - while ((c = peek_byte(reader))) { - if (is_digit(c) || c == '_') { - st = push_byte(reader, n, eat_byte_safe(reader, c)); - } else if ((st = read_PN_CHARS(reader, n))) { - st = st > SERD_FAILURE ? st : SERD_SUCCESS; - break; - } - } - - return st; -} - -static SerdStatus -read_Var(SerdReader* const reader, SerdNode** const dest) -{ - if (!(reader->flags & SERD_READ_VARIABLES)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "syntax does not support variables"); - } - - if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) { - return SERD_ERR_OVERFLOW; - } - - assert(peek_byte(reader) == '$' || peek_byte(reader) == '?'); - serd_byte_source_advance(reader->source); - - return read_VARNAME(reader, dest); -} - -static SerdStatus read_verb(SerdReader* reader, SerdNode** dest) { const size_t orig_stack_size = reader->stack.size; @@ -1055,83 +639,6 @@ read_verb(SerdReader* reader, SerdNode** dest) return SERD_SUCCESS; } -static bool -avoid_blank_clashes(const SerdReader* const reader) -{ - return fancy_syntax(reader) && !(reader->flags & SERD_READ_EXACT_BLANKS); -} - -static SerdStatus -adjust_blank_id(SerdReader* const reader, char* const buf) -{ - if (avoid_blank_clashes(reader) && is_digit(buf[reader->bprefix_len + 1])) { - const char tag = buf[reader->bprefix_len]; - if (tag == 'b') { - buf[reader->bprefix_len] = 'B'; // Prevent clash - reader->seen_genid = true; - } else if (tag == 'B' && reader->seen_genid) { - return r_err(reader, - SERD_ERR_ID_CLASH, - "found both `b' and `B' blank IDs, prefix required"); - } - } - - return SERD_SUCCESS; -} - -static SerdStatus -read_BLANK_NODE_LABEL(SerdReader* const reader, - SerdNode** const dest, - bool* const ate_dot) -{ - SerdStatus st = SERD_SUCCESS; - - eat_byte_safe(reader, '_'); - TRY(st, eat_byte_check(reader, ':')); - - if (!(*dest = push_node(reader, - SERD_BLANK, - reader->bprefix ? reader->bprefix : "", - reader->bprefix_len))) { - return SERD_ERR_OVERFLOW; - } - - // Read first: (PN_CHARS | '_' | [0-9]) - SerdNode* const n = *dest; - int c = peek_byte(reader); - if (is_digit(c) || c == '_') { - TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); - } else if ((st = read_PN_CHARS(reader, n))) { - return r_err(reader, st, "invalid name start"); - } - - // Read middle: (PN_CHARS | '.')* - while ((c = peek_byte(reader))) { - if (c == '.') { - TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); - } else if ((st = read_PN_CHARS(reader, n))) { - break; - } - } - - if (st > SERD_FAILURE) { - return st; - } - - // Deal with annoying edge case of having eaten the trailing dot - char* const buf = serd_node_buffer(n); - if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, n)) { - --n->length; - serd_stack_pop(&reader->stack, 1); - *ate_dot = true; - } - - // Adjust ID to avoid clashes with generated IDs if necessary - st = adjust_blank_id(reader, buf); - - return tolerate_status(reader, st) ? SERD_SUCCESS : st; -} - static SerdStatus read_anon(SerdReader* const reader, ReadContext ctx, @@ -1312,10 +819,6 @@ read_objectList(SerdReader* const reader, ReadContext ctx, bool* const ate_dot) { SerdStatus st = SERD_SUCCESS; TRY(st, read_object(reader, &ctx, true, ate_dot)); - if (!fancy_syntax(reader) && peek_delim(reader, ',')) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "syntax does not support abbreviation"); - } while (st <= SERD_FAILURE && !*ate_dot && eat_delim(reader, ',')) { st = read_object(reader, &ctx, true, ate_dot); @@ -1698,10 +1201,6 @@ read_n3_statement(SerdReader* const reader) case EOF: return SERD_FAILURE; case '@': - if (!fancy_syntax(reader)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "syntax does not support directives"); - } TRY(st, read_directive(reader)); read_ws_star(reader); break; @@ -1761,14 +1260,6 @@ read_n3_statement(SerdReader* const reader) return st; } -static void -skip_until(SerdReader* const reader, const uint8_t byte) -{ - for (int c = 0; (c = peek_byte(reader)) && c != EOF && c != byte;) { - eat_byte_safe(reader, c); - } -} - SerdStatus read_turtleTrigDoc(SerdReader* const reader) { @@ -1855,7 +1346,3 @@ read_nquadsDoc(SerdReader* const reader) } return st; } - -#if defined(__clang__) && __clang_major__ >= 10 -_Pragma("clang diagnostic pop") -#endif |