diff options
author | David Robillard <d@drobilla.net> | 2018-05-12 20:39:23 +0200 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2019-04-13 19:15:32 +0200 |
commit | fea20a9af56d5b7640ced14cde92fe6746291502 (patch) | |
tree | 848c9e3a4e3e33d0b65ef39142d0ff8507af3391 | |
parent | 29cfc326f8f64d8327597f2218f0caefeed4560f (diff) | |
download | serd-fea20a9af56d5b7640ced14cde92fe6746291502.tar.gz serd-fea20a9af56d5b7640ced14cde92fe6746291502.tar.bz2 serd-fea20a9af56d5b7640ced14cde92fe6746291502.zip |
Use a fixed-size reader stack
This improves performance, and makes the reader more suitable for embedded or
network-facing applications, at the cost of requiring the user to specify a
maximum stack size.
-rw-r--r-- | NEWS | 1 | ||||
-rw-r--r-- | doc/serdi.1 | 4 | ||||
-rw-r--r-- | serd/serd.h | 8 | ||||
-rw-r--r-- | src/n3.c | 334 | ||||
-rw-r--r-- | src/reader.c | 56 | ||||
-rw-r--r-- | src/reader.h | 73 | ||||
-rw-r--r-- | src/serdi.c | 20 | ||||
-rw-r--r-- | src/stack.h | 11 | ||||
-rw-r--r-- | src/string.c | 1 | ||||
-rw-r--r-- | src/writer.c | 2 | ||||
-rw-r--r-- | tests/serd_test.c | 4 | ||||
-rw-r--r-- | wscript | 2 |
12 files changed, 288 insertions, 228 deletions
@@ -9,6 +9,7 @@ serd (1.0.0) unstable; * Remove half-baked serd_uri_to_path() * Bring read/write interface closer to C standard * Add SerdWorld for shared library state + * Use a fixed-size reader stack -- David Robillard <d@drobilla.net> Sat, 19 Jan 2019 13:31:12 +0100 diff --git a/doc/serdi.1 b/doc/serdi.1 index d5051655..04696032 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -42,6 +42,10 @@ Read input as SYNTAX. Valid values (case-insensitive): turtle, ntriples, trig, nquads. .TP +\fB\-k BYTES\fR +Parser stack size. + +.TP \fB\-l\fR Lax (non-strict) parsing. diff --git a/serd/serd.h b/serd/serd.h index b4d5afcc..33c1f3cd 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -99,7 +99,8 @@ typedef enum { SERD_ERR_NOT_FOUND, /**< Not found */ SERD_ERR_ID_CLASH, /**< Encountered clashing blank node IDs */ SERD_ERR_BAD_CURIE, /**< Invalid CURIE (e.g. prefix does not exist) */ - SERD_ERR_INTERNAL /**< Unexpected internal error (should not happen) */ + SERD_ERR_INTERNAL, /**< Unexpected internal error (should not happen) */ + SERD_ERR_OVERFLOW /**< Stack overflow */ } SerdStatus; /** @@ -892,7 +893,10 @@ serd_env_foreach(const SerdEnv* env, */ SERD_API SerdReader* -serd_reader_new(SerdWorld* world, SerdSyntax syntax, const SerdSink* sink); +serd_reader_new(SerdWorld* world, + SerdSyntax syntax, + const SerdSink* sink, + size_t stack_size); /** Enable or disable strict parsing. @@ -42,7 +42,7 @@ fancy_syntax(const SerdReader* reader) } static bool -read_collection(SerdReader* reader, ReadContext ctx, Ref* dest); +read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest); static bool read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); @@ -59,8 +59,8 @@ read_HEX(SerdReader* reader) } // Read UCHAR escape, initial \ is already eaten by caller -static inline bool -read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) +static inline SerdStatus +read_UCHAR(SerdReader* reader, SerdNode* dest, uint32_t* char_code) { const uint8_t b = peek_byte(reader); unsigned length = 0; @@ -72,14 +72,14 @@ read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) length = 4; break; default: - return false; + return SERD_ERR_BAD_SYNTAX; } eat_byte_safe(reader, b); uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (unsigned i = 0; i < length; ++i) { if (!(buf[i] = read_HEX(reader))) { - return false; + return SERD_ERR_BAD_SYNTAX; } } @@ -99,9 +99,9 @@ read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) } else { r_err(reader, SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range\n", code); - push_bytes(reader, dest, replacement_char, 3); *char_code = 0xFFFD; - return true; + const SerdStatus st = push_bytes(reader, dest, replacement_char, 3); + return st ? st : SERD_SUCCESS; } // Build output in buf @@ -127,44 +127,37 @@ read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) buf[0] = (uint8_t)c; } - push_bytes(reader, dest, buf, size); *char_code = code; - return true; + return push_bytes(reader, dest, buf, size); } // Read ECHAR escape, initial \ is already eaten by caller -static inline bool -read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags) +static inline SerdStatus +read_ECHAR(SerdReader* reader, SerdNode* dest, SerdNodeFlags* flags) { const uint8_t c = peek_byte(reader); switch (c) { case 't': eat_byte_safe(reader, 't'); - push_byte(reader, dest, '\t'); - return true; + return push_byte(reader, dest, '\t'); case 'b': eat_byte_safe(reader, 'b'); - push_byte(reader, dest, '\b'); - return true; + return push_byte(reader, dest, '\b'); case 'n': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'n'); - push_byte(reader, dest, '\n'); - return true; + return push_byte(reader, dest, '\n'); case 'r': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'r'); - push_byte(reader, dest, '\r'); - return true; + return push_byte(reader, dest, '\r'); case 'f': eat_byte_safe(reader, 'f'); - push_byte(reader, dest, '\f'); - return true; + return push_byte(reader, dest, '\f'); case '\\': case '"': case '\'': - push_byte(reader, dest, eat_byte_safe(reader, c)); - return true; + return push_byte(reader, dest, eat_byte_safe(reader, c)); default: - return false; + return SERD_ERR_BAD_SYNTAX; } } @@ -202,21 +195,21 @@ read_utf8_bytes(SerdReader* reader, uint8_t bytes[4], uint32_t* size, uint8_t c) } static SerdStatus -read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) +read_utf8_character(SerdReader* reader, SerdNode* dest, uint8_t c) { uint32_t size; uint8_t bytes[4]; SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); if (st) { push_bytes(reader, dest, replacement_char, 3); - } else { - push_bytes(reader, dest, bytes, size); + return st; } - return st; + + return push_bytes(reader, dest, bytes, size); } static SerdStatus -read_utf8_code(SerdReader* reader, Ref dest, uint32_t* code, uint8_t c) +read_utf8_code(SerdReader* reader, SerdNode* dest, uint32_t* code, uint8_t c) { uint32_t size; uint8_t bytes[4] = { 0, 0, 0, 0 }; @@ -226,15 +219,17 @@ read_utf8_code(SerdReader* reader, Ref dest, uint32_t* code, uint8_t c) return st; } - push_bytes(reader, dest, bytes, size); - *code = parse_counted_utf8_char(bytes, size); + if (!(st = push_bytes(reader, dest, bytes, size))) { + *code = parse_counted_utf8_char(bytes, size); + } + return st; } // Read one character (possibly multi-byte) // The first byte, c, has already been eaten by caller static inline SerdStatus -read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c) +read_character(SerdReader* reader, SerdNode* dest, SerdNodeFlags* flags, uint8_t c) { if (!(c & 0x80)) { switch (c) { @@ -245,8 +240,9 @@ read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c) *flags |= SERD_HAS_QUOTE; break; } - push_byte(reader, dest, c); - return SERD_SUCCESS; + + const SerdStatus st = push_byte(reader, dest, c); + return st ? st : SERD_SUCCESS; } return read_utf8_character(reader, dest, c); } @@ -305,17 +301,18 @@ eat_delim(SerdReader* reader, const char delim) // STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE // Initial triple quotes are already eaten by caller -static Ref +static SerdNode* read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) { - Ref ref = push_node(reader, SERD_LITERAL, "", 0); + SerdNode* ref = push_node(reader, SERD_LITERAL, "", 0); + SerdStatus st = SERD_SUCCESS; while (!reader->status) { const uint8_t c = peek_byte(reader); if (c == '\\') { eat_byte_safe(reader, c); uint32_t code; - if (!read_ECHAR(reader, ref, flags) && - !read_UCHAR(reader, ref, &code)) { + if (read_ECHAR(reader, ref, flags) && + read_UCHAR(reader, ref, &code)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid escape `\\%c'\n", peek_byte(reader)); return pop_node(reader, ref); @@ -330,20 +327,21 @@ read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) } *flags |= SERD_HAS_QUOTE; push_byte(reader, ref, c); - read_character(reader, ref, flags, q2); + st = read_character(reader, ref, flags, q2); } else { - read_character(reader, ref, flags, eat_byte_safe(reader, c)); + st = read_character(reader, ref, flags, eat_byte_safe(reader, c)); } } - return ref; + return st ? NULL : ref; } // STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE // Initial quote is already eaten by caller -static Ref +static SerdNode* read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) { - Ref ref = push_node(reader, SERD_LITERAL, "", 0); + SerdNode* ref = push_node(reader, SERD_LITERAL, "", 0); + SerdStatus st = SERD_SUCCESS; while (!reader->status) { const uint8_t c = peek_byte(reader); uint32_t code = 0; @@ -353,8 +351,8 @@ read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) return pop_node(reader, ref); case '\\': eat_byte_safe(reader, c); - if (!read_ECHAR(reader, ref, flags) && - !read_UCHAR(reader, ref, &code)) { + if (read_ECHAR(reader, ref, flags) && + read_UCHAR(reader, ref, &code)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid escape `\\%c'\n", peek_byte(reader)); return pop_node(reader, ref); @@ -365,15 +363,24 @@ read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q) eat_byte_check(reader, q); return ref; } else { - read_character(reader, ref, flags, eat_byte_safe(reader, c)); + st = read_character(reader, ref, flags, eat_byte_safe(reader, c)); } } } - eat_byte_check(reader, q); + + if (st) { + reader->status = st; + return NULL; + } + + if (!eat_byte_check(reader, q)) { + return pop_node(reader, ref); + } + return ref; } -static Ref +static SerdNode* read_String(SerdReader* reader, SerdNodeFlags* flags) { const uint8_t q1 = peek_byte(reader); @@ -391,8 +398,9 @@ read_String(SerdReader* reader, SerdNodeFlags* flags) } if (!fancy_syntax(reader)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, - "syntax does not support long literals\n"); + r_err(reader, SERD_ERR_BAD_SYNTAX, + "syntax does not support long literals\n"); + return NULL; } eat_byte_safe(reader, q3); @@ -411,7 +419,7 @@ is_PN_CHARS_BASE(const uint32_t c) } static SerdStatus -read_PN_CHARS_BASE(SerdReader* reader, Ref dest) +read_PN_CHARS_BASE(SerdReader* reader, SerdNode* dest) { uint32_t code; const uint8_t c = peek_byte(reader); @@ -441,7 +449,7 @@ is_PN_CHARS(const uint32_t c) } static SerdStatus -read_PN_CHARS(SerdReader* reader, Ref dest) +read_PN_CHARS(SerdReader* reader, SerdNode* dest) { uint32_t code; const uint8_t c = peek_byte(reader); @@ -461,7 +469,7 @@ read_PN_CHARS(SerdReader* reader, Ref dest) } static bool -read_PERCENT(SerdReader* reader, Ref dest) +read_PERCENT(SerdReader* reader, SerdNode* dest) { push_byte(reader, dest, eat_byte_safe(reader, '%')); const uint8_t h1 = read_HEX(reader); @@ -475,7 +483,7 @@ read_PERCENT(SerdReader* reader, Ref dest) } static SerdStatus -read_PLX(SerdReader* reader, Ref dest) +read_PLX(SerdReader* reader, SerdNode* dest) { uint8_t c = peek_byte(reader); switch (c) { @@ -499,7 +507,7 @@ read_PLX(SerdReader* reader, Ref dest) } static SerdStatus -read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) +read_PN_LOCAL(SerdReader* reader, SerdNode* dest, bool* ate_dot) { uint8_t c = peek_byte(reader); SerdStatus st = SERD_SUCCESS; @@ -528,10 +536,9 @@ read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) trailing_unescaped_dot = (c == '.'); } - SerdNode* const n = deref(reader, dest); if (trailing_unescaped_dot) { // Ate trailing dot, pop it from stack/node and inform caller - --n->n_bytes; + --dest->n_bytes; serd_stack_pop(&reader->stack, 1); *ate_dot = true; } @@ -541,7 +548,7 @@ read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) // Read the remainder of a PN_PREFIX after some initial characters static SerdStatus -read_PN_PREFIX_tail(SerdReader* reader, Ref dest) +read_PN_PREFIX_tail(SerdReader* reader, SerdNode* dest) { uint8_t c; while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* @@ -552,8 +559,7 @@ read_PN_PREFIX_tail(SerdReader* reader, Ref dest) } } - const SerdNode* const n = deref(reader, dest); - if (serd_node_get_string(n)[n->n_bytes - 1] == '.' && + if (serd_node_get_string(dest)[dest->n_bytes - 1] == '.' && read_PN_CHARS(reader, dest)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n"); return SERD_ERR_BAD_SYNTAX; @@ -563,7 +569,7 @@ read_PN_PREFIX_tail(SerdReader* reader, Ref dest) } static SerdStatus -read_PN_PREFIX(SerdReader* reader, Ref dest) +read_PN_PREFIX(SerdReader* reader, SerdNode* dest) { if (!read_PN_CHARS_BASE(reader, dest)) { return read_PN_PREFIX_tail(reader, dest); @@ -571,14 +577,20 @@ read_PN_PREFIX(SerdReader* reader, Ref dest) return SERD_FAILURE; } -static Ref +static SerdNode* read_LANGTAG(SerdReader* reader) { uint8_t c = peek_byte(reader); if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c); + r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c); + return NULL; + } + + SerdNode* ref = push_node(reader, SERD_LITERAL, "", 0); + if (!ref) { + return NULL; } - Ref ref = push_node(reader, SERD_LITERAL, "", 0); + push_byte(reader, ref, eat_byte_safe(reader, c)); while ((c = peek_byte(reader)) && is_alpha(c)) { push_byte(reader, ref, eat_byte_safe(reader, c)); @@ -593,7 +605,7 @@ read_LANGTAG(SerdReader* reader) } static bool -read_IRIREF_scheme(SerdReader* reader, Ref dest) +read_IRIREF_scheme(SerdReader* reader, SerdNode* dest) { uint8_t c = peek_byte(reader); if (!is_alpha(c)) { @@ -618,12 +630,12 @@ read_IRIREF_scheme(SerdReader* reader, Ref dest) return false; } -static Ref +static SerdNode* read_IRIREF(SerdReader* reader) { TRY_RET(eat_byte_check(reader, '<')); - Ref ref = push_node(reader, SERD_URI, "", 0); - if (!fancy_syntax(reader) && !read_IRIREF_scheme(reader, ref)) { + SerdNode* ref = push_node(reader, SERD_URI, "", 0); + if (!ref || (!fancy_syntax(reader) && !read_IRIREF_scheme(reader, ref))) { return pop_node(reader, ref); } @@ -638,7 +650,7 @@ read_IRIREF(SerdReader* reader) case '>': return ref; case '\\': - if (!read_UCHAR(reader, ref, &code)) { + if (read_UCHAR(reader, ref, &code)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n"); return pop_node(reader, ref); } @@ -677,7 +689,7 @@ read_IRIREF(SerdReader* reader) } static bool -read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot) +read_PrefixedName(SerdReader* reader, SerdNode* dest, bool read_prefix, bool* ate_dot) { if (read_prefix && read_PN_PREFIX(reader, dest) > SERD_FAILURE) { return false; @@ -690,7 +702,7 @@ read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot) } static bool -read_0_9(SerdReader* reader, Ref str, bool at_least_one) +read_0_9(SerdReader* reader, SerdNode* str, bool at_least_one) { unsigned count = 0; for (uint8_t c; is_digit((c = peek_byte(reader))); ++count) { @@ -704,20 +716,23 @@ read_0_9(SerdReader* reader, Ref str, bool at_least_one) static bool read_number(SerdReader* reader, - Ref* dest, - Ref* datatype, + SerdNode** dest, + SerdNode** datatype, SerdNodeFlags* flags, bool* ate_dot) { #define XSD_DECIMAL NS_XSD "decimal" #define XSD_DOUBLE NS_XSD "double" #define XSD_INTEGER NS_XSD "integer" - Ref ref = push_node(reader, SERD_LITERAL, "", 0); - uint8_t c = peek_byte(reader); - bool has_decimal = false; - if (c == '-' || c == '+') { + SerdNode* ref = push_node(reader, SERD_LITERAL, "", 0); + uint8_t c = peek_byte(reader); + bool has_decimal = false; + if (!ref) { + return false; + } else if (c == '-' || c == '+') { push_byte(reader, ref, eat_byte_safe(reader, c)); } + if ((c = peek_byte(reader)) == '.') { has_decimal = true; // decimal case 2 (e.g. '.0' or `-.0' or `+.0') @@ -772,23 +787,29 @@ except: } static bool -read_iri(SerdReader* reader, Ref* dest, bool* ate_dot) +read_iri(SerdReader* reader, SerdNode** dest, bool* ate_dot) { switch (peek_byte(reader)) { case '<': *dest = read_IRIREF(reader); return true; default: - *dest = push_node(reader, SERD_CURIE, "", 0); + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return false; + } return read_PrefixedName(reader, *dest, true, ate_dot); } } static bool -read_literal(SerdReader* reader, Ref* dest, - Ref* datatype, Ref* lang, SerdNodeFlags* flags, bool* ate_dot) +read_literal(SerdReader* reader, + SerdNode** dest, + SerdNode** datatype, + SerdNode** lang, + SerdNodeFlags* flags, + bool* ate_dot) { - Ref str = read_String(reader, flags); + SerdNode* str = read_String(reader, flags); if (!str) { return false; } @@ -816,7 +837,7 @@ except: } static bool -read_verb(SerdReader* reader, Ref* dest) +read_verb(SerdReader* reader, SerdNode** dest) { if (peek_byte(reader) == '<') { return (*dest = read_IRIREF(reader)); @@ -825,10 +846,13 @@ read_verb(SerdReader* reader, Ref* dest) /* Either a qname, or "a". Read the prefix first, and if it is in fact "a", produce that instead. */ - *dest = push_node(reader, SERD_CURIE, "", 0); + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return false; + } + const SerdStatus st = read_PN_PREFIX(reader, *dest); bool ate_dot = false; - SerdNode* node = deref(reader, *dest); + SerdNode* node = *dest; const uint8_t next = peek_byte(reader); if (!st && node->n_bytes == 1 && serd_node_get_string(node)[0] == 'a' && @@ -845,34 +869,36 @@ read_verb(SerdReader* reader, Ref* dest) return true; } -static Ref +static SerdNode* read_BLANK_NODE_LABEL(SerdReader* reader, bool* ate_dot) { eat_byte_safe(reader, '_'); eat_byte_check(reader, ':'); - Ref ref = push_node(reader, SERD_BLANK, - reader->bprefix ? reader->bprefix : "", - reader->bprefix_len); + SerdNode* n = push_node(reader, SERD_BLANK, + reader->bprefix ? reader->bprefix : "", + reader->bprefix_len); + if (!n) { + return NULL; + } uint8_t c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9]) if (is_digit(c) || c == '_') { - push_byte(reader, ref, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, ref)) { + push_byte(reader, n, eat_byte_safe(reader, c)); + } else if (read_PN_CHARS(reader, n)) { r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start character\n"); - return pop_node(reader, ref); + return pop_node(reader, n); } while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* if (c == '.') { - push_byte(reader, ref, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, ref)) { + push_byte(reader, n, eat_byte_safe(reader, c)); + } else if (read_PN_CHARS(reader, n)) { break; } } - SerdNode* n = deref(reader, ref); - char* buf = serd_node_buffer(n); - if (buf[n->n_bytes - 1] == '.' && read_PN_CHARS(reader, ref)) { + char* buf = serd_node_buffer(n); + if (buf[n->n_bytes - 1] == '.' && read_PN_CHARS(reader, n)) { // Ate trailing dot, pop it from stack/node and inform caller --n->n_bytes; serd_stack_pop(&reader->stack, 1); @@ -887,30 +913,31 @@ read_BLANK_NODE_LABEL(SerdReader* reader, bool* ate_dot) } else if (reader->seen_genid && buf[reader->bprefix_len] == 'B') { r_err(reader, SERD_ERR_ID_CLASH, "found both `b' and `B' blank IDs, prefix required\n"); - return pop_node(reader, ref); + return pop_node(reader, n); } } } - return ref; + return n; } -static Ref +static SerdNode* read_blankName(SerdReader* reader) { eat_byte_safe(reader, '='); if (eat_byte_check(reader, '=') != '=') { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n"); + r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n"); + return NULL; } - Ref subject = 0; - bool ate_dot = false; + SerdNode* subject = 0; + bool ate_dot = false; read_ws_star(reader); read_iri(reader, &subject, &ate_dot); return subject; } static bool -read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) +read_anon(SerdReader* reader, ReadContext ctx, bool subject, SerdNode** dest) { const SerdStatementFlags old_flags = *ctx.flags; bool empty; @@ -947,7 +974,7 @@ read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) } read_ws_star(reader); if (reader->sink->end) { - reader->sink->end(reader->sink->handle, deref(reader, *dest)); + reader->sink->end(reader->sink->handle, *dest); } *ctx.flags = old_flags; } @@ -969,10 +996,9 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) bool ret = false; bool simple = (ctx->subject != 0); - SerdNode* node = NULL; - Ref o = 0; - Ref datatype = 0; - Ref lang = 0; + SerdNode* o = 0; + SerdNode* datatype = 0; + SerdNode* lang = 0; uint32_t flags = 0; const uint8_t c = peek_byte(reader); if (!fancy_syntax(reader)) { @@ -1011,17 +1037,16 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) /* Either a boolean literal, or a qname. Read the prefix first, and if it is in fact a "true" or "false" literal, produce that instead. */ - o = push_node(reader, SERD_CURIE, "", 0); + TRY_THROW(o = push_node(reader, SERD_CURIE, "", 0)); while (!read_PN_CHARS_BASE(reader, o)) {} - node = deref(reader, o); - if ((node->n_bytes == 4 && - !memcmp(serd_node_get_string(node), "true", 4)) || - (node->n_bytes == 5 && - !memcmp(serd_node_get_string(node), "false", 5))) { - flags = flags | SERD_HAS_DATATYPE; - node->type = SERD_LITERAL; - datatype = push_node( - reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN); + if ((o->n_bytes == 4 && + !memcmp(serd_node_get_string(o), "true", 4)) || + (o->n_bytes == 5 && + !memcmp(serd_node_get_string(o), "false", 5))) { + flags = flags | SERD_HAS_DATATYPE; + o->type = SERD_LITERAL; + TRY_THROW(datatype = push_node( + reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN)); ret = true; } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) { ret = false; @@ -1033,7 +1058,7 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) } if (simple && o) { - deref(reader, o)->flags = flags; + o->flags = flags; } if (ret && emit && simple) { @@ -1106,7 +1131,7 @@ read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot) } static bool -end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, bool ret) +end_collection(SerdReader* reader, ReadContext ctx, SerdNode* n1, SerdNode* n2, bool ret) { pop_node(reader, n2); pop_node(reader, n1); @@ -1115,7 +1140,7 @@ end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, bool ret) } static bool -read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) +read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest) { eat_byte_safe(reader, '('); bool end = peek_delim(reader, ')'); @@ -1135,10 +1160,14 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) /* The order of node allocation here is necessarily not in stack order, so we create two nodes and recycle them throughout. */ - Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); - Ref n2 = 0; - Ref node = n1; - Ref rest = 0; + SerdNode* n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); + SerdNode* n2 = 0; + SerdNode* node = n1; + SerdNode* rest = 0; + + if (!n1) { + return false; + } ctx.subject = *dest; while (!(end = peek_delim(reader, ')'))) { @@ -1172,8 +1201,8 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) return end_collection(reader, ctx, n1, n2, true); } -static Ref -read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, char* s_type) +static SerdNode* +read_subject(SerdReader* reader, ReadContext ctx, SerdNode** dest, char* s_type) { bool ate_dot = false; switch ((*s_type = peek_byte(reader))) { @@ -1192,11 +1221,11 @@ read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, char* s_type) return ate_dot ? pop_node(reader, *dest) : *dest; } -static Ref +static SerdNode* read_labelOrSubject(SerdReader* reader) { - Ref subject = 0; - bool ate_dot = false; + SerdNode* subject = 0; + bool ate_dot = false; switch (peek_byte(reader)) { case '[': eat_byte_safe(reader, '['); @@ -1237,11 +1266,12 @@ read_base(SerdReader* reader, bool sparql, bool token) TRY_RET(eat_string(reader, "base", 4)); } - Ref uri; read_ws_star(reader); - TRY_RET(uri = read_IRIREF(reader)); - if (reader->sink->base) { - reader->sink->base(reader->sink->handle, deref(reader, uri)); + SerdNode* uri = read_IRIREF(reader); + if (!uri) { + return false; + } else if (reader->sink->base) { + reader->sink->base(reader->sink->handle, uri); } pop_node(reader, uri); @@ -1263,9 +1293,11 @@ read_prefixID(SerdReader* reader, bool sparql, bool token) } read_ws_star(reader); - bool ret = true; - Ref name = push_node(reader, SERD_LITERAL, "", 0); - if (read_PN_PREFIX(reader, name) > SERD_FAILURE) { + bool ret = true; + SerdNode* name = push_node(reader, SERD_LITERAL, "", 0); + if (!name) { + return false; + } else if (read_PN_PREFIX(reader, name) > SERD_FAILURE) { return pop_node(reader, name); } @@ -1274,16 +1306,14 @@ read_prefixID(SerdReader* reader, bool sparql, bool token) } read_ws_star(reader); - const Ref uri = read_IRIREF(reader); + const SerdNode* uri = read_IRIREF(reader); if (!uri) { pop_node(reader, name); return false; } if (reader->sink->prefix) { - ret = !reader->sink->prefix(reader->sink->handle, - deref(reader, name), - deref(reader, uri)); + ret = !reader->sink->prefix(reader->sink->handle, name, uri); } pop_node(reader, uri); pop_node(reader, name); @@ -1326,7 +1356,7 @@ read_wrappedGraph(SerdReader* reader, ReadContext* ctx) bool ate_dot = false; char s_type = 0; ctx->subject = 0; - Ref subj = read_subject(reader, *ctx, &ctx->subject, &s_type); + SerdNode* subj = read_subject(reader, *ctx, &ctx->subject, &s_type); if (!subj && ctx->subject) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad subject\n"); } else if (!subj) { @@ -1346,13 +1376,11 @@ read_wrappedGraph(SerdReader* reader, ReadContext* ctx) } static int -tokcmp(SerdReader* reader, Ref ref, const char* tok, size_t n) +tokcmp(SerdNode* node, const char* tok, size_t n) { - SerdNode* node = deref(reader, ref); - if (!node || node->n_bytes != n) { - return -1; - } - return serd_strncasecmp(serd_node_get_string(node), tok, n); + return ((!node || node->n_bytes != n) + ? -1 + : serd_strncasecmp(serd_node_get_string(node), tok, n)); } bool @@ -1360,7 +1388,7 @@ read_n3_statement(SerdReader* reader) { SerdStatementFlags flags = 0; ReadContext ctx = { 0, 0, 0, 0, 0, 0, &flags }; - Ref subj = 0; + SerdNode* subj = 0; bool ate_dot = false; char s_type = 0; bool ret = true; @@ -1388,11 +1416,11 @@ read_n3_statement(SerdReader* reader) break; default: subj = read_subject(reader, ctx, &ctx.subject, &s_type); - if (!tokcmp(reader, ctx.subject, "base", 4)) { + if (!tokcmp(ctx.subject, "base", 4)) { ret = read_base(reader, true, false); - } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) { + } else if (!tokcmp(ctx.subject, "prefix", 6)) { ret = read_prefixID(reader, true, false); - } else if (!tokcmp(reader, ctx.subject, "graph", 5)) { + } else if (!tokcmp(ctx.subject, "graph", 5)) { read_ws_star(reader); TRY_RET((ctx.graph = read_labelOrSubject(reader))); read_ws_star(reader); diff --git a/src/reader.c b/src/reader.c index a73135d7..6e784eea 100644 --- a/src/reader.c +++ b/src/reader.c @@ -46,9 +46,8 @@ r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...) } void -set_blank_id(SerdReader* reader, Ref ref, size_t buf_size) +set_blank_id(SerdReader* reader, SerdNode* node, size_t buf_size) { - SerdNode* node = deref(reader, ref); char* buf = (char*)(node + 1); const char* prefix = reader->bprefix ? (const char*)reader->bprefix : ""; node->n_bytes = snprintf(buf, buf_size, "%sb%u", prefix, reader->next_id++); @@ -60,21 +59,29 @@ genid_size(SerdReader* reader) return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0 } -Ref +SerdNode* blank_id(SerdReader* reader) { - Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); - set_blank_id(reader, ref, genid_size(reader)); + SerdNode* ref = push_node_padded( + reader, genid_size(reader), SERD_BLANK, "", 0); + if (ref) { + set_blank_id(reader, ref, genid_size(reader)); + } return ref; } -Ref +SerdNode* push_node_padded(SerdReader* reader, size_t maxlen, SerdType type, const char* str, size_t n_bytes) { void* mem = serd_stack_push_aligned( &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode)); + if (!mem) { + reader->status = SERD_ERR_OVERFLOW; + return NULL; + } + SerdNode* const node = (SerdNode*)mem; node->n_bytes = n_bytes; node->flags = 0; @@ -88,49 +95,41 @@ push_node_padded(SerdReader* reader, size_t maxlen, reader->allocs, sizeof(reader->allocs) * (++reader->n_allocs)); reader->allocs[reader->n_allocs - 1] = (mem - reader->stack.buf); #endif - return (char*)node - reader->stack.buf; + return node; } -Ref +SerdNode* push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes) { return push_node_padded(reader, n_bytes, type, str, n_bytes); } SerdNode* -deref(SerdReader* reader, const Ref ref) +pop_node(SerdReader* reader, const SerdNode* node) { - return ref ? (SerdNode*)(reader->stack.buf + ref) : NULL; -} - -Ref -pop_node(SerdReader* reader, Ref ref) -{ - if (ref && ref != reader->rdf_first && ref != reader->rdf_rest - && ref != reader->rdf_nil) { + if (node && node != reader->rdf_first && node != reader->rdf_rest + && node != reader->rdf_nil) { #ifdef SERD_STACK_CHECK - SERD_STACK_ASSERT_TOP(reader, ref); + SERD_STACK_ASSERT_TOP(reader, node); --reader->n_allocs; #endif - SerdNode* const node = deref(reader, ref); - char* const top = reader->stack.buf + reader->stack.size; + char* const top = reader->stack.buf + reader->stack.size; serd_stack_pop_aligned(&reader->stack, top - (char*)node); } - return 0; + return NULL; } bool -emit_statement(SerdReader* reader, ReadContext ctx, Ref o) +emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o) { - SerdNode* graph = deref(reader, ctx.graph); + SerdNode* graph = ctx.graph; if (!graph && reader->default_graph) { graph = reader->default_graph; } bool ret = !reader->sink->statement || !reader->sink->statement( reader->sink->handle, *ctx.flags, graph, - deref(reader, ctx.subject), deref(reader, ctx.predicate), - deref(reader, o)); + ctx.subject, ctx.predicate, o); *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags return ret; } @@ -160,14 +159,17 @@ serd_reader_read_document(SerdReader* reader) } SerdReader* -serd_reader_new(SerdWorld* world, SerdSyntax syntax, const SerdSink* sink) +serd_reader_new(SerdWorld* world, + SerdSyntax syntax, + const SerdSink* sink, + size_t stack_size) { SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader)); me->world = world; me->sink = sink; me->default_graph = NULL; - me->stack = serd_stack_new(SERD_PAGE_SIZE); + me->stack = serd_stack_new(stack_size); me->syntax = syntax; me->next_id = 1; me->strict = true; diff --git a/src/reader.h b/src/reader.h index c3ea2e77..36d6da03 100644 --- a/src/reader.h +++ b/src/reader.h @@ -35,18 +35,13 @@ # define SERD_STACK_ASSERT_TOP(reader, ref) #endif -/* Reference to a node in the stack (we can not use pointers since the - stack may be reallocated, invalidating any pointers to elements). -*/ -typedef size_t Ref; - typedef struct { - Ref graph; - Ref subject; - Ref predicate; - Ref object; - Ref datatype; - Ref lang; + SerdNode* graph; + SerdNode* subject; + SerdNode* predicate; + SerdNode* object; + SerdNode* datatype; + SerdNode* lang; SerdStatementFlags* flags; } ReadContext; @@ -55,9 +50,9 @@ struct SerdReaderImpl { const SerdSink* sink; SerdErrorSink error_sink; void* error_handle; - Ref rdf_first; - Ref rdf_rest; - Ref rdf_nil; + SerdNode* rdf_first; + SerdNode* rdf_rest; + SerdNode* rdf_nil; SerdNode* default_graph; SerdByteSource source; SerdStack stack; @@ -70,33 +65,31 @@ struct SerdReaderImpl { bool strict; ///< True iff strict parsing bool seen_genid; #ifdef SERD_STACK_CHECK - Ref* allocs; ///< Stack of push offsets + SerdNode** allocs; ///< Stack of push offsets size_t n_allocs; ///< Number of stack pushes #endif }; int r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); -Ref push_node_padded(SerdReader* reader, - size_t maxlen, - SerdType type, - const char* str, - size_t n_bytes); - -Ref push_node(SerdReader* reader, - SerdType type, - const char* str, - size_t n_bytes); +SerdNode* push_node_padded(SerdReader* reader, + size_t maxlen, + SerdType type, + const char* str, + size_t n_bytes); -size_t genid_size(SerdReader* reader); -Ref blank_id(SerdReader* reader); -void set_blank_id(SerdReader* reader, Ref ref, size_t buf_size); +SerdNode* push_node(SerdReader* reader, + SerdType type, + const char* str, + size_t n_bytes); -SerdNode* deref(SerdReader* reader, Ref ref); +size_t genid_size(SerdReader* reader); +SerdNode* blank_id(SerdReader* reader); +void set_blank_id(SerdReader* reader, SerdNode* node, size_t buf_size); -Ref pop_node(SerdReader* reader, Ref ref); +SerdNode* pop_node(SerdReader* reader, const SerdNode* node); -bool emit_statement(SerdReader* reader, ReadContext ctx, Ref o); +bool emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o); bool read_n3_statement(SerdReader* reader); SerdStatus read_nquadsDoc(SerdReader* reader); @@ -151,23 +144,31 @@ eat_string(SerdReader* reader, const char* str, unsigned n) } static inline SerdStatus -push_byte(SerdReader* reader, Ref ref, const uint8_t c) +push_byte(SerdReader* reader, SerdNode* node, const uint8_t c) { SERD_STACK_ASSERT_TOP(reader, ref); - char* const s = (char*)serd_stack_push(&reader->stack, 1); - SerdNode* const node = (SerdNode*)(reader->stack.buf + ref); + char* const s = (char*)serd_stack_push(&reader->stack, 1); + if (!s) { + return SERD_ERR_OVERFLOW; + } + ++node->n_bytes; *(s - 1) = c; *s = '\0'; return SERD_SUCCESS; } -static inline void -push_bytes(SerdReader* reader, Ref ref, const uint8_t* bytes, unsigned len) +static inline SerdStatus +push_bytes(SerdReader* reader, SerdNode* ref, const uint8_t* bytes, unsigned len) { + if (reader->stack.buf_size < reader->stack.size + len) { + return SERD_ERR_OVERFLOW; + } + for (unsigned i = 0; i < len; ++i) { push_byte(reader, ref, bytes[i]); } + return SERD_SUCCESS; } #endif // SERD_READER_H diff --git a/src/serdi.c b/src/serdi.c index 2627f572..ee437f90 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -24,9 +24,10 @@ #include <io.h> #endif +#include <limits.h> #include <stdbool.h> #include <stdio.h> -#include <string.h> +#include <stdlib.h> #define SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg); #define SERDI_ERRORF(fmt, ...) fprintf(stderr, "serdi: " fmt, __VA_ARGS__); @@ -97,6 +98,7 @@ print_usage(const char* name, bool error) fprintf(os, " -f Keep full URIs in input (don't qualify).\n"); fprintf(os, " -h Display this help and exit.\n"); fprintf(os, " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n"); + fprintf(os, " -k BYTES Parser stack size.\n"); fprintf(os, " -l Lax (non-strict) parsing.\n"); fprintf(os, " -o SYNTAX Output syntax: turtle/ntriples/nquads.\n"); fprintf(os, " -p PREFIX Add PREFIX to blank node IDs.\n"); @@ -139,6 +141,7 @@ main(int argc, char** argv) bool full_uris = false; bool lax = false; bool quiet = false; + size_t stack_size = 4194304; const char* add_prefix = NULL; const char* chop_prefix = NULL; const char* root_uri = NULL; @@ -173,6 +176,17 @@ main(int argc, char** argv) } else if (!(input_syntax = get_syntax(argv[a]))) { return print_usage(argv[0], true); } + } else if (argv[a][1] == 'k') { + if (++a == argc) { + return missing_arg(argv[0], 'k'); + } + char* endptr = NULL; + const long size = strtol(argv[a], &endptr, 10); + if (size <= 0 || size == LONG_MAX || *endptr != '\0') { + SERDI_ERRORF("invalid stack size `%s'\n", argv[a]); + return 1; + } + stack_size = (size_t)size; } else if (argv[a][1] == 'o') { if (++a == argc) { return missing_arg(argv[0], 'o'); @@ -261,8 +275,8 @@ main(int argc, char** argv) (SerdWriteFunc)fwrite, out_fd); - SerdReader* reader = - serd_reader_new(world, input_syntax, serd_writer_get_sink(writer)); + SerdReader* reader = serd_reader_new( + world, input_syntax, serd_writer_get_sink(writer), stack_size); serd_reader_set_strict(reader, !lax); if (quiet) { diff --git a/src/stack.h b/src/stack.h index 9f112b6c..c066a75e 100644 --- a/src/stack.h +++ b/src/stack.h @@ -63,8 +63,7 @@ serd_stack_push(SerdStack* stack, size_t n_bytes) { const size_t new_size = stack->size + n_bytes; if (stack->buf_size < new_size) { - stack->buf_size += (stack->buf_size >> 1); // *= 1.5 - stack->buf = (char*)realloc(stack->buf, stack->buf_size); + return NULL; } char* const ret = (stack->buf + stack->size); stack->size = new_size; @@ -82,12 +81,16 @@ static inline void* serd_stack_push_aligned(SerdStack* stack, size_t n_bytes, size_t align) { // Push one byte to ensure space for a pad count - serd_stack_push(stack, 1); + if (!serd_stack_push(stack, 1)) { + return NULL; + } // Push padding if necessary const uint8_t pad = align - stack->size % align; if (pad > 0) { - serd_stack_push(stack, pad); + if (!serd_stack_push(stack, pad)) { + return NULL; + } } // Set top of stack to pad count so we can properly pop later diff --git a/src/string.c b/src/string.c index 279b2670..d6f98388 100644 --- a/src/string.c +++ b/src/string.c @@ -43,6 +43,7 @@ serd_strerror(SerdStatus status) case SERD_ERR_ID_CLASH: return "Blank node ID clash"; case SERD_ERR_BAD_CURIE: return "Invalid CURIE"; case SERD_ERR_INTERNAL: return "Internal error"; + case SERD_ERR_OVERFLOW: return "Stack overflow"; } return "Unknown error"; // never reached } diff --git a/src/writer.c b/src/writer.c index e5ee5e58..7c939961 100644 --- a/src/writer.c +++ b/src/writer.c @@ -879,7 +879,7 @@ serd_writer_new(SerdWorld* world, writer->env = env; writer->root_node = NULL; writer->root_uri = SERD_URI_NULL; - writer->anon_stack = serd_stack_new(4 * sizeof(WriteContext)); + writer->anon_stack = serd_stack_new(SERD_PAGE_SIZE); writer->context = context; writer->list_subj = NULL; writer->empty = true; diff --git a/tests/serd_test.c b/tests/serd_test.c index a77bcbbb..d9fc1388 100644 --- a/tests/serd_test.c +++ b/tests/serd_test.c @@ -257,7 +257,7 @@ main(void) const char* msg = NULL; assert(!strcmp((msg = serd_strerror(SERD_SUCCESS)), "Success")); - for (int i = SERD_FAILURE; i <= SERD_ERR_INTERNAL; ++i) { + for (int i = SERD_FAILURE; i <= SERD_ERR_OVERFLOW; ++i) { msg = serd_strerror((SerdStatus)i); assert(strcmp(msg, "Success")); } @@ -570,7 +570,7 @@ main(void) ReaderTest rt = { 0, NULL }; SerdSink sink = { &rt, NULL, NULL, test_sink, NULL }; - SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink); + SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink, 4096); assert(reader); SerdNode* g = serd_node_new_uri("http://example.org/"); @@ -431,6 +431,8 @@ def test(tst): check([serdi, '-i', 'illegal']) check([serdi, '-i', 'turtle']) check([serdi, '-i']) + check([serdi, '-k']) + check([serdi, '-k', '-1']) check([serdi, '-o', 'illegal']) check([serdi, '-o']) check([serdi, '-p']) |