diff options
author | David Robillard <d@drobilla.net> | 2021-07-08 16:15:46 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2022-01-13 23:03:31 -0500 |
commit | 5c90b6aff410bb4a9737680baffd79d10b5281fd (patch) | |
tree | e6f4c94fbc1c230fb238e3c91d8dc85adc0a1dec | |
parent | 00af9fa4e0344b1ff642a7ccd63626f77521ea8a (diff) | |
download | serd-5c90b6aff410bb4a9737680baffd79d10b5281fd.tar.gz serd-5c90b6aff410bb4a9737680baffd79d10b5281fd.tar.bz2 serd-5c90b6aff410bb4a9737680baffd79d10b5281fd.zip |
Use a fixed-size reader stack
-rw-r--r-- | NEWS | 1 | ||||
-rw-r--r-- | doc/serdi.1 | 9 | ||||
-rw-r--r-- | include/serd/serd.h | 9 | ||||
-rw-r--r-- | src/n3.c | 273 | ||||
-rw-r--r-- | src/node.c | 2 | ||||
-rw-r--r-- | src/node.h | 3 | ||||
-rw-r--r-- | src/reader.c | 83 | ||||
-rw-r--r-- | src/reader.h | 62 | ||||
-rw-r--r-- | src/serdi.c | 19 | ||||
-rw-r--r-- | src/stack.h | 12 | ||||
-rw-r--r-- | src/string.c | 4 | ||||
-rw-r--r-- | test/meson.build | 6 | ||||
-rw-r--r-- | test/test_read_chunk.c | 2 | ||||
-rw-r--r-- | test/test_reader_writer.c | 17 | ||||
-rw-r--r-- | test/test_string.c | 2 |
15 files changed, 292 insertions, 212 deletions
@@ -7,6 +7,7 @@ serd (1.0.1) unstable; * Remove support for Turtle named inline nodes extension * Remove useless character counting from API * Rename SerdChunk to SerdStringView + * Use a fixed-size reader stack * Use char* for strings in public API -- David Robillard <d@drobilla.net> Wed, 13 Jan 2021 13:29:44 +0000 diff --git a/doc/serdi.1 b/doc/serdi.1 index 0684a6e0..55fe6492 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -9,6 +9,7 @@ .Op Fl abefhlqv .Op Fl c Ar prefix .Op Fl i Ar syntax +.Op Fl k Ar bytes .Op Fl o Ar syntax .Op Fl p Ar prefix .Op Fl r Ar root @@ -68,6 +69,14 @@ Case is ignored, valid values are: .Dq TriG , .Dq Turtle . .Pp +.It Fl k Ar bytes +Parser stack size. +For performance and security reasons, parsing is performed with a fixed-size stack. +By default, the stack is 4 MiB (4194304 bytes), which should be sufficient for most data. +If some data has very deep nesting or very large literal values, +it may exceed the default amount of space, +and this option can be used to increase it and allow the document to be parsed successfully. +.Pp .It Fl l Lax (non-strict) parsing. If this is enabled, recoverable syntax errors will print a warning, but parsing will proceed starting at the next statement if possible. diff --git a/include/serd/serd.h b/include/serd/serd.h index bfe53401..57f5af75 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -202,8 +202,9 @@ typedef enum { SERD_ERR_BAD_ARG, ///< Invalid argument SERD_ERR_NOT_FOUND, ///< Not found SERD_ERR_ID_CLASH, ///< Encountered clashing blank node IDs - SERD_ERR_BAD_CURIE, ///< Invalid CURIE (e.g. 
prefix does not exist) - SERD_ERR_INTERNAL ///< Unexpected internal error (should not happen) + SERD_ERR_BAD_CURIE, ///< Invalid CURIE or unknown namespace prefix + SERD_ERR_INTERNAL, ///< Unexpected internal error + SERD_ERR_OVERFLOW, ///< Stack overflow } SerdStatus; /// Return a string describing a status code @@ -930,7 +931,9 @@ typedef struct SerdReaderImpl SerdReader; /// Create a new RDF reader SERD_API SerdReader* SERD_ALLOCATED -serd_reader_new(SerdSyntax syntax, const SerdSink* SERD_NONNULL sink); +serd_reader_new(SerdSyntax syntax, + const SerdSink* SERD_NONNULL sink, + size_t stack_size); /** Enable or disable strict parsing. @@ -55,7 +55,7 @@ fancy_syntax(const SerdReader* const reader) } static SerdStatus -read_collection(SerdReader* reader, ReadContext ctx, Ref* dest); +read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest); static SerdStatus read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); @@ -74,7 +74,9 @@ read_HEX(SerdReader* const reader) // Read UCHAR escape, initial \ is already eaten by caller static SerdStatus -read_UCHAR(SerdReader* const reader, const Ref dest, uint32_t* const char_code) +read_UCHAR(SerdReader* const reader, + SerdNode* const dest, + uint32_t* const char_code) { const int b = peek_byte(reader); unsigned length = 0; @@ -88,7 +90,6 @@ read_UCHAR(SerdReader* const reader, const Ref dest, uint32_t* const char_code) default: return SERD_ERR_BAD_SYNTAX; } - eat_byte_safe(reader, b); uint8_t buf[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; @@ -116,9 +117,9 @@ read_UCHAR(SerdReader* const reader, const Ref dest, uint32_t* const char_code) SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range\n", code); - push_bytes(reader, dest, replacement_char, 3); - *char_code = 0xFFFD; - return SERD_SUCCESS; + *char_code = 0xFFFD; + const SerdStatus st = push_bytes(reader, dest, replacement_char, 3); + return st ? 
st : SERD_SUCCESS; } // Build output in buf @@ -147,44 +148,39 @@ read_UCHAR(SerdReader* const reader, const Ref dest, uint32_t* const char_code) break; } - push_bytes(reader, dest, buf, size); *char_code = code; - return SERD_SUCCESS; + return push_bytes(reader, dest, buf, size); } // Read ECHAR escape, initial \ is already eaten by caller static SerdStatus -read_ECHAR(SerdReader* const reader, const Ref dest, SerdNodeFlags* const flags) +read_ECHAR(SerdReader* const reader, + SerdNode* const dest, + SerdNodeFlags* const flags) { const int c = peek_byte(reader); switch (c) { case 't': eat_byte_safe(reader, 't'); - push_byte(reader, dest, '\t'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\t'); case 'b': eat_byte_safe(reader, 'b'); - push_byte(reader, dest, '\b'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\b'); case 'n': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'n'); - push_byte(reader, dest, '\n'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\n'); case 'r': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'r'); - push_byte(reader, dest, '\r'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\r'); case 'f': eat_byte_safe(reader, 'f'); - push_byte(reader, dest, '\f'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\f'); case '\\': case '"': case '\'': - push_byte(reader, dest, eat_byte_safe(reader, c)); - return SERD_SUCCESS; + return push_byte(reader, dest, eat_byte_safe(reader, c)); default: return SERD_ERR_BAD_SYNTAX; } @@ -229,23 +225,24 @@ read_utf8_bytes(SerdReader* const reader, } static SerdStatus -read_utf8_character(SerdReader* const reader, const Ref dest, const uint8_t c) +read_utf8_character(SerdReader* const reader, + SerdNode* const dest, + const uint8_t c) { uint32_t size = 0; uint8_t bytes[4] = {0, 0, 0, 0}; SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); if (st) { push_bytes(reader, dest, replacement_char, 3); - } else { - push_bytes(reader, dest, bytes, size); + return 
st; } - return st; + return push_bytes(reader, dest, bytes, size); } static SerdStatus read_utf8_code(SerdReader* const reader, - const Ref dest, + SerdNode* const dest, uint32_t* const code, const uint8_t c) { @@ -257,8 +254,10 @@ read_utf8_code(SerdReader* const reader, return st; } - push_bytes(reader, dest, bytes, size); - *code = parse_counted_utf8_char(bytes, size); + if (!(st = push_bytes(reader, dest, bytes, size))) { + *code = parse_counted_utf8_char(bytes, size); + } + return st; } @@ -266,7 +265,7 @@ read_utf8_code(SerdReader* const reader, // The first byte, c, has already been eaten by caller static SerdStatus read_character(SerdReader* const reader, - const Ref dest, + SerdNode* const dest, SerdNodeFlags* const flags, const uint8_t c) { @@ -283,9 +282,9 @@ read_character(SerdReader* const reader, default: break; } + return push_byte(reader, dest, c); } - return read_utf8_character(reader, dest, c); } @@ -351,7 +350,7 @@ eat_delim(SerdReader* const reader, const uint8_t delim) // Initial triple quotes are already eaten by caller static SerdStatus read_STRING_LITERAL_LONG(SerdReader* const reader, - const Ref ref, + SerdNode* const ref, SerdNodeFlags* const flags, const uint8_t q) { @@ -392,7 +391,7 @@ read_STRING_LITERAL_LONG(SerdReader* const reader, // Initial quote is already eaten by caller static SerdStatus read_STRING_LITERAL(SerdReader* const reader, - const Ref ref, + SerdNode* const ref, SerdNodeFlags* const flags, const uint8_t q) { @@ -432,7 +431,7 @@ read_STRING_LITERAL(SerdReader* const reader, static SerdStatus read_String(SerdReader* const reader, - const Ref node, + SerdNode* const node, SerdNodeFlags* const flags) { const int q1 = peek_byte(reader); @@ -478,7 +477,7 @@ is_PN_CHARS_BASE(const uint32_t c) } static SerdStatus -read_PN_CHARS_BASE(SerdReader* const reader, const Ref dest) +read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest) { uint32_t code = 0; const int c = peek_byte(reader); @@ -508,7 +507,7 @@ 
is_PN_CHARS(const uint32_t c) } static SerdStatus -read_PN_CHARS(SerdReader* const reader, const Ref dest) +read_PN_CHARS(SerdReader* const reader, SerdNode* const dest) { uint32_t code = 0; const int c = peek_byte(reader); @@ -528,7 +527,7 @@ read_PN_CHARS(SerdReader* const reader, const Ref dest) } static SerdStatus -read_PERCENT(SerdReader* const reader, const Ref dest) +read_PERCENT(SerdReader* const reader, SerdNode* const dest) { push_byte(reader, dest, eat_byte_safe(reader, '%')); const uint8_t h1 = read_HEX(reader); @@ -542,7 +541,7 @@ read_PERCENT(SerdReader* const reader, const Ref dest) } static SerdStatus -read_PN_LOCAL_ESC(SerdReader* const reader, const Ref dest) +read_PN_LOCAL_ESC(SerdReader* const reader, SerdNode* const dest) { eat_byte_safe(reader, '\\'); @@ -578,7 +577,7 @@ read_PN_LOCAL_ESC(SerdReader* const reader, const Ref dest) } static SerdStatus -read_PLX(SerdReader* const reader, const Ref dest) +read_PLX(SerdReader* const reader, SerdNode* const dest) { const int c = peek_byte(reader); switch (c) { @@ -592,7 +591,9 @@ read_PLX(SerdReader* const reader, const Ref dest) } static SerdStatus -read_PN_LOCAL(SerdReader* const reader, const Ref dest, bool* const ate_dot) +read_PN_LOCAL(SerdReader* const reader, + SerdNode* const dest, + bool* const ate_dot) { int c = peek_byte(reader); SerdStatus st = SERD_SUCCESS; @@ -631,10 +632,9 @@ read_PN_LOCAL(SerdReader* const reader, const Ref dest, bool* const ate_dot) trailing_unescaped_dot = (c == '.'); } - SerdNode* const n = deref(reader, dest); if (trailing_unescaped_dot) { // Ate trailing dot, pop it from stack/node and inform caller - --n->length; + --dest->length; serd_stack_pop(&reader->stack, 1); *ate_dot = true; } @@ -644,28 +644,29 @@ read_PN_LOCAL(SerdReader* const reader, const Ref dest, bool* const ate_dot) // Read the remainder of a PN_PREFIX after some initial characters static SerdStatus -read_PN_PREFIX_tail(SerdReader* const reader, const Ref dest) +read_PN_PREFIX_tail(SerdReader* 
const reader, SerdNode* const dest) { - int c = 0; + SerdStatus st = SERD_SUCCESS; + int c = 0; while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* if (c == '.') { - push_byte(reader, dest, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, dest)) { + st = push_byte(reader, dest, eat_byte_safe(reader, c)); + } else if ((st = read_PN_CHARS(reader, dest))) { break; } } - const SerdNode* const n = deref(reader, dest); - if (serd_node_string(n)[serd_node_length(n) - 1] == '.' && + if (st <= SERD_FAILURE && + serd_node_string(dest)[serd_node_length(dest) - 1] == '.' && read_PN_CHARS(reader, dest)) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n"); } - return SERD_SUCCESS; + return st > SERD_FAILURE ? st : SERD_SUCCESS; } static SerdStatus -read_PN_PREFIX(SerdReader* const reader, const Ref dest) +read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest) { if (!read_PN_CHARS_BASE(reader, dest)) { return read_PN_PREFIX_tail(reader, dest); @@ -675,33 +676,33 @@ read_PN_PREFIX(SerdReader* const reader, const Ref dest) } static SerdStatus -read_LANGTAG(SerdReader* const reader, Ref* const dest) +read_LANGTAG(SerdReader* const reader, SerdNode** const dest) { int c = peek_byte(reader); if (!is_alpha(c)) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c); } - *dest = push_node(reader, SERD_LITERAL, "", 0); + if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + return SERD_ERR_OVERFLOW; + } SerdStatus st = SERD_SUCCESS; TRY(st, push_byte(reader, *dest, eat_byte_safe(reader, c))); while ((c = peek_byte(reader)) && is_alpha(c)) { TRY(st, push_byte(reader, *dest, eat_byte_safe(reader, c))); } - while (peek_byte(reader) == '-') { TRY(st, push_byte(reader, *dest, eat_byte_safe(reader, '-'))); while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) { TRY(st, push_byte(reader, *dest, eat_byte_safe(reader, c))); } } - return SERD_SUCCESS; } static SerdStatus -read_IRIREF_scheme(SerdReader* const reader, const 
Ref dest) +read_IRIREF_scheme(SerdReader* const reader, SerdNode* const dest) { int c = peek_byte(reader); if (!is_alpha(c)) { @@ -731,7 +732,7 @@ read_IRIREF_scheme(SerdReader* const reader, const Ref dest) } static SerdStatus -read_IRIREF(SerdReader* const reader, Ref* const dest) +read_IRIREF(SerdReader* const reader, SerdNode** const dest) { if (!eat_byte_check(reader, '<')) { return SERD_ERR_BAD_SYNTAX; @@ -817,7 +818,7 @@ read_IRIREF(SerdReader* const reader, Ref* const dest) static SerdStatus read_PrefixedName(SerdReader* const reader, - const Ref dest, + SerdNode* const dest, const bool read_prefix, bool* const ate_dot) { @@ -838,7 +839,7 @@ read_PrefixedName(SerdReader* const reader, } static SerdStatus -read_0_9(SerdReader* const reader, const Ref str, const bool at_least_one) +read_0_9(SerdReader* const reader, SerdNode* const str, const bool at_least_one) { unsigned count = 0; SerdStatus st = SERD_SUCCESS; @@ -855,8 +856,8 @@ read_0_9(SerdReader* const reader, const Ref str, const bool at_least_one) static SerdStatus read_number(SerdReader* const reader, - Ref* const dest, - Ref* const datatype, + SerdNode** const dest, + SerdNode** const datatype, SerdNodeFlags* const flags, bool* const ate_dot) { @@ -869,9 +870,14 @@ read_number(SerdReader* const reader, SerdStatus st = SERD_SUCCESS; int c = peek_byte(reader); bool has_decimal = false; + if (!*dest) { + return SERD_ERR_OVERFLOW; + } + if (c == '-' || c == '+') { push_byte(reader, *dest, eat_byte_safe(reader, c)); } + if ((c = peek_byte(reader)) == '.') { has_decimal = true; // decimal case 2 (e.g. 
'.0' or `-.0' or `+.0') @@ -925,22 +931,24 @@ read_number(SerdReader* const reader, } static SerdStatus -read_iri(SerdReader* const reader, Ref* const dest, bool* const ate_dot) +read_iri(SerdReader* const reader, SerdNode** const dest, bool* const ate_dot) { switch (peek_byte(reader)) { case '<': return read_IRIREF(reader, dest); default: - *dest = push_node(reader, SERD_CURIE, "", 0); + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } return read_PrefixedName(reader, *dest, true, ate_dot); } } static SerdStatus read_literal(SerdReader* const reader, - Ref* const dest, - Ref* const datatype, - Ref* const lang, + SerdNode** const dest, + SerdNode** const datatype, + SerdNode** const lang, SerdNodeFlags* const flags, bool* const ate_dot) { @@ -979,7 +987,7 @@ read_literal(SerdReader* const reader, } static SerdStatus -read_verb(SerdReader* const reader, Ref* const dest) +read_verb(SerdReader* const reader, SerdNode** const dest) { if (peek_byte(reader) == '<') { return read_IRIREF(reader, dest); @@ -988,11 +996,13 @@ read_verb(SerdReader* const reader, Ref* const dest) /* Either a qname, or "a". Read the prefix first, and if it is in fact "a", produce that instead. */ - *dest = push_node(reader, SERD_CURIE, "", 0); + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } SerdStatus st = read_PN_PREFIX(reader, *dest); bool ate_dot = false; - SerdNode* node = deref(reader, *dest); + SerdNode* node = *dest; const int next = peek_byte(reader); if (!st && node->length == 1 && serd_node_string(node)[0] == 'a' && next != ':' && !is_PN_CHARS_BASE((uint32_t)next)) { @@ -1012,36 +1022,41 @@ read_verb(SerdReader* const reader, Ref* const dest) static SerdStatus read_BLANK_NODE_LABEL(SerdReader* const reader, - Ref* const dest, + SerdNode** const dest, bool* const ate_dot) { eat_byte_safe(reader, '_'); eat_byte_check(reader, ':'); - const Ref ref = *dest = push_node(reader, - SERD_BLANK, - reader->bprefix ? 
reader->bprefix : "", - reader->bprefix_len); + SerdStatus st = SERD_SUCCESS; + + SerdNode* n = *dest = push_node(reader, + SERD_BLANK, + reader->bprefix ? reader->bprefix : "", + reader->bprefix_len); int c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9]) if (is_digit(c) || c == '_') { - push_byte(reader, ref, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, ref)) { - *dest = pop_node(reader, *dest); - return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start\n"); + push_byte(reader, n, eat_byte_safe(reader, c)); + } else if ((st = read_PN_CHARS(reader, n))) { + *dest = pop_node(reader, n); + return r_err(reader, st, "invalid name start\n"); } while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* if (c == '.') { - push_byte(reader, ref, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, ref)) { + TRY(st, push_byte(reader, n, eat_byte_safe(reader, c))); + } else if ((st = read_PN_CHARS(reader, n))) { break; } } - SerdNode* n = deref(reader, ref); - char* buf = serd_node_buffer(n); - if (buf[n->length - 1] == '.' && read_PN_CHARS(reader, ref)) { + if (st > SERD_FAILURE) { + return st; + } + + char* buf = serd_node_buffer(n); + if (buf[n->length - 1] == '.' 
&& read_PN_CHARS(reader, n)) { // Ate trailing dot, pop it from stack/node and inform caller --n->length; serd_stack_pop(&reader->stack, 1); @@ -1054,13 +1069,14 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, buf[reader->bprefix_len] = 'B'; // Prevent clash reader->seen_genid = true; } else if (reader->seen_genid && buf[reader->bprefix_len] == 'B') { - *dest = pop_node(reader, *dest); + *dest = pop_node(reader, n); return r_err(reader, SERD_ERR_ID_CLASH, "found both `b' and `B' blank IDs, prefix required\n"); } } } + return SERD_SUCCESS; } @@ -1068,7 +1084,7 @@ static SerdStatus read_anon(SerdReader* const reader, ReadContext ctx, const bool subject, - Ref* const dest) + SerdNode** const dest) { const SerdStatementFlags old_flags = *ctx.flags; bool empty = false; @@ -1100,7 +1116,7 @@ read_anon(SerdReader* const reader, return r_err(reader, SERD_ERR_BAD_SYNTAX, "`.' inside blank\n"); } read_ws_star(reader); - serd_sink_write_end(reader->sink, deref(reader, *dest)); + serd_sink_write_end(reader->sink, *dest); *ctx.flags = old_flags; } return (eat_byte_check(reader, ']') == ']') ? SERD_SUCCESS @@ -1126,10 +1142,9 @@ read_object(SerdReader* const reader, SerdStatus ret = SERD_FAILURE; bool simple = (ctx->subject != 0); - SerdNode* node = NULL; - Ref o = 0; - Ref datatype = 0; - Ref lang = 0; + SerdNode* o = 0; + SerdNode* datatype = 0; + SerdNode* lang = 0; uint32_t flags = 0; const int c = peek_byte(reader); if (!fancy_syntax(reader)) { @@ -1185,16 +1200,18 @@ read_object(SerdReader* const reader, /* Either a boolean literal, or a qname. Read the prefix first, and if it is in fact a "true" or "false" literal, produce that instead. 
*/ - o = push_node(reader, SERD_CURIE, "", 0); + if (!(o = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } + while (!read_PN_CHARS_BASE(reader, o)) { } - node = deref(reader, o); - if ((node->length == 4 && !memcmp(serd_node_string(node), "true", 4)) || - (node->length == 5 && !memcmp(serd_node_string(node), "false", 5))) { - flags = flags | SERD_HAS_DATATYPE; - node->type = SERD_LITERAL; - datatype = push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN); - ret = SERD_SUCCESS; + if ((o->length == 4 && !memcmp(serd_node_string(o), "true", 4)) || + (o->length == 5 && !memcmp(serd_node_string(o), "false", 5))) { + flags = flags | SERD_HAS_DATATYPE; + o->type = SERD_LITERAL; + datatype = push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN); + ret = SERD_SUCCESS; } else if (read_PN_PREFIX_tail(reader, o) > SERD_FAILURE) { ret = SERD_ERR_BAD_SYNTAX; } else { @@ -1207,7 +1224,7 @@ read_object(SerdReader* const reader, } if (!ret && simple && o) { - deref(reader, o)->flags = flags; + o->flags = flags; } if (!ret && emit && simple) { @@ -1287,8 +1304,8 @@ read_predicateObjectList(SerdReader* const reader, static SerdStatus end_collection(SerdReader* const reader, const ReadContext ctx, - const Ref n1, - const Ref n2, + SerdNode* const n1, + SerdNode* const n2, const SerdStatus st) { pop_node(reader, n2); @@ -1303,7 +1320,9 @@ end_collection(SerdReader* const reader, } static SerdStatus -read_collection(SerdReader* const reader, ReadContext ctx, Ref* const dest) +read_collection(SerdReader* const reader, + ReadContext ctx, + SerdNode** const dest) { SerdStatus st = SERD_SUCCESS; eat_byte_safe(reader, '('); @@ -1324,10 +1343,15 @@ read_collection(SerdReader* const reader, ReadContext ctx, Ref* const dest) /* The order of node allocation here is necessarily not in stack order, so we create two nodes and recycle them throughout. 
*/ - Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); - Ref n2 = 0; - Ref node = n1; - Ref rest = 0; + SerdNode* n1 = + push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); + SerdNode* n2 = 0; + SerdNode* node = n1; + SerdNode* rest = 0; + + if (!n1) { + return SERD_ERR_OVERFLOW; + } ctx.subject = *dest; while (!peek_delim(reader, ')')) { @@ -1363,8 +1387,8 @@ read_collection(SerdReader* const reader, ReadContext ctx, Ref* const dest) static SerdStatus read_subject(SerdReader* const reader, - const ReadContext ctx, - Ref* const dest, + ReadContext ctx, + SerdNode** const dest, int* const s_type) { SerdStatus st = SERD_SUCCESS; @@ -1392,7 +1416,7 @@ read_subject(SerdReader* const reader, } static SerdStatus -read_labelOrSubject(SerdReader* const reader, Ref* const dest) +read_labelOrSubject(SerdReader* const reader, SerdNode** const dest) { bool ate_dot = false; switch (peek_byte(reader)) { @@ -1445,9 +1469,9 @@ read_base(SerdReader* const reader, const bool sparql, const bool token) read_ws_star(reader); - Ref uri = 0; + SerdNode* uri = NULL; TRY(st, read_IRIREF(reader, &uri)); - TRY(st, serd_sink_write_base(reader->sink, deref(reader, uri))); + TRY(st, serd_sink_write_base(reader->sink, uri)); pop_node(reader, uri); read_ws_star(reader); @@ -1471,7 +1495,11 @@ read_prefixID(SerdReader* const reader, const bool sparql, const bool token) } read_ws_star(reader); - Ref name = push_node(reader, SERD_LITERAL, "", 0); + SerdNode* name = push_node(reader, SERD_LITERAL, "", 0); + if (!name) { + return SERD_ERR_OVERFLOW; + } + if ((st = read_PN_PREFIX(reader, name)) > SERD_FAILURE) { return st; } @@ -1482,11 +1510,10 @@ read_prefixID(SerdReader* const reader, const bool sparql, const bool token) } read_ws_star(reader); - Ref uri = 0; + SerdNode* uri = NULL; TRY(st, read_IRIREF(reader, &uri)); - st = serd_sink_write_prefix( - reader->sink, deref(reader, name), deref(reader, uri)); + st = serd_sink_write_prefix(reader->sink, name, uri); 
pop_node(reader, uri); pop_node(reader, name); @@ -1564,17 +1591,11 @@ read_wrappedGraph(SerdReader* const reader, ReadContext* const ctx) } static int -tokcmp(SerdReader* const reader, - const Ref ref, - const char* const tok, - const size_t n) +tokcmp(SerdNode* const node, const char* const tok, const size_t n) { - SerdNode* node = deref(reader, ref); - if (!node || node->length != n) { - return -1; - } - - return serd_strncasecmp(serd_node_string(node), tok, n); + return ((!node || node->length != n) + ? -1 + : serd_strncasecmp(serd_node_string(node), tok, n)); } SerdStatus @@ -1615,11 +1636,11 @@ read_n3_statement(SerdReader* const reader) return st; } - if (!tokcmp(reader, ctx.subject, "base", 4)) { + if (!tokcmp(ctx.subject, "base", 4)) { st = read_base(reader, true, false); - } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) { + } else if (!tokcmp(ctx.subject, "prefix", 6)) { st = read_prefixID(reader, true, false); - } else if (!tokcmp(reader, ctx.subject, "graph", 5)) { + } else if (!tokcmp(ctx.subject, "graph", 5)) { read_ws_star(reader); TRY(st, read_labelOrSubject(reader, &ctx.graph)); read_ws_star(reader); @@ -41,8 +41,6 @@ # endif #endif -static const size_t serd_node_align = 2 * sizeof(uint64_t); - static const SerdNodeFlags meta_mask = (SERD_HAS_DATATYPE | SERD_HAS_LANGUAGE); static SerdNode* @@ -20,6 +20,7 @@ #include "serd/serd.h" #include <stddef.h> +#include <stdint.h> struct SerdNodeImpl { size_t length; ///< Length in bytes (not including null) @@ -27,6 +28,8 @@ struct SerdNodeImpl { SerdNodeType type; ///< Node type }; +static const size_t serd_node_align = 2 * sizeof(uint64_t); + static inline char* SERD_NONNULL serd_node_buffer(SerdNode* SERD_NONNULL node) { diff --git a/src/reader.c b/src/reader.c index c8b0ca16..d89ea197 100644 --- a/src/reader.c +++ b/src/reader.c @@ -44,9 +44,10 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) 
} void -set_blank_id(SerdReader* const reader, const Ref ref, const size_t buf_size) +set_blank_id(SerdReader* const reader, + SerdNode* const node, + const size_t buf_size) { - SerdNode* node = deref(reader, ref); char* buf = (char*)(node + 1); const char* prefix = reader->bprefix ? (const char*)reader->bprefix : ""; @@ -60,15 +61,18 @@ genid_size(const SerdReader* const reader) return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0 } -Ref +SerdNode* blank_id(SerdReader* const reader) { - Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); - set_blank_id(reader, ref, genid_size(reader)); + SerdNode* ref = + push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); + if (ref) { + set_blank_id(reader, ref, genid_size(reader)); + } return ref; } -Ref +SerdNode* push_node_padded(SerdReader* const reader, const size_t maxlen, const SerdNodeType type, @@ -78,6 +82,10 @@ push_node_padded(SerdReader* const reader, void* mem = serd_stack_push_aligned( &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode)); + if (!mem) { + return NULL; + } + SerdNode* const node = (SerdNode*)mem; node->length = length; @@ -88,14 +96,15 @@ push_node_padded(SerdReader* const reader, memcpy(buf, str, length + 1); #ifdef SERD_STACK_CHECK - reader->allocs = (Ref*)realloc(reader->allocs, - sizeof(reader->allocs) * (++reader->n_allocs)); - reader->allocs[reader->n_allocs - 1] = ((char*)mem - reader->stack.buf); + reader->allocs = (SerdNode**)realloc( + reader->allocs, sizeof(reader->allocs) * (++reader->n_allocs)); + reader->allocs[reader->n_allocs - 1] = + (SerdNode*)((char*)mem - reader->stack.buf); #endif - return (Ref)((char*)node - reader->stack.buf); + return node; } -Ref +SerdNode* push_node(SerdReader* const reader, const SerdNodeType type, const char* const str, @@ -104,43 +113,33 @@ push_node(SerdReader* const reader, return push_node_padded(reader, length, type, str, length); } -SERD_PURE_FUNC SerdNode* -deref(SerdReader* const 
reader, const Ref ref) +pop_node(SerdReader* const reader, const SerdNode* const node) { - return ref ? (SerdNode*)(reader->stack.buf + ref) : NULL; -} - -Ref -pop_node(SerdReader* const reader, const Ref ref) -{ - if (ref && ref != reader->rdf_first && ref != reader->rdf_rest && - ref != reader->rdf_nil) { + if (node && node != reader->rdf_first && node != reader->rdf_rest && + node != reader->rdf_nil) { #ifdef SERD_STACK_CHECK - SERD_STACK_ASSERT_TOP(reader, ref); + SERD_STACK_ASSERT_TOP(reader, node); --reader->n_allocs; #endif - SerdNode* const node = deref(reader, ref); - char* const top = reader->stack.buf + reader->stack.size; + char* const top = reader->stack.buf + reader->stack.size; serd_stack_pop_aligned(&reader->stack, (size_t)(top - (char*)node)); } - return 0; + return NULL; } SerdStatus -emit_statement(SerdReader* const reader, const ReadContext ctx, const Ref o) +emit_statement(SerdReader* const reader, + const ReadContext ctx, + SerdNode* const o) { - SerdNode* graph = deref(reader, ctx.graph); + SerdNode* graph = ctx.graph; if (!graph && reader->default_graph) { graph = reader->default_graph; } - const SerdStatus st = serd_sink_write(reader->sink, - *ctx.flags, - deref(reader, ctx.subject), - deref(reader, ctx.predicate), - deref(reader, o), - graph); + const SerdStatus st = serd_sink_write( + reader->sink, *ctx.flags, ctx.subject, ctx.predicate, o, graph); *ctx.flags &= SERD_ANON_CONT | SERD_LIST_CONT; // Preserve only cont flags return st; @@ -167,21 +166,35 @@ serd_reader_read_document(SerdReader* const reader) } SerdReader* -serd_reader_new(const SerdSyntax syntax, const SerdSink* const sink) +serd_reader_new(const SerdSyntax syntax, + const SerdSink* const sink, + const size_t stack_size) { + if (stack_size < 3 * sizeof(SerdNode) + 192 + serd_node_align) { + return NULL; + } + SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader)); me->sink = sink; me->default_graph = NULL; - me->stack = serd_stack_new(SERD_PAGE_SIZE); + me->stack = 
serd_stack_new(stack_size); me->syntax = syntax; me->next_id = 1; me->strict = true; + // Reserve a bit of space at the end of the stack to zero pad nodes + me->stack.buf_size -= serd_node_align; + me->rdf_first = push_node(me, SERD_URI, NS_RDF "first", 48); me->rdf_rest = push_node(me, SERD_URI, NS_RDF "rest", 47); me->rdf_nil = push_node(me, SERD_URI, NS_RDF "nil", 46); + // The initial stack size check should cover this + assert(me->rdf_first); + assert(me->rdf_rest); + assert(me->rdf_nil); + return me; } diff --git a/src/reader.h b/src/reader.h index d5c80373..8134dfd1 100644 --- a/src/reader.h +++ b/src/reader.h @@ -41,18 +41,13 @@ # define SERD_STACK_ASSERT_TOP(reader, ref) #endif -/* Reference to a node in the stack (we can not use pointers since the - stack may be reallocated, invalidating any pointers to elements). -*/ -typedef size_t Ref; - typedef struct { - Ref graph; - Ref subject; - Ref predicate; - Ref object; - Ref datatype; - Ref lang; + SerdNode* graph; + SerdNode* subject; + SerdNode* predicate; + SerdNode* object; + SerdNode* datatype; + SerdNode* lang; SerdStatementFlags* flags; } ReadContext; @@ -60,9 +55,9 @@ struct SerdReaderImpl { const SerdSink* sink; SerdErrorFunc error_func; void* error_handle; - Ref rdf_first; - Ref rdf_rest; - Ref rdf_nil; + SerdNode* rdf_first; + SerdNode* rdf_rest; + SerdNode* rdf_nil; SerdNode* default_graph; SerdByteSource source; SerdStack stack; @@ -74,8 +69,8 @@ struct SerdReaderImpl { bool strict; ///< True iff strict parsing bool seen_genid; #ifdef SERD_STACK_CHECK - Ref* allocs; ///< Stack of push offsets - size_t n_allocs; ///< Number of stack pushes + SerdNode** allocs; ///< Stack of push offsets + size_t n_allocs; ///< Number of stack pushes #endif }; @@ -83,14 +78,14 @@ SERD_LOG_FUNC(3, 4) SerdStatus r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); -Ref +SerdNode* push_node_padded(SerdReader* reader, size_t maxlen, SerdNodeType type, const char* str, size_t length); -Ref +SerdNode* 
push_node(SerdReader* reader, SerdNodeType type, const char* str, @@ -100,20 +95,17 @@ SERD_PURE_FUNC size_t genid_size(const SerdReader* reader); -Ref +SerdNode* blank_id(SerdReader* reader); void -set_blank_id(SerdReader* reader, Ref ref, size_t buf_size); +set_blank_id(SerdReader* reader, SerdNode* node, size_t buf_size); SerdNode* -deref(SerdReader* reader, Ref ref); - -Ref -pop_node(SerdReader* reader, Ref ref); +pop_node(SerdReader* reader, const SerdNode* node); SerdStatus -emit_statement(SerdReader* reader, ReadContext ctx, Ref o); +emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o); SerdStatus read_n3_statement(SerdReader* reader); @@ -167,13 +159,15 @@ eat_string(SerdReader* reader, const char* str, unsigned n) } static inline SerdStatus -push_byte(SerdReader* reader, Ref ref, const int c) +push_byte(SerdReader* reader, SerdNode* node, const int c) { assert(c != EOF); SERD_STACK_ASSERT_TOP(reader, ref); - char* const s = (char*)serd_stack_push(&reader->stack, 1); - SerdNode* const node = (SerdNode*)(reader->stack.buf + ref); + char* const s = (char*)serd_stack_push(&reader->stack, 1); + if (!s) { + return SERD_ERR_OVERFLOW; + } *(uint8_t*)(s - 1) = (uint8_t)c; *s = '\0'; @@ -182,12 +176,20 @@ push_byte(SerdReader* reader, Ref ref, const int c) return SERD_SUCCESS; } -static inline void -push_bytes(SerdReader* reader, Ref ref, const uint8_t* bytes, unsigned len) +static inline SerdStatus +push_bytes(SerdReader* reader, + SerdNode* ref, + const uint8_t* bytes, + unsigned len) { + if (reader->stack.buf_size < reader->stack.size + len) { + return SERD_ERR_OVERFLOW; + } + for (unsigned i = 0; i < len; ++i) { push_byte(reader, ref, bytes[i]); } + return SERD_SUCCESS; } #endif // SERD_READER_H diff --git a/src/serdi.c b/src/serdi.c index 74ad358a..61fb8f09 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -28,8 +28,10 @@ # include <io.h> #endif +#include <limits.h> #include <stdbool.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> #define 
SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg) @@ -102,6 +104,7 @@ print_usage(const char* const name, const bool error) fprintf(os, " -f Keep full URIs in input (don't qualify).\n"); fprintf(os, " -h Display this help and exit.\n"); fprintf(os, " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n"); + fprintf(os, " -k BYTES Parser stack size.\n"); fprintf(os, " -l Lax (non-strict) parsing.\n"); fprintf(os, " -o SYNTAX Output syntax: turtle/ntriples/nquads.\n"); fprintf(os, " -p PREFIX Add PREFIX to blank node IDs.\n"); @@ -175,6 +178,7 @@ main(int argc, char** argv) bool full_uris = false; bool lax = false; bool quiet = false; + size_t stack_size = 4194304; const char* add_prefix = NULL; const char* chop_prefix = NULL; const char* root_uri = NULL; @@ -223,6 +227,19 @@ main(int argc, char** argv) return print_usage(prog, true); } break; + } else if (opt == 'k') { + if (argv[a][o + 1] || ++a == argc) { + return missing_arg(prog, 'k'); + } + + char* endptr = NULL; + const long size = strtol(argv[a], &endptr, 10); + if (size <= 0 || size == LONG_MAX || *endptr != '\0') { + SERDI_ERRORF("invalid stack size `%s'\n", argv[a]); + return 1; + } + stack_size = (size_t)size; + break; } else if (opt == 'o') { if (argv[a][o + 1] || ++a == argc) { return missing_arg(prog, 'o'); @@ -294,7 +311,7 @@ main(int argc, char** argv) output_syntax, writer_flags, env, (SerdWriteFunc)fwrite, out_fd); SerdReader* const reader = - serd_reader_new(input_syntax, serd_writer_sink(writer)); + serd_reader_new(input_syntax, serd_writer_sink(writer), stack_size); serd_reader_set_strict(reader, !lax); if (quiet) { diff --git a/src/stack.h b/src/stack.h index b49d645a..f2ab836b 100644 --- a/src/stack.h +++ b/src/stack.h @@ -66,9 +66,9 @@ serd_stack_push(SerdStack* stack, size_t n_bytes) { const size_t new_size = stack->size + n_bytes; if (stack->buf_size < new_size) { - stack->buf_size += (stack->buf_size >> 1); // *= 1.5 - stack->buf = (char*)realloc(stack->buf, stack->buf_size); + return 
NULL; } + char* const ret = (stack->buf + stack->size); stack->size = new_size; return ret; @@ -85,12 +85,16 @@ static inline void* serd_stack_push_aligned(SerdStack* stack, size_t n_bytes, size_t align) { // Push one byte to ensure space for a pad count - serd_stack_push(stack, 1); + if (!serd_stack_push(stack, 1)) { + return NULL; + } // Push padding if necessary const size_t pad = align - stack->size % align; if (pad > 0) { - serd_stack_push(stack, pad); + if (!serd_stack_push(stack, pad)) { + return NULL; + } } // Set top of stack to pad count so we can properly pop later diff --git a/src/string.c b/src/string.c index f3c0e50a..3a750c4f 100644 --- a/src/string.c +++ b/src/string.c @@ -48,9 +48,11 @@ serd_strerror(const SerdStatus status) case SERD_ERR_ID_CLASH: return "Blank node ID clash"; case SERD_ERR_BAD_CURIE: - return "Invalid CURIE"; + return "Invalid CURIE or unknown namespace prefix"; case SERD_ERR_INTERNAL: return "Internal error"; + case SERD_ERR_OVERFLOW: + return "Stack overflow"; } return "Unknown error"; diff --git a/test/meson.build b/test/meson.build index 794dec2c..3ec9d38b 100644 --- a/test/meson.build +++ b/test/meson.build @@ -46,7 +46,7 @@ if get_option('utils') good_args = [ ['-v'], ['-h'], - ['-s', '<urn:eg:s> a <urn:eg:T> .'], + ['-k', '512', '-s', '<urn:eg:s> a <urn:eg:T> .'], ] foreach args : good_args @@ -61,6 +61,10 @@ if get_option('utils') ['-i', 'turtle'], ['-i'], ['-fi'], + ['-k'], + ['-k', '-1'], + ['-k', '9223372036854775807'], + ['-k', '1024junk'], ['-o', 'unknown'], ['-o'], ['-p'], diff --git a/test/test_read_chunk.c b/test/test_read_chunk.c index ce529b67..17421a52 100644 --- a/test/test_read_chunk.c +++ b/test/test_read_chunk.c @@ -85,7 +85,7 @@ main(void) serd_sink_set_statement_func(sink, on_statement); serd_sink_set_end_func(sink, on_end); - SerdReader* reader = serd_reader_new(SERD_TURTLE, sink); + SerdReader* reader = serd_reader_new(SERD_TURTLE, sink, 4096); assert(reader); assert(!serd_reader_start_string(reader, 
diff --git a/test/test_reader_writer.c b/test/test_reader_writer.c index d5669db0..62b77f1b 100644 --- a/test/test_reader_writer.c +++ b/test/test_reader_writer.c @@ -100,7 +100,7 @@ test_read_chunks(void) FILE* const f = tmpfile(); static const char null = 0; SerdSink* sink = serd_sink_new(rt, NULL); - SerdReader* reader = serd_reader_new(SERD_TURTLE, sink); + SerdReader* reader = serd_reader_new(SERD_TURTLE, sink, 4096); assert(reader); assert(sink); @@ -160,7 +160,7 @@ test_read_string(void) { ReaderTest* rt = (ReaderTest*)calloc(1, sizeof(ReaderTest)); SerdSink* sink = serd_sink_new(rt, NULL); - SerdReader* reader = serd_reader_new(SERD_TURTLE, sink); + SerdReader* reader = serd_reader_new(SERD_TURTLE, sink, 4096); assert(reader); assert(sink); @@ -275,14 +275,17 @@ test_writer(const char* const path) static void test_reader(const char* path) { - ReaderTest rt = {0, NULL}; - SerdSink* const sink = serd_sink_new(&rt, NULL); - SerdReader* reader = serd_reader_new(SERD_TURTLE, sink); - - assert(reader); + ReaderTest rt = {0, NULL}; + SerdSink* const sink = serd_sink_new(&rt, NULL); assert(sink); serd_sink_set_statement_func(sink, test_sink); + // Test that too little stack space fails gracefully + assert(!serd_reader_new(SERD_TURTLE, sink, 32)); + + SerdReader* reader = serd_reader_new(SERD_TURTLE, sink, 4096); + assert(reader); + SerdNode* g = serd_new_uri(SERD_STRING("http://example.org/")); serd_reader_set_default_graph(reader, g); serd_reader_add_blank_prefix(reader, "tmp"); diff --git a/test/test_string.c b/test/test_string.c index 842ff3df..a3fb9247 100644 --- a/test/test_string.c +++ b/test/test_string.c @@ -39,7 +39,7 @@ test_strerror(void) { const char* msg = serd_strerror(SERD_SUCCESS); assert(!strcmp(msg, "Success")); - for (int i = SERD_FAILURE; i <= SERD_ERR_INTERNAL; ++i) { + for (int i = SERD_FAILURE; i <= SERD_ERR_OVERFLOW; ++i) { msg = serd_strerror((SerdStatus)i); assert(strcmp(msg, "Success")); } |