diff options
author | David Robillard <d@drobilla.net> | 2018-05-12 20:39:23 +0200 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2020-10-27 13:13:58 +0100 |
commit | 95f7334bbc02f3e75b33b9914eee58c69f1588bd (patch) | |
tree | bed06e929f286d099181d5e2e76ecf508df0add9 | |
parent | 4f4408029cd8cefc1804d75a03f5c3f0ee7922fa (diff) | |
download | serd-95f7334bbc02f3e75b33b9914eee58c69f1588bd.tar.gz serd-95f7334bbc02f3e75b33b9914eee58c69f1588bd.tar.bz2 serd-95f7334bbc02f3e75b33b9914eee58c69f1588bd.zip |
Use a fixed-size reader stack
This improves performance, and makes the reader more suitable for embedded or
network-facing applications, at the cost of requiring the user to specify a
maximum stack size.
-rw-r--r-- | NEWS | 1 | ||||
-rw-r--r-- | doc/serdi.1 | 4 | ||||
-rw-r--r-- | serd/serd.h | 8 | ||||
-rw-r--r-- | src/n3.c | 237 | ||||
-rw-r--r-- | src/reader.c | 58 | ||||
-rw-r--r-- | src/reader.h | 67 | ||||
-rw-r--r-- | src/serdi.c | 19 | ||||
-rw-r--r-- | src/stack.h | 11 | ||||
-rw-r--r-- | src/string.c | 1 | ||||
-rw-r--r-- | src/writer.c | 6 | ||||
-rw-r--r-- | tests/read_chunk_test.c | 2 | ||||
-rw-r--r-- | tests/serd_test.c | 8 | ||||
-rw-r--r-- | wscript | 6 |
13 files changed, 240 insertions, 188 deletions
@@ -8,6 +8,7 @@ serd (1.0.1) unstable; * Remove half-baked serd_uri_to_path() * Remove useless character counting from API * Rename SerdChunk to SerdStringView + * Use a fixed-size reader stack * Use char* for strings in public API -- David Robillard <d@drobilla.net> Sat, 19 Jan 2019 12:31:12 +0000 diff --git a/doc/serdi.1 b/doc/serdi.1 index fefafd67..a246ff16 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -40,6 +40,10 @@ Read input as \fISYNTAX\fR. Valid values (case-insensitive): \*(lqturtle\*(rq, \*(lqntriples\*(rq, \*(lqtrig\*(rq, \*(lqnquads\*(rq. .TP +.BR \-k " " \fIBYTES\fR +Parser stack size. + +.TP .BR \-l Lax (non-strict) parsing. diff --git a/serd/serd.h b/serd/serd.h index bf1ac8d5..8aa97ed4 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -102,7 +102,8 @@ typedef enum { SERD_ERR_NOT_FOUND, /**< Not found */ SERD_ERR_ID_CLASH, /**< Encountered clashing blank node IDs */ SERD_ERR_BAD_CURIE, /**< Invalid CURIE (e.g. prefix does not exist) */ - SERD_ERR_INTERNAL /**< Unexpected internal error (should not happen) */ + SERD_ERR_INTERNAL, /**< Unexpected internal error (should not happen) */ + SERD_ERR_OVERFLOW /**< Stack overflow */ } SerdStatus; /** @@ -903,7 +904,10 @@ serd_env_foreach(const SerdEnv* env, */ SERD_API SerdReader* -serd_reader_new(SerdWorld* world, SerdSyntax syntax, const SerdSink* sink); +serd_reader_new(SerdWorld* world, + SerdSyntax syntax, + const SerdSink* sink, + size_t stack_size); /** Enable or disable strict parsing. @@ -41,7 +41,7 @@ fancy_syntax(const SerdReader* reader) } static SerdStatus -read_collection(SerdReader* reader, ReadContext ctx, Ref* dest); +read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest); static SerdStatus read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); @@ -60,7 +60,7 @@ read_HEX(SerdReader* reader) // Read UCHAR escape, initial \ is already eaten by caller static inline SerdStatus -read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) +read_UCHAR(SerdReader* reader, SerdNode* dest, uint32_t* char_code) { const int b = peek_byte(reader); unsigned length = 0; @@ -99,9 +99,9 @@ read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) } else { r_err(reader, SERD_ERR_BAD_SYNTAX, "unicode character 0x%X out of range\n", code); - push_bytes(reader, dest, replacement_char, 3); *char_code = 0xFFFD; - return SERD_SUCCESS; + const SerdStatus st = push_bytes(reader, dest, replacement_char, 3); + return st ? st : SERD_SUCCESS; } // Build output in buf @@ -130,42 +130,35 @@ read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code) break; } - push_bytes(reader, dest, buf, size); *char_code = code; - return SERD_SUCCESS; + return push_bytes(reader, dest, buf, size); } // Read ECHAR escape, initial \ is already eaten by caller static inline SerdStatus -read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags) +read_ECHAR(SerdReader* reader, SerdNode* dest, SerdNodeFlags* flags) { const int c = peek_byte(reader); switch (c) { case 't': eat_byte_safe(reader, 't'); - push_byte(reader, dest, '\t'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\t'); case 'b': eat_byte_safe(reader, 'b'); - push_byte(reader, dest, '\b'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\b'); case 'n': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'n'); - push_byte(reader, dest, '\n'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\n'); case 'r': *flags |= SERD_HAS_NEWLINE; eat_byte_safe(reader, 'r'); - push_byte(reader, dest, '\r'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\r'); case 'f': eat_byte_safe(reader, 'f'); - push_byte(reader, dest, '\f'); - return SERD_SUCCESS; + return push_byte(reader, dest, '\f'); case '\\': case '"': case '\'': - push_byte(reader, dest, eat_byte_safe(reader, c)); - return SERD_SUCCESS; + return push_byte(reader, dest, eat_byte_safe(reader, c)); default: return SERD_ERR_BAD_SYNTAX; } @@ -208,21 +201,21 @@ read_utf8_bytes(SerdReader* reader, uint8_t bytes[4], uint32_t* size, uint8_t c) } static SerdStatus -read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) +read_utf8_character(SerdReader* reader, SerdNode* dest, uint8_t c) { uint32_t size = 0; uint8_t bytes[4] = {0, 0, 0, 0}; SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); if (st) { push_bytes(reader, dest, replacement_char, 3); - } else { - push_bytes(reader, dest, bytes, size); + return st; } - return st; + + return push_bytes(reader, dest, bytes, size); } static SerdStatus -read_utf8_code(SerdReader* reader, Ref dest, uint32_t* code, uint8_t c) +read_utf8_code(SerdReader* reader, SerdNode* dest, uint32_t* code, uint8_t c) { uint32_t size = 0; uint8_t bytes[4] = {0, 0, 0, 0}; @@ -232,15 +225,17 @@ read_utf8_code(SerdReader* reader, Ref dest, uint32_t* code, uint8_t c) return st; } - push_bytes(reader, dest, bytes, size); - *code = parse_counted_utf8_char(bytes, size); + if (!(st = push_bytes(reader, dest, bytes, size))) { + *code = parse_counted_utf8_char(bytes, size); + } + return st; } // Read one character (possibly multi-byte) // The first byte, c, has already been eaten by caller static inline SerdStatus -read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c) +read_character(SerdReader* reader, SerdNode* dest, SerdNodeFlags* flags, uint8_t c) { if (!(c & 0x80)) { switch (c) { @@ -253,6 +248,7 @@ read_character(SerdReader* reader, Ref dest, SerdNodeFlags* flags, uint8_t c) default: break; } + return push_byte(reader, dest, c); } return read_utf8_character(reader, dest, c); @@ -314,7 +310,7 @@ eat_delim(SerdReader* reader, const char delim) // Initial triple quotes are already eaten by caller static SerdStatus read_STRING_LITERAL_LONG(SerdReader* reader, - Ref ref, + SerdNode* ref, SerdNodeFlags* flags, uint8_t q) { @@ -357,7 +353,7 @@ read_STRING_LITERAL_LONG(SerdReader* reader, // Initial quote is already eaten by caller static SerdStatus read_STRING_LITERAL(SerdReader* reader, - Ref ref, + SerdNode* ref, SerdNodeFlags* flags, uint8_t q) { @@ -397,7 +393,7 @@ read_STRING_LITERAL(SerdReader* reader, } static SerdStatus -read_String(SerdReader* reader, Ref node, SerdNodeFlags* flags) +read_String(SerdReader* reader, SerdNode* node, SerdNodeFlags* flags) { const int q1 = peek_byte(reader); eat_byte_safe(reader, q1); @@ -438,7 +434,7 @@ is_PN_CHARS_BASE(const uint32_t c) } static SerdStatus -read_PN_CHARS_BASE(SerdReader* reader, Ref dest) +read_PN_CHARS_BASE(SerdReader* reader, SerdNode* dest) { uint32_t code = 0; const int c = peek_byte(reader); @@ -468,7 +464,7 @@ is_PN_CHARS(const uint32_t c) } static SerdStatus -read_PN_CHARS(SerdReader* reader, Ref dest) +read_PN_CHARS(SerdReader* reader, SerdNode* dest) { uint32_t code = 0; const int c = peek_byte(reader); @@ -488,7 +484,7 @@ read_PN_CHARS(SerdReader* reader, Ref dest) } static SerdStatus -read_PERCENT(SerdReader* reader, Ref dest) +read_PERCENT(SerdReader* reader, SerdNode* dest) { push_byte(reader, dest, eat_byte_safe(reader, '%')); const uint8_t h1 = read_HEX(reader); @@ -501,7 +497,7 @@ read_PERCENT(SerdReader* reader, Ref dest) } static SerdStatus -read_PN_LOCAL_ESC(SerdReader* reader, Ref dest) +read_PN_LOCAL_ESC(SerdReader* reader, SerdNode* dest) { eat_byte_safe(reader, '\\'); @@ -537,7 +533,7 @@ read_PN_LOCAL_ESC(SerdReader* reader, Ref dest) } static SerdStatus -read_PLX(SerdReader* reader, Ref dest) +read_PLX(SerdReader* reader, SerdNode* dest) { const int c = peek_byte(reader); switch (c) { @@ -551,7 +547,7 @@ read_PLX(SerdReader* reader, Ref dest) } static SerdStatus -read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) +read_PN_LOCAL(SerdReader* reader, SerdNode* dest, bool* ate_dot) { int c = peek_byte(reader); SerdStatus st = SERD_SUCCESS; @@ -580,10 +576,9 @@ read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) trailing_unescaped_dot = (c == '.'); } - SerdNode* const n = deref(reader, dest); if (trailing_unescaped_dot) { // Ate trailing dot, pop it from stack/node and inform caller - --n->n_bytes; + --dest->n_bytes; serd_stack_pop(&reader->stack, 1); *ate_dot = true; } @@ -593,7 +588,7 @@ read_PN_LOCAL(SerdReader* reader, Ref dest, bool* ate_dot) // Read the remainder of a PN_PREFIX after some initial characters static SerdStatus -read_PN_PREFIX_tail(SerdReader* reader, Ref dest) +read_PN_PREFIX_tail(SerdReader* reader, SerdNode* dest) { int c = 0; while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* @@ -604,8 +599,7 @@ read_PN_PREFIX_tail(SerdReader* reader, Ref dest) } } - const SerdNode* const n = deref(reader, dest); - if (serd_node_string(n)[n->n_bytes - 1] == '.' && + if (serd_node_string(dest)[dest->n_bytes - 1] == '.' && read_PN_CHARS(reader, dest)) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "prefix ends with `.'\n"); } @@ -614,7 +608,7 @@ read_PN_PREFIX_tail(SerdReader* reader, Ref dest) } static SerdStatus -read_PN_PREFIX(SerdReader* reader, Ref dest) +read_PN_PREFIX(SerdReader* reader, SerdNode* dest) { if (!read_PN_CHARS_BASE(reader, dest)) { return read_PN_PREFIX_tail(reader, dest); @@ -623,14 +617,16 @@ read_PN_PREFIX(SerdReader* reader, Ref dest) } static SerdStatus -read_LANGTAG(SerdReader* reader, Ref* dest) +read_LANGTAG(SerdReader* reader, SerdNode** dest) { int c = peek_byte(reader); if (!is_alpha(c)) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `%c'\n", c); } - *dest = push_node(reader, SERD_LITERAL, "", 0); + if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + return SERD_ERR_OVERFLOW; + } SerdStatus st = SERD_SUCCESS; TRY(st, push_byte(reader, *dest, eat_byte_safe(reader, c))); @@ -647,7 +643,7 @@ read_LANGTAG(SerdReader* reader, Ref* dest) } static SerdStatus -read_IRIREF_scheme(SerdReader* reader, Ref dest) +read_IRIREF_scheme(SerdReader* reader, SerdNode* dest) { int c = peek_byte(reader); if (!is_alpha(c)) { @@ -675,7 +671,7 @@ read_IRIREF_scheme(SerdReader* reader, Ref dest) } static SerdStatus -read_IRIREF(SerdReader* reader, Ref* dest) +read_IRIREF(SerdReader* reader, SerdNode** dest) { if (!eat_byte_check(reader, '<')) { return SERD_ERR_BAD_SYNTAX; @@ -747,7 +743,7 @@ read_IRIREF(SerdReader* reader, Ref* dest) } static SerdStatus -read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot) +read_PrefixedName(SerdReader* reader, SerdNode* dest, bool read_prefix, bool* ate_dot) { SerdStatus st = SERD_SUCCESS; if (read_prefix && ((st = read_PN_PREFIX(reader, dest)) > SERD_FAILURE)) { @@ -764,7 +760,7 @@ read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix, bool* ate_dot) } static SerdStatus -read_0_9(SerdReader* reader, Ref str, bool at_least_one) +read_0_9(SerdReader* reader, SerdNode* str, bool at_least_one) { unsigned count = 0; SerdStatus st = SERD_SUCCESS; @@ -779,8 +775,8 @@ read_0_9(SerdReader* reader, Ref str, bool at_least_one) static SerdStatus read_number(SerdReader* reader, - Ref* dest, - Ref* datatype, + SerdNode** dest, + SerdNode** datatype, SerdNodeFlags* flags, bool* ate_dot) { @@ -793,9 +789,12 @@ read_number(SerdReader* reader, SerdStatus st = SERD_SUCCESS; int c = peek_byte(reader); bool has_decimal = false; - if (c == '-' || c == '+') { + if (!*dest) { + return SERD_ERR_OVERFLOW; + } else if (c == '-' || c == '+') { push_byte(reader, *dest, eat_byte_safe(reader, c)); } + if ((c = peek_byte(reader)) == '.') { has_decimal = true; // decimal case 2 (e.g. '.0' or `-.0' or `+.0') @@ -847,20 +846,26 @@ read_number(SerdReader* reader, } static SerdStatus -read_iri(SerdReader* reader, Ref* dest, bool* ate_dot) +read_iri(SerdReader* reader, SerdNode** dest, bool* ate_dot) { switch (peek_byte(reader)) { case '<': return read_IRIREF(reader, dest); default: - *dest = push_node(reader, SERD_CURIE, "", 0); + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } return read_PrefixedName(reader, *dest, true, ate_dot); } } static SerdStatus -read_literal(SerdReader* reader, Ref* dest, - Ref* datatype, Ref* lang, SerdNodeFlags* flags, bool* ate_dot) +read_literal(SerdReader* reader, + SerdNode** dest, + SerdNode** datatype, + SerdNode** lang, + SerdNodeFlags* flags, + bool* ate_dot) { *dest = push_node(reader, SERD_LITERAL, "", 0); @@ -897,7 +902,7 @@ read_literal(SerdReader* reader, Ref* dest, } static SerdStatus -read_verb(SerdReader* reader, Ref* dest) +read_verb(SerdReader* reader, SerdNode** dest) { if (peek_byte(reader) == '<') { return read_IRIREF(reader, dest); @@ -906,11 +911,13 @@ read_verb(SerdReader* reader, Ref* dest) /* Either a qname, or "a". Read the prefix first, and if it is in fact "a", produce that instead. */ - *dest = push_node(reader, SERD_CURIE, "", 0); + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } SerdStatus st = read_PN_PREFIX(reader, *dest); bool ate_dot = false; - SerdNode* node = deref(reader, *dest); + SerdNode* node = *dest; const int next = peek_byte(reader); if (!st && node->n_bytes == 1 && serd_node_string(node)[0] == 'a' && @@ -929,12 +936,12 @@ read_verb(SerdReader* reader, Ref* dest) } static SerdStatus -read_BLANK_NODE_LABEL(SerdReader* reader, Ref* dest, bool* ate_dot) +read_BLANK_NODE_LABEL(SerdReader* reader, SerdNode** dest, bool* ate_dot) { eat_byte_safe(reader, '_'); eat_byte_check(reader, ':'); - const Ref ref = *dest = + SerdNode* n = *dest = push_node(reader, SERD_BLANK, reader->bprefix ? reader->bprefix : "", @@ -942,23 +949,22 @@ read_BLANK_NODE_LABEL(SerdReader* reader, Ref* dest, bool* ate_dot) int c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9]) if (is_digit(c) || c == '_') { - push_byte(reader, ref, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, ref)) { - *dest = pop_node(reader, *dest); + push_byte(reader, n, eat_byte_safe(reader, c)); + } else if (read_PN_CHARS(reader, n)) { + *dest = pop_node(reader, n); return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start\n"); } while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')* if (c == '.') { - push_byte(reader, ref, eat_byte_safe(reader, c)); - } else if (read_PN_CHARS(reader, ref)) { + push_byte(reader, n, eat_byte_safe(reader, c)); + } else if (read_PN_CHARS(reader, n)) { break; } } - SerdNode* n = deref(reader, ref); - char* buf = serd_node_buffer(n); - if (buf[n->n_bytes - 1] == '.' && read_PN_CHARS(reader, ref)) { + char* buf = serd_node_buffer(n); + if (buf[n->n_bytes - 1] == '.' && read_PN_CHARS(reader, n)) { // Ate trailing dot, pop it from stack/node and inform caller --n->n_bytes; serd_stack_pop(&reader->stack, 1); @@ -972,7 +978,7 @@ read_BLANK_NODE_LABEL(SerdReader* reader, Ref* dest, bool* ate_dot) reader->seen_genid = true; } else if (reader->seen_genid && buf[reader->bprefix_len] == 'B') { - *dest = pop_node(reader, *dest); + *dest = pop_node(reader, n); return r_err( reader, SERD_ERR_ID_CLASH, "found both `b' and `B' blank IDs, prefix required\n"); @@ -982,24 +988,24 @@ read_BLANK_NODE_LABEL(SerdReader* reader, Ref* dest, bool* ate_dot) return SERD_SUCCESS; } -static Ref +static SerdNode* read_blankName(SerdReader* reader) { eat_byte_safe(reader, '='); if (eat_byte_check(reader, '=') != '=') { r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `='\n"); - return 0; + return NULL; } - Ref subject = 0; - bool ate_dot = false; + SerdNode* subject = 0; + bool ate_dot = false; read_ws_star(reader); read_iri(reader, &subject, &ate_dot); return subject; } static SerdStatus -read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) +read_anon(SerdReader* reader, ReadContext ctx, bool subject, SerdNode** dest) { const SerdStatementFlags old_flags = *ctx.flags; bool empty = false; @@ -1038,7 +1044,7 @@ read_anon(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) } read_ws_star(reader); if (reader->sink->end) { - reader->sink->end(reader->sink->handle, deref(reader, *dest)); + reader->sink->end(reader->sink->handle, *dest); } *ctx.flags = old_flags; } @@ -1062,10 +1068,9 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) SerdStatus ret = SERD_FAILURE; bool simple = (ctx->subject != 0); - SerdNode* node = NULL; - Ref o = 0; - Ref datatype = 0; - Ref lang = 0; + SerdNode* o = 0; + SerdNode* datatype = 0; + SerdNode* lang = 0; uint32_t flags = 0; const int c = peek_byte(reader); if (!fancy_syntax(reader)) { @@ -1105,15 +1110,17 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) /* Either a boolean literal, or a qname. Read the prefix first, and if it is in fact a "true" or "false" literal, produce that instead. */ - o = push_node(reader, SERD_CURIE, "", 0); + if (!(o = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } + while (!read_PN_CHARS_BASE(reader, o)) {} - node = deref(reader, o); - if ((node->n_bytes == 4 && - !memcmp(serd_node_string(node), "true", 4)) || - (node->n_bytes == 5 && - !memcmp(serd_node_string(node), "false", 5))) { + if ((o->n_bytes == 4 && + !memcmp(serd_node_string(o), "true", 4)) || + (o->n_bytes == 5 && + !memcmp(serd_node_string(o), "false", 5))) { flags = flags | SERD_HAS_DATATYPE; - node->type = SERD_LITERAL; + o->type = SERD_LITERAL; datatype = push_node( reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN); ret = SERD_SUCCESS; @@ -1129,7 +1136,7 @@ read_object(SerdReader* reader, ReadContext* ctx, bool emit, bool* ate_dot) } if (!ret && simple && o) { - deref(reader, o)->flags = flags; + o->flags = flags; } if (!ret && emit && simple) { @@ -1204,7 +1211,11 @@ read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot) } static SerdStatus -end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, SerdStatus st) +end_collection(SerdReader* reader, + ReadContext ctx, + SerdNode* n1, + SerdNode* n2, + SerdStatus st) { pop_node(reader, n2); pop_node(reader, n1); @@ -1217,7 +1228,7 @@ end_collection(SerdReader* reader, ReadContext ctx, Ref n1, Ref n2, SerdStatus s } static SerdStatus -read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) +read_collection(SerdReader* reader, ReadContext ctx, SerdNode** dest) { SerdStatus st = SERD_SUCCESS; eat_byte_safe(reader, '('); @@ -1238,10 +1249,14 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) /* The order of node allocation here is necessarily not in stack order, so we create two nodes and recycle them throughout. */ - Ref n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); - Ref n2 = 0; - Ref node = n1; - Ref rest = 0; + SerdNode* n1 = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); + SerdNode* n2 = 0; + SerdNode* node = n1; + SerdNode* rest = 0; + + if (!n1) { + return SERD_ERR_OVERFLOW; + } ctx.subject = *dest; while (!peek_delim(reader, ')')) { @@ -1276,7 +1291,7 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) } static SerdStatus -read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, int* s_type) +read_subject(SerdReader* reader, ReadContext ctx, SerdNode** dest, int* s_type) { SerdStatus st = SERD_SUCCESS; bool ate_dot = false; @@ -1303,7 +1318,7 @@ read_subject(SerdReader* reader, ReadContext ctx, Ref* dest, int* s_type) } static SerdStatus -read_labelOrSubject(SerdReader* reader, Ref* dest) +read_labelOrSubject(SerdReader* reader, SerdNode** dest) { bool ate_dot = false; switch (peek_byte(reader)) { @@ -1356,10 +1371,10 @@ read_base(SerdReader* reader, bool sparql, bool token) read_ws_star(reader); - Ref uri = 0; + SerdNode* uri = NULL; TRY(st, read_IRIREF(reader, &uri)); if (reader->sink->base) { - TRY(st, reader->sink->base(reader->sink->handle, deref(reader, uri))); + TRY(st, reader->sink->base(reader->sink->handle, uri)); } pop_node(reader, uri); @@ -1382,8 +1397,10 @@ read_prefixID(SerdReader* reader, bool sparql, bool token) } read_ws_star(reader); - Ref name = push_node(reader, SERD_LITERAL, "", 0); - if ((st = read_PN_PREFIX(reader, name)) > SERD_FAILURE) { + SerdNode* name = push_node(reader, SERD_LITERAL, "", 0); + if (!name) { + return SERD_ERR_OVERFLOW; + } else if ((st = read_PN_PREFIX(reader, name)) > SERD_FAILURE) { return st; } @@ -1393,13 +1410,11 @@ read_prefixID(SerdReader* reader, bool sparql, bool token) } read_ws_star(reader); - Ref uri = 0; + SerdNode* uri = NULL; TRY(st, read_IRIREF(reader, &uri)); if (reader->sink->prefix) { - st = reader->sink->prefix(reader->sink->handle, - deref(reader, name), - deref(reader, uri)); + st = reader->sink->prefix(reader->sink->handle, name, uri); } pop_node(reader, uri); pop_node(reader, name); @@ -1469,13 +1484,11 @@ read_wrappedGraph(SerdReader* reader, ReadContext* ctx) } static int -tokcmp(SerdReader* reader, Ref ref, const char* tok, size_t n) +tokcmp(SerdNode* node, const char* tok, size_t n) { - SerdNode* node = deref(reader, ref); - if (!node || node->n_bytes != n) { - return -1; - } - return serd_strncasecmp(serd_node_string(node), tok, n); + return ((!node || node->n_bytes != n) + ? -1 + : serd_strncasecmp(serd_node_string(node), tok, n)); } SerdStatus @@ -1516,11 +1529,11 @@ read_n3_statement(SerdReader* reader) return st; } - if (!tokcmp(reader, ctx.subject, "base", 4)) { + if (!tokcmp(ctx.subject, "base", 4)) { st = read_base(reader, true, false); - } else if (!tokcmp(reader, ctx.subject, "prefix", 6)) { + } else if (!tokcmp(ctx.subject, "prefix", 6)) { st = read_prefixID(reader, true, false); - } else if (!tokcmp(reader, ctx.subject, "graph", 5)) { + } else if (!tokcmp(ctx.subject, "graph", 5)) { read_ws_star(reader); TRY(st, read_labelOrSubject(reader, &ctx.graph)); read_ws_star(reader); diff --git a/src/reader.c b/src/reader.c index 95031e0c..bb71be83 100644 --- a/src/reader.c +++ b/src/reader.c @@ -41,9 +41,8 @@ r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...) } void -set_blank_id(SerdReader* reader, Ref ref, size_t buf_size) +set_blank_id(SerdReader* reader, SerdNode* node, size_t buf_size) { - SerdNode* node = deref(reader, ref); char* buf = (char*)(node + 1); const char* prefix = reader->bprefix ? (const char*)reader->bprefix : ""; @@ -57,21 +56,28 @@ genid_size(SerdReader* reader) return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0 } -Ref +SerdNode* blank_id(SerdReader* reader) { - Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0); - set_blank_id(reader, ref, genid_size(reader)); + SerdNode* ref = push_node_padded( + reader, genid_size(reader), SERD_BLANK, "", 0); + if (ref) { + set_blank_id(reader, ref, genid_size(reader)); + } return ref; } -Ref +SerdNode* push_node_padded(SerdReader* reader, size_t maxlen, SerdType type, const char* str, size_t n_bytes) { void* mem = serd_stack_push_aligned( &reader->stack, sizeof(SerdNode) + maxlen + 1, sizeof(SerdNode)); + if (!mem) { + return NULL; + } + SerdNode* const node = (SerdNode*)mem; node->n_bytes = n_bytes; node->flags = 0; @@ -85,41 +91,34 @@ push_node_padded(SerdReader* reader, size_t maxlen, reader->allocs, sizeof(reader->allocs) * (++reader->n_allocs)); reader->allocs[reader->n_allocs - 1] = (mem - reader->stack.buf); #endif - return (Ref)((char*)node - reader->stack.buf); + return node; } -Ref +SerdNode* push_node(SerdReader* reader, SerdType type, const char* str, size_t n_bytes) { return push_node_padded(reader, n_bytes, type, str, n_bytes); } SerdNode* -deref(SerdReader* reader, const Ref ref) -{ - return ref ? (SerdNode*)(reader->stack.buf + ref) : NULL; -} - -Ref -pop_node(SerdReader* reader, Ref ref) +pop_node(SerdReader* reader, const SerdNode* node) { - if (ref && ref != reader->rdf_first && ref != reader->rdf_rest - && ref != reader->rdf_nil) { + if (node && node != reader->rdf_first && node != reader->rdf_rest + && node != reader->rdf_nil) { #ifdef SERD_STACK_CHECK - SERD_STACK_ASSERT_TOP(reader, ref); + SERD_STACK_ASSERT_TOP(reader, node); --reader->n_allocs; #endif - SerdNode* const node = deref(reader, ref); - char* const top = reader->stack.buf + reader->stack.size; + char* const top = reader->stack.buf + reader->stack.size; serd_stack_pop_aligned(&reader->stack, (size_t)(top - (char*)node)); } - return 0; + return NULL; } SerdStatus -emit_statement(SerdReader* reader, ReadContext ctx, Ref o) +emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o) { - SerdNode* graph = deref(reader, ctx.graph); + SerdNode* graph = ctx.graph; if (!graph && reader->default_graph) { graph = reader->default_graph; } @@ -130,9 +129,9 @@ emit_statement(SerdReader* reader, ReadContext ctx, Ref o) : reader->sink->statement(reader->sink->handle, *ctx.flags, graph, - deref(reader, ctx.subject), - deref(reader, ctx.predicate), - deref(reader, o)); + ctx.subject, + ctx.predicate, + o); *ctx.flags &= SERD_ANON_CONT|SERD_LIST_CONT; // Preserve only cont flags return st; @@ -159,14 +158,17 @@ serd_reader_read_document(SerdReader* reader) } SerdReader* -serd_reader_new(SerdWorld* world, SerdSyntax syntax, const SerdSink* sink) +serd_reader_new(SerdWorld* world, + SerdSyntax syntax, + const SerdSink* sink, + size_t stack_size) { SerdReader* me = (SerdReader*)calloc(1, sizeof(SerdReader)); me->world = world; me->sink = sink; me->default_graph = NULL; - me->stack = serd_stack_new(SERD_PAGE_SIZE); + me->stack = serd_stack_new(stack_size); me->syntax = syntax; me->next_id = 1; me->strict = true; diff --git a/src/reader.h b/src/reader.h index d3d27f39..4925dbda 100644 --- a/src/reader.h +++ b/src/reader.h @@ -41,18 +41,13 @@ # define SERD_STACK_ASSERT_TOP(reader, ref) #endif -/* Reference to a node in the stack (we can not use pointers since the - stack may be reallocated, invalidating any pointers to elements). -*/ -typedef size_t Ref; - typedef struct { - Ref graph; - Ref subject; - Ref predicate; - Ref object; - Ref datatype; - Ref lang; + SerdNode* graph; + SerdNode* subject; + SerdNode* predicate; + SerdNode* object; + SerdNode* datatype; + SerdNode* lang; SerdStatementFlags* flags; } ReadContext; @@ -61,9 +56,9 @@ struct SerdReaderImpl { const SerdSink* sink; SerdErrorSink error_sink; void* error_handle; - Ref rdf_first; - Ref rdf_rest; - Ref rdf_nil; + SerdNode* rdf_first; + SerdNode* rdf_rest; + SerdNode* rdf_nil; SerdNode* default_graph; SerdByteSource source; SerdStack stack; @@ -75,8 +70,8 @@ struct SerdReaderImpl { bool strict; ///< True iff strict parsing bool seen_genid; #ifdef SERD_STACK_CHECK - Ref* allocs; ///< Stack of push offsets - size_t n_allocs; ///< Number of stack pushes + SerdNode** allocs; ///< Stack of push offsets + size_t n_allocs; ///< Number of stack pushes #endif }; @@ -84,27 +79,25 @@ SERD_LOG_FUNC(3, 4) SerdStatus r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); -Ref push_node_padded(SerdReader* reader, +SerdNode* push_node_padded(SerdReader* reader, size_t maxlen, SerdType type, const char* str, size_t n_bytes); -Ref push_node(SerdReader* reader, - SerdType type, - const char* str, - size_t n_bytes); - -size_t genid_size(SerdReader* reader); -Ref blank_id(SerdReader* reader); -void set_blank_id(SerdReader* reader, Ref ref, size_t buf_size); +SerdNode* push_node(SerdReader* reader, + SerdType type, + const char* str, + size_t n_bytes); -SerdNode* deref(SerdReader* reader, Ref ref); +size_t genid_size(SerdReader* reader); +SerdNode* blank_id(SerdReader* reader); +void set_blank_id(SerdReader* reader, SerdNode* node, size_t buf_size); -Ref pop_node(SerdReader* reader, Ref ref); +SerdNode* pop_node(SerdReader* reader, const SerdNode* node); SerdStatus -emit_statement(SerdReader* reader, ReadContext ctx, Ref o); +emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o); SerdStatus read_n3_statement(SerdReader* reader); SerdStatus read_nquadsDoc(SerdReader* reader); @@ -154,25 +147,33 @@ eat_string(SerdReader* reader, const char* str, unsigned n) } static inline SerdStatus -push_byte(SerdReader* reader, Ref ref, const int c) +push_byte(SerdReader* reader, SerdNode* node, const int c) { assert(c != EOF); SERD_STACK_ASSERT_TOP(reader, ref); - char* const s = (char*)serd_stack_push(&reader->stack, 1); - SerdNode* const node = (SerdNode*)(reader->stack.buf + ref); + char* const s = (char*)serd_stack_push(&reader->stack, 1); + if (!s) { + return SERD_ERR_OVERFLOW; + } + ++node->n_bytes; *(s - 1) = (uint8_t)c; *s = '\0'; return SERD_SUCCESS; } -static inline void -push_bytes(SerdReader* reader, Ref ref, const uint8_t* bytes, unsigned len) +static inline SerdStatus +push_bytes(SerdReader* reader, SerdNode* ref, const uint8_t* bytes, unsigned len) { + if (reader->stack.buf_size < reader->stack.size + len) { + return SERD_ERR_OVERFLOW; + } + for (unsigned i = 0; i < len; ++i) { push_byte(reader, ref, bytes[i]); } + return SERD_SUCCESS; } #endif // SERD_READER_H diff --git a/src/serdi.c b/src/serdi.c index ef620c3c..fcb4274e 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -25,8 +25,10 @@ #include <io.h> #endif +#include <limits.h> #include <stdbool.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> #define SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg) @@ -98,6 +100,7 @@ print_usage(const char* name, bool error) fprintf(os, " -f Keep full URIs in input (don't qualify).\n"); fprintf(os, " -h Display this help and exit.\n"); fprintf(os, " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n"); + fprintf(os, " -k BYTES Parser stack size.\n"); fprintf(os, " -l Lax (non-strict) parsing.\n"); fprintf(os, " -o SYNTAX Output syntax: turtle/ntriples/nquads.\n"); fprintf(os, " -p PREFIX Add PREFIX to blank node IDs.\n"); @@ -140,6 +143,7 @@ main(int argc, char** argv) bool full_uris = false; bool lax = false; bool quiet = false; + size_t stack_size = 4194304; const char* add_prefix = NULL; const char* chop_prefix = NULL; const char* root_uri = NULL; @@ -174,6 +178,17 @@ main(int argc, char** argv) } else if (!(input_syntax = get_syntax(argv[a]))) { return print_usage(argv[0], true); } + } else if (argv[a][1] == 'k') { + if (++a == argc) { + return missing_arg(argv[0], 'k'); + } + char* endptr = NULL; + const long size = strtol(argv[a], &endptr, 10); + if (size <= 0 || size == LONG_MAX || *endptr != '\0') { + SERDI_ERRORF("invalid stack size `%s'\n", argv[a]); + return 1; + } + stack_size = (size_t)size; } else if (argv[a][1] == 'o') { if (++a == argc) { return missing_arg(argv[0], 'o'); @@ -262,8 +277,8 @@ main(int argc, char** argv) (SerdWriteFunc)fwrite, out_fd); - SerdReader* reader = - serd_reader_new(world, input_syntax, serd_writer_get_sink(writer)); + SerdReader* reader = serd_reader_new( + world, input_syntax, serd_writer_get_sink(writer), stack_size); serd_reader_set_strict(reader, !lax); if (quiet) { diff --git a/src/stack.h b/src/stack.h index 8943370c..174c6378 100644 --- a/src/stack.h +++ b/src/stack.h @@ -66,8 +66,7 @@ serd_stack_push(SerdStack* stack, size_t n_bytes) { const size_t new_size = stack->size + n_bytes; if (stack->buf_size < new_size) { - stack->buf_size += (stack->buf_size >> 1); // *= 1.5 - stack->buf = (char*)realloc(stack->buf, stack->buf_size); + return NULL; } char* const ret = (stack->buf + stack->size); stack->size = new_size; @@ -85,12 +84,16 @@ static inline void* serd_stack_push_aligned(SerdStack* stack, size_t n_bytes, size_t align) { // Push one byte to ensure space for a pad count - serd_stack_push(stack, 1); + if (!serd_stack_push(stack, 1)) { + return NULL; + } // Push padding if necessary const size_t pad = align - stack->size % align; if (pad > 0) { - serd_stack_push(stack, pad); + if (!serd_stack_push(stack, pad)) { + return NULL; + } } // Set top of stack to pad count so we can properly pop later diff --git a/src/string.c b/src/string.c index 9a6b4ff6..c755ff97 100644 --- a/src/string.c +++ b/src/string.c @@ -42,6 +42,7 @@ serd_strerror(SerdStatus status) case SERD_ERR_ID_CLASH: return "Blank node ID clash"; case SERD_ERR_BAD_CURIE: return "Invalid CURIE"; case SERD_ERR_INTERNAL: return "Internal error"; + case SERD_ERR_OVERFLOW: return "Stack overflow"; default: break; } return "Unknown error"; // never reached diff --git a/src/writer.c b/src/writer.c index 9d7a6790..827f7264 100644 --- a/src/writer.c +++ b/src/writer.c @@ -804,6 +804,10 @@ serd_writer_write_statement(SerdWriter* writer, if (flags & (SERD_ANON_S_BEGIN|SERD_ANON_O_BEGIN)) { WriteContext* ctx = (WriteContext*)serd_stack_push( &writer->anon_stack, sizeof(WriteContext)); + if (!ctx) { + return SERD_ERR_OVERFLOW; + } + *ctx = writer->context; WriteContext new_context = { serd_node_copy(graph), serd_node_copy(subject), NULL }; @@ -876,7 +880,7 @@ serd_writer_new(SerdWorld* world, writer->env = env; writer->root_node = NULL; writer->root_uri = SERD_URI_NULL; - writer->anon_stack = serd_stack_new(4 * sizeof(WriteContext)); + writer->anon_stack = serd_stack_new(SERD_PAGE_SIZE); writer->context = context; writer->list_subj = NULL; writer->empty = true; diff --git a/tests/read_chunk_test.c b/tests/read_chunk_test.c index decfe829..9fa40bab 100644 --- a/tests/read_chunk_test.c +++ b/tests/read_chunk_test.c @@ -82,7 +82,7 @@ main(void) SerdWorld* world = serd_world_new(); const SerdSink sink = {NULL, on_base, on_prefix, on_statement, on_end}; - SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink); + SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink, 4096); assert(reader); assert(!serd_reader_start_string(reader, diff --git a/tests/serd_test.c b/tests/serd_test.c index 72f5db84..859ba088 100644 --- a/tests/serd_test.c +++ b/tests/serd_test.c @@ -133,7 +133,7 @@ test_read_chunks(void) FILE* const f = tmpfile(); static const char null = 0; SerdSink sink = {rt, NULL, NULL, test_sink, NULL}; - SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink); + SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink, 4096); assert(reader); assert(f); @@ -213,7 +213,7 @@ test_read_string(void) SerdWorld* world = serd_world_new(); ReaderTest* rt = (ReaderTest*)calloc(1, sizeof(ReaderTest)); SerdSink sink = {rt, NULL, NULL, test_sink, NULL}; - SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink); + SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink, 4096); assert(reader); // Test reading a string that ends exactly at the end of input (no newline) @@ -360,7 +360,7 @@ test_strerror(void) { const char* msg = serd_strerror(SERD_SUCCESS); assert(!strcmp(msg, "Success")); - for (int i = SERD_FAILURE; i <= SERD_ERR_INTERNAL; ++i) { + for (int i = SERD_FAILURE; i <= SERD_ERR_OVERFLOW; ++i) { msg = serd_strerror((SerdStatus)i); assert(strcmp(msg, "Success")); } @@ -687,7 +687,7 @@ test_reader(const char* path) SerdWorld* world = serd_world_new(); ReaderTest rt = { 0, NULL }; SerdSink sink = { &rt, NULL, NULL, test_sink, NULL }; - SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink); + SerdReader* reader = serd_reader_new(world, SERD_TURTLE, &sink, 4096); assert(reader); SerdNode* g = serd_node_new_uri("http://example.org/"); @@ -560,7 +560,7 @@ def test(tst): check([serdi, '%s/tests/good/manifest.ttl' % srcdir]) check([serdi, '-v']) check([serdi, '-h']) - check([serdi, '-s', '<foo> a <#Thingie> .']) + check([serdi, '-k', '512', '-s', '<foo> a <#Thingie> .']) check([serdi, os.devnull]) with tempfile.TemporaryFile(mode='r') as stdin: check([serdi, '-'], stdin=stdin) @@ -575,6 +575,10 @@ def test(tst): check([serdi, '-i', 'illegal']) check([serdi, '-i', 'turtle']) check([serdi, '-i']) + check([serdi, '-k']) + check([serdi, '-k', '-1']) + check([serdi, '-k', str(2**63 - 1)]) + check([serdi, '-k', '1024junk']) check([serdi, '-o', 'illegal']) check([serdi, '-o']) check([serdi, '-p']) |