From 02d56e83931e53e1cde57247c64d56fda3804f77 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Fri, 1 Dec 2023 20:39:44 -0500 Subject: [WIP] Tighten up reader node management [WIP] Broken on 32-bit This makes the reader stack manipulations stricter, to make the code more regular and avoid redundant work and bad cache activity. Now, functions that push node headers and their bodies are responsible for (more or less) immediately pushing any trailing null bytes required for termination and alignment. This makes the writes to the node in the stack more local, ensures nodes are terminated as early as possible (to reduce the risk of using non-terminated strings), and avoids the need to calculate aligned stack allocations. --- src/byte_source.c | 89 ++++++++++++++-------------- src/byte_source.h | 43 +++++++------- src/env.h | 4 +- src/model.c | 3 +- src/node.c | 6 +- src/node.h | 21 ++++--- src/nodes.c | 4 +- src/read_nquads.c | 6 +- src/read_ntriples.c | 86 ++++++++++++++++----------- src/read_trig.c | 10 +++- src/read_turtle.c | 107 ++++++++++++++++++--------------- src/read_utf8.c | 8 +-- src/reader.c | 168 ++++++++++++++++++++++++++++++++-------------------- src/reader.h | 75 ++++++++++++----------- src/stack.h | 21 +++---- 15 files changed, 361 insertions(+), 290 deletions(-) (limited to 'src') diff --git a/src/byte_source.c b/src/byte_source.c index 8959b1e6..0dcb7615 100644 --- a/src/byte_source.c +++ b/src/byte_source.c @@ -38,28 +38,36 @@ serd_byte_source_page(SerdByteSource* const source) return SERD_SUCCESS; } -static void +static SerdStatus serd_byte_source_init_buffer(ZixAllocator* const allocator, SerdByteSource* const source) { if (source->block_size > 1) { - source->block = (uint8_t*)zix_aligned_alloc( - allocator, SERD_PAGE_SIZE, source->block_size); + void* const block = + zix_aligned_alloc(allocator, SERD_PAGE_SIZE, source->block_size); - if ((source->read_buf = source->block)) { - memset(source->block, '\0', source->block_size); + if (!block) { + return SERD_BAD_ALLOC; } + + source->block = (uint8_t*)block; + source->read_buf = source->block; + memset(source->block, '\0', source->block_size); } else { source->read_buf = &source->read_byte; } + + return SERD_SUCCESS; } -SerdByteSource* -serd_byte_source_new_input(ZixAllocator* const allocator, - SerdInputStream* const input, - const SerdNode* const name, - const size_t block_size) +SerdStatus +serd_byte_source_init(ZixAllocator* const allocator, + SerdByteSource* const source, + SerdInputStream* const input, + const SerdNode* const name, + const size_t block_size) { + assert(source); assert(input); assert(block_size); assert(input->stream); @@ -69,70 +77,59 @@ serd_byte_source_new_input(ZixAllocator* const allocator, : serd_node_new(allocator, serd_a_string("input")); if (!source_name) { - return NULL; - } - - SerdByteSource* source = - (SerdByteSource*)zix_calloc(allocator, 1, sizeof(SerdByteSource)); - - if (!source) { - serd_node_free(allocator, source_name); - return NULL; + return SERD_BAD_ALLOC; } - source->name = source_name; source->in = input; - source->block_size = block_size; - source->buf_size = block_size; - source->caret.document = source->name; + source->read_buf = NULL; + source->read_head = 0U; + source->block_size = (uint32_t)block_size; + source->buf_size = (uint32_t)block_size; + source->caret.document = source_name; source->caret.line = 1U; source->caret.col = 1U; + source->name = source_name; + source->block = NULL; + source->read_byte = 0U; + source->prepared = false; - serd_byte_source_init_buffer(allocator, source); - if (block_size > 1 && !source->block) { + if (serd_byte_source_init_buffer(allocator, source)) { serd_node_free(allocator, source_name); - zix_free(allocator, source); - return NULL; + memset(source, 0, sizeof(SerdByteSource)); + return SERD_BAD_ALLOC; } - return source; + return SERD_SUCCESS; } void -serd_byte_source_free(ZixAllocator* const allocator, - SerdByteSource* const source) +serd_byte_source_destroy(ZixAllocator* const allocator, + SerdByteSource* const source) { - if (source) { - if (source->block_size > 1) { - zix_aligned_free(allocator, source->block); - } - - serd_node_free(allocator, source->name); - zix_free(allocator, source); + if (source->block_size > 1) { + zix_aligned_free(allocator, source->block); } + + serd_node_free(allocator, source->name); + memset(source, 0, sizeof(SerdByteSource)); } SerdStatus serd_byte_source_prepare(SerdByteSource* const source) { source->prepared = true; - - if (source->block_size > 1) { - return serd_byte_source_page(source); - } - - return serd_byte_source_advance(source); + return serd_byte_source_page(source); } SerdStatus serd_byte_source_skip_bom(SerdByteSource* const source) { if (serd_byte_source_peek(source) == 0xEF) { - if (serd_byte_source_advance(source) || + if (serd_byte_source_advance_past(source, 0xEF) || serd_byte_source_peek(source) != 0xBB || - serd_byte_source_advance(source) || + serd_byte_source_advance_past(source, 0xBB) || serd_byte_source_peek(source) != 0xBF || - serd_byte_source_advance(source)) { + serd_byte_source_advance_past(source, 0xBF)) { return SERD_BAD_SYNTAX; } } diff --git a/src/byte_source.h b/src/byte_source.h index 5ae40acb..9e65ef75 100644 --- a/src/byte_source.h +++ b/src/byte_source.h @@ -15,8 +15,8 @@ #include #include -#include #include +#include typedef struct { SerdInputStream* in; ///< Input stream to read from @@ -32,14 +32,15 @@ typedef struct { bool eof; ///< True iff end of file reached } SerdByteSource; -SerdByteSource* -serd_byte_source_new_input(ZixAllocator* allocator, - SerdInputStream* input, - const SerdNode* name, - size_t block_size); +SerdStatus +serd_byte_source_init(ZixAllocator* allocator, + SerdByteSource* source, + SerdInputStream* input, + const SerdNode* name, + size_t block_size); void -serd_byte_source_free(ZixAllocator* allocator, SerdByteSource* source); +serd_byte_source_destroy(ZixAllocator* allocator, SerdByteSource* source); SerdStatus serd_byte_source_prepare(SerdByteSource* source); @@ -50,35 +51,37 @@ serd_byte_source_page(SerdByteSource* source); SerdStatus serd_byte_source_skip_bom(SerdByteSource* source); -ZIX_PURE_FUNC static inline uint8_t -serd_byte_source_peek(SerdByteSource* source) +ZIX_PURE_FUNC static inline int +serd_byte_source_peek(const SerdByteSource* const source) { assert(source->prepared); - return source->read_buf[source->read_head]; + + return source->eof ? EOF : (int)source->read_buf[source->read_head]; } static inline SerdStatus -serd_byte_source_advance(SerdByteSource* source) +serd_byte_source_advance_past(SerdByteSource* const source, const int current) { - SerdStatus st = SERD_SUCCESS; - const bool was_eof = source->eof; + /* Reading the buffer here can be an expensive cache miss, so we only assert + that the passed current character is correct in debug builds. In release + builds, this function only accesses the `source` structure, unless a page + read needs to happen. */ + + assert(current == serd_byte_source_peek(source)); - switch (serd_byte_source_peek(source)) { - case '\0': - break; - case '\n': + if (current == '\n') { ++source->caret.line; source->caret.col = 0; - break; - default: + } else { ++source->caret.col; } + SerdStatus st = SERD_SUCCESS; if (++source->read_head >= source->buf_size) { st = serd_byte_source_page(source); } - return (was_eof && source->eof) ? SERD_FAILURE : st; + return st; } #endif // SERD_SRC_BYTE_SOURCE_H diff --git a/src/env.h b/src/env.h index 54ae84e1..ad56b055 100644 --- a/src/env.h +++ b/src/env.h @@ -21,8 +21,8 @@ serd_env_find_prefix(const SerdEnv* env, ZixStringView name); /** Expand `curie`. - Errors: SERD_BAD_ARG if `curie` is not valid, or SERD_BAD_CURIE if prefix is - not defined in `env`. + Errors: SERD_BAD_ARG if `curie` is not valid, or SERD_BAD_CURIE if prefix + is not defined in `env`. */ SerdStatus serd_env_expand_in_place(const SerdEnv* env, diff --git a/src/model.c b/src/model.c index fd1b0f92..fb0fd940 100644 --- a/src/model.c +++ b/src/model.c @@ -15,6 +15,7 @@ #include "serd/statement.h" #include "serd/status.h" #include "zix/allocator.h" +#include "zix/attributes.h" #include "zix/btree.h" #include "zix/status.h" @@ -386,7 +387,7 @@ simple_order(const SerdStatementOrder order) } /// Return the best index scanning strategy for a pattern with given nodes -static ScanStrategy +ZIX_PURE_FUNC static ScanStrategy serd_model_strategy(const SerdModel* const model, const bool with_s, const bool with_p, diff --git a/src/node.c b/src/node.c index c89f20fd..56fcb4b6 100644 --- a/src/node.c +++ b/src/node.c @@ -229,7 +229,7 @@ serd_node_set(ZixAllocator* const allocator, which must be normalized before being passed to a sink so comparison will work correctly. */ -void +static void serd_node_zero_pad(SerdNode* node) { char* buf = serd_node_buffer(node); @@ -274,6 +274,8 @@ serd_node_construct_simple(const size_t buf_size, } serd_node_zero_pad(node); + assert(total_size % sizeof(SerdNode) == 0); + return result(SERD_SUCCESS, total_size); } @@ -612,7 +614,7 @@ serd_node_new(ZixAllocator* const allocator, const SerdNodeArgs args) assert(r.count % sizeof(SerdNode) == 0); SerdNode* const node = - serd_node_malloc(allocator, sizeof(SerdNode) + r.count + 1); + serd_node_malloc(allocator, sizeof(SerdNode) + r.count); if (node) { r = serd_node_construct(r.count, node, args); diff --git a/src/node.h b/src/node.h index 4c7afe2b..7223eb67 100644 --- a/src/node.h +++ b/src/node.h @@ -25,14 +25,19 @@ static const size_t serd_node_align = 2 * sizeof(uint64_t); #if SIZE_MAX == UINT64_MAX +/** + Pad a node string length to the number of bytes it will occupy in a node. + + This returns a size that is at least one larger than `n_bytes` (to ensure + the string is null terminated), but possibly even larger (to align the node + size). +*/ static inline size_t serd_node_pad_length(const size_t n_bytes) { - const size_t align = sizeof(SerdNode); - - assert((align & (align - 1U)) == 0U); + assert((serd_node_align & (serd_node_align - 1U)) == 0U); - return (n_bytes + align + 2U) & ~(align - 1U); + return (n_bytes + serd_node_align) & ~(serd_node_align - 1U); } #else @@ -40,10 +45,7 @@ serd_node_pad_length(const size_t n_bytes) static inline size_t serd_node_pad_length(const size_t n_bytes) { - const size_t pad = sizeof(SerdNode) - (n_bytes + 2) % sizeof(SerdNode); - const size_t size = n_bytes + 2 + pad; - assert(size % sizeof(SerdNode) == 0); - return size; + return (n_bytes + sizeof(SerdNode)) / sizeof(SerdNode) * sizeof(SerdNode); } #endif @@ -101,7 +103,4 @@ serd_node_set(ZixAllocator* ZIX_NULLABLE allocator, ZIX_PURE_FUNC size_t serd_node_total_size(const SerdNode* ZIX_NONNULL node); -void -serd_node_zero_pad(SerdNode* ZIX_NONNULL node); - #endif // SERD_SRC_NODE_H diff --git a/src/nodes.c b/src/nodes.c index e353b9aa..dae67077 100644 --- a/src/nodes.c +++ b/src/nodes.c @@ -198,7 +198,7 @@ node_equals_spec(const SerdNode* const node, const NodeSpec* const spec) !strcmp(serd_node_string_i(serd_node_meta_c(node)), spec->meta.data)); } -static bool +ZIX_PURE_FUNC static bool nodes_meta_equal(const SerdNode* const a, const SerdNode* const b) { assert(a->flags & meta_mask); @@ -213,7 +213,7 @@ nodes_meta_equal(const SerdNode* const a, const SerdNode* const b) !memcmp(serd_node_string_i(am), serd_node_string_i(bm), am->length); } -static bool +ZIX_PURE_FUNC static bool nodes_equal(const SerdNode* const a, const SerdNode* const b) { return (a == b) || diff --git a/src/read_nquads.c b/src/read_nquads.c index 6f0120d1..44c29d0a 100644 --- a/src/read_nquads.c +++ b/src/read_nquads.c @@ -4,7 +4,6 @@ #include "read_nquads.h" #include "caret.h" -#include "node.h" #include "read_ntriples.h" #include "reader.h" #include "stack.h" @@ -44,7 +43,7 @@ read_nquads_statement(SerdReader* const reader) } // Preserve the caret for error reporting and read object - SerdCaret orig_caret = reader->source->caret; + SerdCaret orig_caret = reader->source.caret; if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) || (st = skip_horizontal_whitespace(reader))) { return st; @@ -52,7 +51,7 @@ read_nquads_statement(SerdReader* const reader) if (!ate_dot) { if (peek_byte(reader) == '.') { - eat_byte(reader); + skip_byte(reader, '.'); } else { TRY(st, read_graphLabel(reader, &ctx.graph)); skip_horizontal_whitespace(reader); @@ -60,7 +59,6 @@ read_nquads_statement(SerdReader* const reader) } } - serd_node_zero_pad(ctx.object); const SerdStatement statement = { {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret}; diff --git a/src/read_ntriples.c b/src/read_ntriples.c index 5c02abfe..e5101522 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -42,23 +42,29 @@ read_LANGTAG(SerdReader* const reader) return r_err(reader, SERD_BAD_SYNTAX, "expected A-Z or a-z"); } - SerdNode* node = push_node(reader, SERD_LITERAL, "", 0); + SerdNode* const node = push_node_head(reader, SERD_LITERAL); if (!node) { return SERD_BAD_STACK; } SerdStatus st = SERD_SUCCESS; - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); + TRY(st, skip_byte(reader, c)); + TRY(st, push_byte(reader, node, c)); while ((c = peek_byte(reader)) && is_alpha(c)) { TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); } while (peek_byte(reader) == '-') { TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-'))); - while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); + + c = peek_byte(reader); + while (is_alpha(c) || is_digit(c)) { + TRY(st, push_byte(reader, node, c)); + TRY(st, skip_byte(reader, c)); + c = peek_byte(reader); } } - return SERD_SUCCESS; + + return push_node_tail(reader); } static bool @@ -71,13 +77,16 @@ is_EOL(const int c) SerdStatus read_EOL(SerdReader* const reader) { - if (!is_EOL(peek_byte(reader))) { + int c = peek_byte(reader); + + if (!is_EOL(c)) { return r_err(reader, SERD_BAD_SYNTAX, "expected a line ending"); } - while (is_EOL(peek_byte(reader))) { - eat_byte(reader); - } + do { + skip_byte(reader, c); + c = peek_byte(reader); + } while (is_EOL(c)); return SERD_SUCCESS; } @@ -176,7 +185,9 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) uint32_t code = 0U; while (st <= SERD_FAILURE) { - const int c = eat_byte(reader); + const int c = peek_byte(reader); + skip_byte(reader, c); + switch (c) { case ' ': case '"': @@ -239,9 +250,10 @@ static SerdStatus read_IRI(SerdReader* const reader, SerdNode** const dest) { SerdStatus st = SERD_SUCCESS; - TRY(st, eat_byte_check(reader, '<')); - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + TRY(st, skip_byte(reader, '<')); + + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -249,7 +261,8 @@ read_IRI(SerdReader* const reader, SerdNode** const dest) return r_err(reader, st, "expected IRI scheme"); } - return read_IRIREF_suffix(reader, *dest); + TRY(st, read_IRIREF_suffix(reader, *dest)); + return push_node_tail(reader); } SerdStatus @@ -287,7 +300,7 @@ read_STRING_LITERAL(SerdReader* const reader, case '\r': return r_err(reader, SERD_BAD_SYNTAX, "line end in short string"); case '\\': - skip_byte(reader, c); + TRY(st, skip_byte(reader, c)); TRY(st, read_string_escape(reader, ref)); break; default: @@ -330,7 +343,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, { SerdStatus st = SERD_SUCCESS; - skip_byte(reader, '_'); + TRY(st, skip_byte(reader, '_')); TRY(st, eat_byte_check(reader, ':')); int c = peek_byte(reader); @@ -339,8 +352,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, return r_err(reader, SERD_BAD_SYNTAX, "expected blank node label"); } - if (!(*dest = push_node( - reader, SERD_BLANK, reader->bprefix, reader->bprefix_len))) { + if (!(*dest = push_node_head(reader, SERD_BLANK))) { return SERD_BAD_STACK; } @@ -373,7 +385,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, // Adjust ID to avoid clashes with generated IDs if necessary st = adjust_blank_id(reader, buf); - return tolerate_status(reader, st) ? SERD_SUCCESS : st; + return tolerate_status(reader, st) ? push_node_tail(reader) : st; } static unsigned @@ -592,19 +604,22 @@ read_VARNAME(SerdReader* const reader, SerdNode** const dest) SerdStatus read_Var(SerdReader* const reader, SerdNode** const dest) { + SerdStatus st = SERD_SUCCESS; + const int c = peek_byte(reader); + assert(c == '$' || c == '?'); + if (!(reader->flags & SERD_READ_VARIABLES)) { return r_err(reader, SERD_BAD_SYNTAX, "syntax does not support variables"); } - const int c = peek_byte(reader); - assert(c == '$' || c == '?'); - skip_byte(reader, c); - - if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_VARIABLE))) { return SERD_BAD_STACK; } - return read_VARNAME(reader, dest); + TRY(st, skip_byte(reader, c)); + TRY(st, read_VARNAME(reader, dest)); + + return st ? st : push_node_tail(reader); } // Nonterminals @@ -613,14 +628,16 @@ read_Var(SerdReader* const reader, SerdNode** const dest) SerdStatus read_comment(SerdReader* const reader) { - skip_byte(reader, '#'); + SerdStatus st = SERD_SUCCESS; + + TRY(st, skip_byte(reader, '#')); for (int c = peek_byte(reader); c && c != '\n' && c != '\r' && c != EOF;) { - skip_byte(reader, c); + TRY(st, skip_byte(reader, c)); c = peek_byte(reader); } - return SERD_SUCCESS; + return st; } /// [6] literal @@ -629,22 +646,23 @@ read_literal(SerdReader* const reader, SerdNode** const dest) { SerdStatus st = SERD_SUCCESS; - if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } - skip_byte(reader, '"'); + TRY(st, skip_byte(reader, '"')); TRY(st, read_STRING_LITERAL(reader, *dest, '"')); + TRY(st, push_node_tail(reader)); SerdNode* datatype = NULL; switch (peek_byte(reader)) { case '@': - skip_byte(reader, '@'); + TRY(st, skip_byte(reader, '@')); TRY(st, read_LANGTAG(reader)); (*dest)->flags |= SERD_HAS_LANGUAGE; break; case '^': - skip_byte(reader, '^'); + TRY(st, skip_byte(reader, '^')); TRY(st, eat_byte_check(reader, '^')); TRY(st, read_IRI(reader, &datatype)); (*dest)->flags |= SERD_HAS_DATATYPE; @@ -724,7 +742,7 @@ read_triple(SerdReader* const reader) } // Preserve the caret for error reporting and read object - SerdCaret orig_caret = reader->source->caret; + SerdCaret orig_caret = reader->source.caret; if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) || (st = skip_horizontal_whitespace(reader))) { return st; @@ -734,10 +752,6 @@ read_triple(SerdReader* const reader) return st; } - if (ctx.object) { - serd_node_zero_pad(ctx.object); - } - const SerdStatement statement = { {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret}; diff --git a/src/read_trig.c b/src/read_trig.c index e3d7a7e9..76a693ec 100644 --- a/src/read_trig.c +++ b/src/read_trig.c @@ -83,16 +83,20 @@ read_sparql_directive(SerdReader* const reader, ReadContext* const ctx, const SerdNode* const token) { + SerdStatus st = SERD_SUCCESS; + if (!tokcmp(token, "base", 4)) { + TRY(st, push_node_tail(reader)); return read_turtle_base(reader, true, false); } if (!tokcmp(token, "prefix", 6)) { + TRY(st, push_node_tail(reader)); return read_turtle_prefixID(reader, true, false); } if (!tokcmp(token, "graph", 5)) { - SerdStatus st = SERD_SUCCESS; + TRY(st, push_node_tail(reader)); read_turtle_ws_star(reader); TRY(st, read_labelOrSubject(reader, &ctx->graph)); read_turtle_ws_star(reader); @@ -165,7 +169,7 @@ read_trig_statement(SerdReader* const reader) return SERD_FAILURE; case '\0': - eat_byte(reader); + skip_byte(reader, '\0'); return SERD_FAILURE; case '@': @@ -185,7 +189,7 @@ read_trig_statement(SerdReader* const reader) SerdStatus read_trigDoc(SerdReader* const reader) { - while (!reader->source->eof) { + while (!reader->source.eof) { const size_t orig_stack_size = reader->stack.size; const SerdStatus st = read_trig_statement(reader); diff --git a/src/read_turtle.c b/src/read_turtle.c index c6982327..a041e873 100644 --- a/src/read_turtle.c +++ b/src/read_turtle.c @@ -2,7 +2,8 @@ // SPDX-License-Identifier: ISC #include "read_turtle.h" -#include "byte_source.h" + +#include "caret.h" #include "env.h" #include "namespaces.h" #include "node.h" @@ -14,7 +15,6 @@ #include "try.h" #include "turtle.h" -#include "serd/caret.h" #include "serd/env.h" #include "serd/node.h" #include "serd/reader.h" @@ -49,12 +49,14 @@ read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); static SerdStatus read_whitespace(SerdReader* const reader) { - switch (peek_byte(reader)) { + const int c = peek_byte(reader); + + switch (c) { case '\t': case '\n': case '\r': case ' ': - return serd_byte_source_advance(reader->source); + return skip_byte(reader, c); case '#': return read_comment(reader); default: @@ -338,7 +340,7 @@ resolve_IRIREF(SerdReader* const reader, } // Push a new temporary node for constructing the resolved URI - SerdNode* const temp = push_node(reader, SERD_URI, "", 0); + SerdNode* const temp = push_node_head(reader, SERD_URI); if (!temp) { return SERD_BAD_STACK; } @@ -346,10 +348,12 @@ resolve_IRIREF(SerdReader* const reader, // Write resolved URI to the temporary node WriteNodeContext ctx = {reader, temp, SERD_SUCCESS}; temp->length = serd_write_uri(uri, write_to_stack, &ctx); + if (!ctx.status) { // Replace the destination with the new expanded node memmove(dest, temp, sizeof(SerdNode) + serd_node_pad_length(temp->length)); serd_stack_pop_to(&reader->stack, string_start_offset + dest->length); + TRY(ctx.status, push_node_tail(reader)); } return ctx.status; @@ -361,7 +365,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) SerdStatus st = SERD_SUCCESS; TRY(st, eat_byte_check(reader, '<')); - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -372,8 +376,10 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) return st; } + TRY(st, push_node_tail(reader)); + return (reader->flags & SERD_READ_RELATIVE) - ? SERD_SUCCESS + ? st : resolve_IRIREF(reader, *dest, string_start_offset); } @@ -411,7 +417,7 @@ read_PrefixedName(SerdReader* const reader, return st; } - return SERD_SUCCESS; + return push_node_tail(reader); } static SerdStatus @@ -439,12 +445,11 @@ read_number(SerdReader* const reader, #define XSD_DOUBLE NS_XSD "double" #define XSD_INTEGER NS_XSD "integer" - *dest = push_node(reader, SERD_LITERAL, "", 0); - SerdStatus st = SERD_SUCCESS; int c = peek_byte(reader); bool has_decimal = false; - if (!*dest) { + + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } @@ -467,8 +472,8 @@ read_number(SerdReader* const reader, skip_byte(reader, c); c = peek_byte(reader); if (!is_digit(c) && c != 'e' && c != 'E') { - *ate_dot = true; // Force caller to deal with stupid grammar - return SERD_SUCCESS; // Next byte is not a number character + *ate_dot = true; // Force caller to deal with stupid grammar + return push_node_tail(reader); // Next byte is not a number character } TRY(st, push_byte(reader, *dest, '.')); @@ -491,12 +496,13 @@ read_number(SerdReader* const reader, break; } TRY(st, read_0_9(reader, *dest, true)); + TRY(st, push_node_tail(reader)); meta = push_node(reader, SERD_URI, XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1); - (*dest)->flags |= SERD_HAS_DATATYPE; } else if (has_decimal) { + TRY(st, push_node_tail(reader)); meta = push_node(reader, SERD_URI, XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1); - (*dest)->flags |= SERD_HAS_DATATYPE; } else { + TRY(st, push_node_tail(reader)); meta = push_node(reader, SERD_URI, XSD_INTEGER, sizeof(XSD_INTEGER) - 1); } @@ -513,11 +519,14 @@ read_turtle_iri(SerdReader* const reader, return read_IRIREF(reader, dest); } - if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } - return read_PrefixedName(reader, *dest, true, ate_dot, reader->stack.size); + const SerdStatus st = + read_PrefixedName(reader, *dest, true, ate_dot, reader->stack.size); + + return st; } static SerdStatus @@ -525,14 +534,14 @@ read_literal(SerdReader* const reader, SerdNode** const dest, bool* const ate_dot) { - if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + SerdStatus st = SERD_SUCCESS; + + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } - SerdStatus st = read_String(reader, *dest); - if (st) { - return st; - } + TRY(st, read_String(reader, *dest)); + TRY(st, push_node_tail(reader)); SerdNode* datatype = NULL; switch (peek_byte(reader)) { @@ -546,9 +555,11 @@ read_literal(SerdReader* const reader, TRY(st, eat_byte_check(reader, '^')); (*dest)->flags |= SERD_HAS_DATATYPE; TRY(st, read_turtle_iri(reader, &datatype, ate_dot)); + assert(datatype == serd_node_meta_c(*dest)); break; } - return SERD_SUCCESS; + + return st; } static SerdStatus @@ -567,7 +578,7 @@ read_verb(SerdReader* reader, SerdNode** const dest) /* Either a qname, or "a". Read the prefix first, and if it is in fact "a", produce that instead. */ - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -593,7 +604,7 @@ read_verb(SerdReader* reader, SerdNode** const dest) reader, st > SERD_FAILURE ? st : SERD_BAD_SYNTAX, "expected verb"); } - return SERD_SUCCESS; + return SERD_SUCCESS; // push_node_tail(reader); } static SerdStatus @@ -670,7 +681,7 @@ read_named_object(SerdReader* const reader, Deal with this here by trying to read a prefixed node, then if it turns out to actually be "true" or "false", switch it to a boolean literal. */ - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -685,6 +696,7 @@ read_named_object(SerdReader* const reader, node_has_string(node, false_string))) { node->flags = SERD_HAS_DATATYPE; node->type = SERD_LITERAL; + TRY(st, push_node_tail(reader)); return push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN) ? SERD_SUCCESS : SERD_BAD_STACK; @@ -696,7 +708,7 @@ read_named_object(SerdReader* const reader, return r_err(reader, st, "expected prefixed name or boolean"); } - return SERD_SUCCESS; + return SERD_SUCCESS; // push_node_tail(reader); } // Read an object and emit statements, possibly recursively @@ -705,8 +717,8 @@ read_object(SerdReader* const reader, ReadContext* const ctx, bool* const ate_dot) { - const size_t orig_stack_size = reader->stack.size; - SerdCaret orig_caret = reader->source->caret; + const size_t orig_stack_size = reader->stack.size; + struct SerdCaretImpl orig_caret = reader->source.caret; assert(ctx->subject); @@ -809,7 +821,7 @@ read_predicateObjectList(SerdReader* const reader, int c = 0; do { read_turtle_ws_star(reader); - switch (c = peek_byte(reader)) { + switch ((c = peek_byte(reader))) { case EOF: serd_stack_pop_to(&reader->stack, orig_stack_size); return r_err(reader, SERD_BAD_SYNTAX, "unexpected end of file"); @@ -869,11 +881,9 @@ read_collection(SerdReader* const reader, /* The order of node allocation here is necessarily not in stack order, so we create two nodes and recycle them throughout. */ - SerdNode* n1 = - push_node_padded(reader, genid_length(reader), SERD_BLANK, "", 0); - + SerdNode* n1 = push_node_padding(reader, SERD_BLANK, genid_length(reader)); SerdNode* node = n1; - SerdNode* rest = 0; + SerdNode* rest = NULL; if (!n1) { return SERD_BAD_STACK; @@ -985,7 +995,6 @@ read_turtle_base(SerdReader* const reader, const bool sparql, const bool token) return SERD_BAD_STACK; } - serd_node_zero_pad(uri); TRY(st, serd_env_set_base_uri(reader->env, serd_node_string_view(uri))); TRY(st, serd_sink_write_base(reader->sink, uri)); @@ -1012,25 +1021,22 @@ read_turtle_prefixID(SerdReader* const reader, } read_turtle_ws_star(reader); - SerdNode* name = push_node(reader, SERD_LITERAL, "", 0); + + SerdNode* const name = push_node_head(reader, SERD_LITERAL); if (!name) { return SERD_BAD_STACK; } + // Read (possibly empty) name node TRY_LAX(st, read_PN_PREFIX(reader, name)); + TRY(st, push_node_tail(reader)); TRY(st, eat_byte_check(reader, ':')); read_turtle_ws_star(reader); + // Read URI node SerdNode* uri = NULL; TRY(st, read_IRIREF(reader, &uri)); - if (reader->stack.size + sizeof(SerdNode) > reader->stack.buf_size) { - return SERD_BAD_STACK; - } - - serd_node_zero_pad(name); - serd_node_zero_pad(uri); - TRY(st, serd_env_set_prefix( reader->env, serd_node_string_view(name), serd_node_string_view(uri))); @@ -1064,11 +1070,15 @@ read_turtle_directive(SerdReader* const reader) static SerdStatus read_sparql_directive(SerdReader* const reader, const SerdNode* const token) { + SerdStatus st = SERD_SUCCESS; + if (!tokcmp(token, "base", 4)) { + TRY(st, push_node_tail(reader)); return read_turtle_base(reader, true, false); } if (!tokcmp(token, "prefix", 6)) { + TRY(st, push_node_tail(reader)); return read_turtle_prefixID(reader, true, false); } @@ -1078,11 +1088,10 @@ read_sparql_directive(SerdReader* const reader, const SerdNode* const token) static SerdStatus read_block(SerdReader* const reader, ReadContext* const ctx) { - SerdStatus st = SERD_SUCCESS; - // Try to read a subject, though it may actually be a directive or graph name - SerdNode* token = NULL; - int s_type = 0; + SerdNode* token = NULL; + SerdStatus st = SERD_SUCCESS; + int s_type = 0; TRY_LAX(st, read_turtle_subject(reader, *ctx, &token, &s_type)); // Try to interpret as a SPARQL "PREFIX" or "BASE" directive @@ -1121,7 +1130,7 @@ read_turtle_statement(SerdReader* const reader) return SERD_FAILURE; case '\0': - eat_byte(reader); + skip_byte(reader, '\0'); return SERD_FAILURE; case '@': @@ -1138,7 +1147,7 @@ read_turtle_statement(SerdReader* const reader) SerdStatus read_turtleDoc(SerdReader* const reader) { - while (!reader->source->eof) { + while (!reader->source.eof) { const size_t orig_stack_size = reader->stack.size; const SerdStatus st = read_turtle_statement(reader); diff --git a/src/read_utf8.c b/src/read_utf8.c index 4639c34e..5b1f737c 100644 --- a/src/read_utf8.c +++ b/src/read_utf8.c @@ -70,15 +70,15 @@ read_utf8_code_point(SerdReader* const reader, uint32_t* const code, const uint8_t lead) { - uint8_t size = 0U; - uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U}; + SerdStatus st = SERD_SUCCESS; + uint8_t size = 0U; + uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U}; *code = 0U; skip_byte(reader, lead); - SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead); - if (st) { + if ((st = read_utf8_continuation_bytes(reader, bytes, &size, lead))) { return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3); } diff --git a/src/reader.c b/src/reader.c index cfeaf0c0..10d298ea 100644 --- a/src/reader.c +++ b/src/reader.c @@ -34,7 +34,7 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) va_start(args, fmt); serd_vlogf_at( - reader->world, SERD_LOG_LEVEL_ERROR, &reader->source->caret, fmt, args); + reader->world, SERD_LOG_LEVEL_ERROR, &reader->source.caret, fmt, args); va_end(args); return st; @@ -43,8 +43,10 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) SerdStatus skip_horizontal_whitespace(SerdReader* const reader) { - while (peek_byte(reader) == '\t' || peek_byte(reader) == ' ') { - eat_byte(reader); + int c = peek_byte(reader); + while (c == '\t' || c == ' ') { + skip_byte(reader, c); + c = peek_byte(reader); } return SERD_SUCCESS; @@ -105,47 +107,94 @@ tolerate_status(const SerdReader* const reader, const SerdStatus status) SerdNode* blank_id(SerdReader* const reader) { - SerdNode* const ref = - push_node_padded(reader, genid_length(reader), SERD_BLANK, "", 0); + const size_t length = genid_length(reader); + SerdNode* const ref = push_node_padding(reader, SERD_BLANK, length); if (ref) { - set_blank_id(reader, ref, genid_length(reader) + 1); + set_blank_id(reader, ref, length + 1U); } return ref; } -SerdNode* -push_node_padded(SerdReader* const reader, - const size_t max_length, - const SerdNodeType type, - const char* const str, - const size_t length) +static SerdNode* +push_node_start(SerdReader* const reader, + const SerdNodeType type, + const size_t body_size) { - // Push a null byte to ensure the previous node was null terminated - char* terminator = (char*)serd_stack_push(&reader->stack, 1); - if (!terminator) { - return NULL; + /* The top of the stack should already be aligned, because the previous node + must be terminated before starting a new one. This is statically + assumed/enforced here to ensure that it's done earlier, usually right + after writing the node body. That way is less error-prone, because nodes + are terminated earlier which reduces the risk of accidentally using a + non-terminated node. It's also faster, for two reasons: + + - Nodes, including termination, are written to the stack in a single + sweep, as "tightly" as possible (avoiding the need to re-load that + section of the stack into the cache for writing). + + - Pushing a new node header (this function) doesn't need to do any + alignment calculations. + */ + + assert(!(reader->stack.size % sizeof(SerdNode))); + + const size_t size = sizeof(SerdNode) + body_size; + SerdNode* const node = (SerdNode*)serd_stack_push(&reader->stack, size); + + if (node) { + node->length = 0U; + node->flags = 0U; + node->type = type; } - *terminator = 0; - void* mem = serd_stack_push_aligned( - &reader->stack, sizeof(SerdNode) + max_length + 1, sizeof(SerdNode)); + return node; +} - if (!mem) { - return NULL; +/// Push a null byte to ensure the previous node was null terminated +static char* +push_node_end(SerdReader* const reader) +{ + char* const terminator = (char*)serd_stack_push(&reader->stack, 1U); + + if (terminator) { + *terminator = 0; } - SerdNode* const node = (SerdNode*)mem; + return terminator; +} - node->length = length; - node->flags = 0; - node->type = type; +SerdNode* +push_node_head(SerdReader* const reader, const SerdNodeType type) +{ + return push_node_start(reader, type, 0U); +} - char* buf = (char*)(node + 1); - memcpy(buf, str, length + 1); +SerdStatus +push_node_tail(SerdReader* const reader) +{ + if (!push_node_end(reader) || + !serd_stack_push_pad(&reader->stack, sizeof(SerdNode))) { + return SERD_BAD_STACK; + } - return node; + assert(!(reader->stack.size % sizeof(SerdNode))); + return SERD_SUCCESS; +} + +SerdNode* +push_node_padding(SerdReader* const reader, + const SerdNodeType type, + const size_t max_length) +{ + SerdNode* const node = push_node_start(reader, type, max_length); + if (!node) { + return NULL; + } + + memset(serd_node_buffer(node), 0, max_length); + + return !push_node_tail(reader) ? node : NULL; } SerdNode* @@ -154,7 +203,15 @@ push_node(SerdReader* const reader, const char* const str, const size_t length) { - return push_node_padded(reader, length, type, str, length); + SerdNode* const node = push_node_start(reader, type, length); + if (!node) { + return NULL; + } + + node->length = length; + memcpy(serd_node_buffer(node), str, length); + + return !push_node_tail(reader) ? node : NULL; } int @@ -175,10 +232,6 @@ emit_statement_at(SerdReader* const reader, return SERD_BAD_STACK; } - /* Zero the pad of the object node on the top of the stack. Lower nodes - (subject and predicate) were already zeroed by subsequent pushes. */ - serd_node_zero_pad(o); - const SerdStatement statement = {{ctx.subject, ctx.predicate, o, ctx.graph}, caret}; @@ -194,7 +247,7 @@ emit_statement(SerdReader* const reader, const ReadContext ctx, SerdNode* const o) { - return emit_statement_at(reader, ctx, o, &reader->source->caret); + return emit_statement_at(reader, ctx, o, &reader->source.caret); } SerdStatus @@ -202,7 +255,7 @@ serd_reader_read_document(SerdReader* const reader) { assert(reader); - if (!reader->source) { + if (!reader->source.read_buf) { return SERD_BAD_CALL; } @@ -213,7 +266,7 @@ serd_reader_read_document(SerdReader* const reader) ++reader->world->next_document_id); } - if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source->prepared) { + if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source.prepared) { SerdStatus st = serd_reader_prepare(reader); if (st) { return st; @@ -301,7 +354,7 @@ serd_reader_free(SerdReader* const reader) return; } - if (reader->source) { + if (reader->source.in) { serd_reader_finish(reader); } @@ -318,30 +371,28 @@ serd_reader_start(SerdReader* const reader, assert(reader); assert(input); - if (!block_size || !input->stream) { + if (!block_size || block_size > UINT32_MAX || !input->stream) { return SERD_BAD_ARG; } - if (reader->source) { + if (reader->source.in) { return SERD_BAD_CALL; } - reader->source = serd_byte_source_new_input( - reader->world->allocator, input, input_name, block_size); - - return reader->source ? SERD_SUCCESS : SERD_BAD_ALLOC; + return serd_byte_source_init( + reader->world->allocator, &reader->source, input, input_name, block_size); } static SerdStatus serd_reader_prepare(SerdReader* const reader) { - SerdStatus st = serd_byte_source_prepare(reader->source); + SerdStatus st = serd_byte_source_prepare(&reader->source); if (st == SERD_SUCCESS) { - if ((st = serd_byte_source_skip_bom(reader->source))) { + if ((st = serd_byte_source_skip_bom(&reader->source))) { r_err(reader, SERD_BAD_SYNTAX, "corrupt byte order mark"); } } else if (st == SERD_FAILURE) { - reader->source->eof = true; + reader->source.eof = true; } return st; } @@ -351,22 +402,11 @@ serd_reader_read_chunk(SerdReader* const reader) { assert(reader); - SerdStatus st = SERD_SUCCESS; - if (!reader->source) { - return SERD_BAD_CALL; - } - - if (!reader->source->prepared) { - st = serd_reader_prepare(reader); - } else if (reader->source->eof) { - st = serd_byte_source_advance(reader->source); - } - - if (peek_byte(reader) == 0) { - // Skip leading null byte, for reading from a null-delimited socket - serd_byte_source_advance(reader->source); - return SERD_FAILURE; - } + const SerdStatus st = + (!reader->source.in) ? SERD_BAD_CALL + : (!reader->source.prepared) ? serd_reader_prepare(reader) + : (reader->source.eof) ? serd_byte_source_page(&reader->source) + : SERD_SUCCESS; if (st) { return st; @@ -392,7 +432,7 @@ serd_reader_finish(SerdReader* const reader) { assert(reader); - serd_byte_source_free(reader->world->allocator, reader->source); - reader->source = NULL; + serd_byte_source_destroy(reader->world->allocator, &reader->source); + return SERD_SUCCESS; } diff --git a/src/reader.h b/src/reader.h index 9ec2e4ac..837bf969 100644 --- a/src/reader.h +++ b/src/reader.h @@ -25,6 +25,7 @@ #include #include #include +#include typedef struct { SerdNode* graph; @@ -37,13 +38,13 @@ typedef struct { struct SerdReaderImpl { SerdWorld* world; const SerdSink* sink; + SerdByteSource source; + SerdStack stack; SerdNode* rdf_first; SerdNode* rdf_rest; SerdNode* rdf_nil; SerdNode* rdf_type; - SerdByteSource* source; SerdEnv* env; - SerdStack stack; SerdSyntax syntax; SerdReaderFlags flags; unsigned next_id; @@ -60,13 +61,34 @@ SERD_LOG_FUNC(3, 4) SerdStatus r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); +/** + Push the SerdNode header of a node with zero flags and length. + + If this is called, push_node_tail() must eventually be called before + starting a new node. +*/ SerdNode* -push_node_padded(SerdReader* reader, - size_t max_length, - SerdNodeType type, - const char* str, - size_t length); +push_node_head(SerdReader* reader, SerdNodeType type); + +/** + Push the end of a node, a null terminator and any necessary padding. + + This must be called to close the scope opened with push_node_head(). +*/ +SerdStatus +push_node_tail(SerdReader* reader); +/** + Push a node with reserved space for a body. + + The body is initially all zero, as are the node's length and flags. +*/ +SerdNode* +push_node_padding(SerdReader* reader, SerdNodeType type, size_t max_length); + +/** + Push a complete node with a given string body. +*/ SerdNode* push_node(SerdReader* reader, SerdNodeType type, @@ -98,11 +120,9 @@ SerdStatus emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o); static inline int -peek_byte(SerdReader* reader) +peek_byte(const SerdReader* const reader) { - SerdByteSource* source = reader->source; - - return source->eof ? EOF : (int)source->read_buf[source->read_head]; + return serd_byte_source_peek(&reader->source); } static inline SerdStatus @@ -112,19 +132,7 @@ skip_byte(SerdReader* reader, const int byte) assert(peek_byte(reader) == byte); - return serd_byte_source_advance(reader->source); -} - -static inline int -eat_byte(SerdReader* const reader) -{ - const int c = peek_byte(reader); - - if (c != EOF) { - serd_byte_source_advance(reader->source); - } - - return c; + return serd_byte_source_advance_past(&reader->source, byte); } static inline int SERD_NODISCARD @@ -134,7 +142,7 @@ eat_byte_safe(SerdReader* reader, const int byte) assert(peek_byte(reader) == byte); - serd_byte_source_advance(reader->source); + serd_byte_source_advance_past(&reader->source, byte); return byte; } @@ -167,32 +175,27 @@ push_byte(SerdReader* reader, SerdNode* node, const int c) { assert(c != EOF); - if (reader->stack.size + 1 > reader->stack.buf_size) { + const size_t old_size = reader->stack.size; + if (old_size >= reader->stack.buf_size) { return SERD_BAD_STACK; } - ((uint8_t*)reader->stack.buf)[reader->stack.size - 1] = (uint8_t)c; ++reader->stack.size; ++node->length; + reader->stack.buf[old_size] = (char)c; + return SERD_SUCCESS; } static inline SerdStatus -push_bytes(SerdReader* const reader, - SerdNode* const node, - const uint8_t* const bytes, - const size_t len) +push_bytes(SerdReader* reader, SerdNode* node, const uint8_t* bytes, size_t len) { if (reader->stack.buf_size < reader->stack.size + len) { return SERD_BAD_STACK; } - const size_t begin = reader->stack.size - 1U; - for (unsigned i = 0U; i < len; ++i) { - reader->stack.buf[begin + i] = (char)bytes[i]; - } - + memcpy(reader->stack.buf + reader->stack.size, bytes, len); reader->stack.size += len; node->length += len; return SERD_SUCCESS; diff --git a/src/stack.h b/src/stack.h index 94e091a1..c11970c1 100644 --- a/src/stack.h +++ b/src/stack.h @@ -47,14 +47,14 @@ serd_stack_free(ZixAllocator* const allocator, SerdStack* stack) static inline void* serd_stack_push(SerdStack* stack, size_t n_bytes) { - const size_t new_size = stack->size + n_bytes; + const size_t old_size = stack->size; + const size_t new_size = old_size + n_bytes; if (stack->buf_size < new_size) { return NULL; } - char* const ret = (stack->buf + stack->size); - stack->size = new_size; - return ret; + stack->size = new_size; + return stack->buf + old_size; } static inline void @@ -73,20 +73,21 @@ serd_stack_pop_to(SerdStack* stack, size_t n_bytes) } static inline void* -serd_stack_push_aligned(SerdStack* stack, size_t n_bytes, size_t align) +serd_stack_push_pad(SerdStack* stack, size_t align) { // Push padding if necessary - const size_t pad = align - stack->size % align; - if (pad > 0) { - void* padding = serd_stack_push(stack, pad); + const size_t leftovers = stack->size % align; + if (leftovers) { + const size_t pad = align - leftovers; + void* const padding = serd_stack_push(stack, pad); if (!padding) { return NULL; } memset(padding, 0, pad); + return padding; } - // Push requested space at aligned location - return serd_stack_push(stack, n_bytes); + return stack->buf + stack->size; } #endif // SERD_SRC_STACK_H -- cgit v1.2.1