diff options
-rw-r--r-- | src/byte_source.c | 89 | ||||
-rw-r--r-- | src/byte_source.h | 43 | ||||
-rw-r--r-- | src/env.h | 4 | ||||
-rw-r--r-- | src/model.c | 3 | ||||
-rw-r--r-- | src/node.c | 6 | ||||
-rw-r--r-- | src/node.h | 21 | ||||
-rw-r--r-- | src/nodes.c | 4 | ||||
-rw-r--r-- | src/read_nquads.c | 6 | ||||
-rw-r--r-- | src/read_ntriples.c | 86 | ||||
-rw-r--r-- | src/read_trig.c | 10 | ||||
-rw-r--r-- | src/read_turtle.c | 107 | ||||
-rw-r--r-- | src/read_utf8.c | 8 | ||||
-rw-r--r-- | src/reader.c | 168 | ||||
-rw-r--r-- | src/reader.h | 75 | ||||
-rw-r--r-- | src/stack.h | 21 |
15 files changed, 361 insertions, 290 deletions
diff --git a/src/byte_source.c b/src/byte_source.c index 8959b1e6..0dcb7615 100644 --- a/src/byte_source.c +++ b/src/byte_source.c @@ -38,28 +38,36 @@ serd_byte_source_page(SerdByteSource* const source) return SERD_SUCCESS; } -static void +static SerdStatus serd_byte_source_init_buffer(ZixAllocator* const allocator, SerdByteSource* const source) { if (source->block_size > 1) { - source->block = (uint8_t*)zix_aligned_alloc( - allocator, SERD_PAGE_SIZE, source->block_size); + void* const block = + zix_aligned_alloc(allocator, SERD_PAGE_SIZE, source->block_size); - if ((source->read_buf = source->block)) { - memset(source->block, '\0', source->block_size); + if (!block) { + return SERD_BAD_ALLOC; } + + source->block = (uint8_t*)block; + source->read_buf = source->block; + memset(source->block, '\0', source->block_size); } else { source->read_buf = &source->read_byte; } + + return SERD_SUCCESS; } -SerdByteSource* -serd_byte_source_new_input(ZixAllocator* const allocator, - SerdInputStream* const input, - const SerdNode* const name, - const size_t block_size) +SerdStatus +serd_byte_source_init(ZixAllocator* const allocator, + SerdByteSource* const source, + SerdInputStream* const input, + const SerdNode* const name, + const size_t block_size) { + assert(source); assert(input); assert(block_size); assert(input->stream); @@ -69,70 +77,59 @@ serd_byte_source_new_input(ZixAllocator* const allocator, : serd_node_new(allocator, serd_a_string("input")); if (!source_name) { - return NULL; - } - - SerdByteSource* source = - (SerdByteSource*)zix_calloc(allocator, 1, sizeof(SerdByteSource)); - - if (!source) { - serd_node_free(allocator, source_name); - return NULL; + return SERD_BAD_ALLOC; } - source->name = source_name; source->in = input; - source->block_size = block_size; - source->buf_size = block_size; - source->caret.document = source->name; + source->read_buf = NULL; + source->read_head = 0U; + source->block_size = (uint32_t)block_size; + source->buf_size = (uint32_t)block_size; + source->caret.document = source_name; source->caret.line = 1U; source->caret.col = 1U; + source->name = source_name; + source->block = NULL; + source->read_byte = 0U; + source->prepared = false; - serd_byte_source_init_buffer(allocator, source); - if (block_size > 1 && !source->block) { + if (serd_byte_source_init_buffer(allocator, source)) { serd_node_free(allocator, source_name); - zix_free(allocator, source); - return NULL; + memset(source, 0, sizeof(SerdByteSource)); + return SERD_BAD_ALLOC; } - return source; + return SERD_SUCCESS; } void -serd_byte_source_free(ZixAllocator* const allocator, - SerdByteSource* const source) +serd_byte_source_destroy(ZixAllocator* const allocator, + SerdByteSource* const source) { - if (source) { - if (source->block_size > 1) { - zix_aligned_free(allocator, source->block); - } - - serd_node_free(allocator, source->name); - zix_free(allocator, source); + if (source->block_size > 1) { + zix_aligned_free(allocator, source->block); } + + serd_node_free(allocator, source->name); + memset(source, 0, sizeof(SerdByteSource)); } SerdStatus serd_byte_source_prepare(SerdByteSource* const source) { source->prepared = true; - - if (source->block_size > 1) { - return serd_byte_source_page(source); - } - - return serd_byte_source_advance(source); + return serd_byte_source_page(source); } SerdStatus serd_byte_source_skip_bom(SerdByteSource* const source) { if (serd_byte_source_peek(source) == 0xEF) { - if (serd_byte_source_advance(source) || + if (serd_byte_source_advance_past(source, 0xEF) || serd_byte_source_peek(source) != 0xBB || - serd_byte_source_advance(source) || + serd_byte_source_advance_past(source, 0xBB) || serd_byte_source_peek(source) != 0xBF || - serd_byte_source_advance(source)) { + serd_byte_source_advance_past(source, 0xBF)) { return SERD_BAD_SYNTAX; } } diff --git a/src/byte_source.h b/src/byte_source.h index 5ae40acb..9e65ef75 100644 --- a/src/byte_source.h +++ b/src/byte_source.h @@ -15,8 +15,8 @@ #include <assert.h> #include <stdbool.h> -#include <stddef.h> #include <stdint.h> +#include <stdio.h> typedef struct { SerdInputStream* in; ///< Input stream to read from @@ -32,14 +32,15 @@ typedef struct { bool eof; ///< True iff end of file reached } SerdByteSource; -SerdByteSource* -serd_byte_source_new_input(ZixAllocator* allocator, - SerdInputStream* input, - const SerdNode* name, - size_t block_size); +SerdStatus +serd_byte_source_init(ZixAllocator* allocator, + SerdByteSource* source, + SerdInputStream* input, + const SerdNode* name, + size_t block_size); void -serd_byte_source_free(ZixAllocator* allocator, SerdByteSource* source); +serd_byte_source_destroy(ZixAllocator* allocator, SerdByteSource* source); SerdStatus serd_byte_source_prepare(SerdByteSource* source); @@ -50,35 +51,37 @@ serd_byte_source_page(SerdByteSource* source); SerdStatus serd_byte_source_skip_bom(SerdByteSource* source); -ZIX_PURE_FUNC static inline uint8_t -serd_byte_source_peek(SerdByteSource* source) +ZIX_PURE_FUNC static inline int +serd_byte_source_peek(const SerdByteSource* const source) { assert(source->prepared); - return source->read_buf[source->read_head]; + + return source->eof ? EOF : (int)source->read_buf[source->read_head]; } static inline SerdStatus -serd_byte_source_advance(SerdByteSource* source) +serd_byte_source_advance_past(SerdByteSource* const source, const int current) { - SerdStatus st = SERD_SUCCESS; - const bool was_eof = source->eof; + /* Reading the buffer here can be an expensive cache miss, so we only assert + that the passed current character is correct in debug builds. In release + builds, this function only accesses the `source` structure, unless a page + read needs to happen. */ + + assert(current == serd_byte_source_peek(source)); - switch (serd_byte_source_peek(source)) { - case '\0': - break; - case '\n': + if (current == '\n') { ++source->caret.line; source->caret.col = 0; - break; - default: + } else { ++source->caret.col; } + SerdStatus st = SERD_SUCCESS; if (++source->read_head >= source->buf_size) { st = serd_byte_source_page(source); } - return (was_eof && source->eof) ? SERD_FAILURE : st; + return st; } #endif // SERD_SRC_BYTE_SOURCE_H @@ -21,8 +21,8 @@ serd_env_find_prefix(const SerdEnv* env, ZixStringView name); /** Expand `curie`. - Errors: SERD_BAD_ARG if `curie` is not valid, or SERD_BAD_CURIE if prefix is - not defined in `env`. + Errors: SERD_BAD_ARG if `curie` is not valid, or SERD_BAD_CURIE if prefix + is not defined in `env`. */ SerdStatus serd_env_expand_in_place(const SerdEnv* env, diff --git a/src/model.c b/src/model.c index fd1b0f92..fb0fd940 100644 --- a/src/model.c +++ b/src/model.c @@ -15,6 +15,7 @@ #include "serd/statement.h" #include "serd/status.h" #include "zix/allocator.h" +#include "zix/attributes.h" #include "zix/btree.h" #include "zix/status.h" @@ -386,7 +387,7 @@ simple_order(const SerdStatementOrder order) } /// Return the best index scanning strategy for a pattern with given nodes -static ScanStrategy +ZIX_PURE_FUNC static ScanStrategy serd_model_strategy(const SerdModel* const model, const bool with_s, const bool with_p, @@ -229,7 +229,7 @@ serd_node_set(ZixAllocator* const allocator, which must be normalized before being passed to a sink so comparison will work correctly. */ -void +static void serd_node_zero_pad(SerdNode* node) { char* buf = serd_node_buffer(node); @@ -274,6 +274,8 @@ serd_node_construct_simple(const size_t buf_size, } serd_node_zero_pad(node); + assert(total_size % sizeof(SerdNode) == 0); + return result(SERD_SUCCESS, total_size); } @@ -612,7 +614,7 @@ serd_node_new(ZixAllocator* const allocator, const SerdNodeArgs args) assert(r.count % sizeof(SerdNode) == 0); SerdNode* const node = - serd_node_malloc(allocator, sizeof(SerdNode) + r.count + 1); + serd_node_malloc(allocator, sizeof(SerdNode) + r.count); if (node) { r = serd_node_construct(r.count, node, args); @@ -25,14 +25,19 @@ static const size_t serd_node_align = 2 * sizeof(uint64_t); #if SIZE_MAX == UINT64_MAX +/** + Pad a node string length to the number of bytes it will occupy in a node. + + This returns a size that is at least one larger than `n_bytes` (to ensure + the string is null terminated), but possibly even larger (to align the node + size). +*/ static inline size_t serd_node_pad_length(const size_t n_bytes) { - const size_t align = sizeof(SerdNode); - - assert((align & (align - 1U)) == 0U); + assert((serd_node_align & (serd_node_align - 1U)) == 0U); - return (n_bytes + align + 2U) & ~(align - 1U); + return (n_bytes + serd_node_align) & ~(serd_node_align - 1U); } #else @@ -40,10 +45,7 @@ serd_node_pad_length(const size_t n_bytes) static inline size_t serd_node_pad_length(const size_t n_bytes) { - const size_t pad = sizeof(SerdNode) - (n_bytes + 2) % sizeof(SerdNode); - const size_t size = n_bytes + 2 + pad; - assert(size % sizeof(SerdNode) == 0); - return size; + return (n_bytes + sizeof(SerdNode)) / sizeof(SerdNode) * sizeof(SerdNode); } #endif @@ -101,7 +103,4 @@ serd_node_set(ZixAllocator* ZIX_NULLABLE allocator, ZIX_PURE_FUNC size_t serd_node_total_size(const SerdNode* ZIX_NONNULL node); -void -serd_node_zero_pad(SerdNode* ZIX_NONNULL node); - #endif // SERD_SRC_NODE_H diff --git a/src/nodes.c b/src/nodes.c index e353b9aa..dae67077 100644 --- a/src/nodes.c +++ b/src/nodes.c @@ -198,7 +198,7 @@ node_equals_spec(const SerdNode* const node, const NodeSpec* const spec) !strcmp(serd_node_string_i(serd_node_meta_c(node)), spec->meta.data)); } -static bool +ZIX_PURE_FUNC static bool nodes_meta_equal(const SerdNode* const a, const SerdNode* const b) { assert(a->flags & meta_mask); @@ -213,7 +213,7 @@ nodes_meta_equal(const SerdNode* const a, const SerdNode* const b) !memcmp(serd_node_string_i(am), serd_node_string_i(bm), am->length); } -static bool +ZIX_PURE_FUNC static bool nodes_equal(const SerdNode* const a, const SerdNode* const b) { return (a == b) || diff --git a/src/read_nquads.c b/src/read_nquads.c index 6f0120d1..44c29d0a 100644 --- a/src/read_nquads.c +++ b/src/read_nquads.c @@ -4,7 +4,6 @@ #include "read_nquads.h" #include "caret.h" -#include "node.h" #include "read_ntriples.h" #include "reader.h" #include "stack.h" @@ -44,7 +43,7 @@ read_nquads_statement(SerdReader* const reader) } // Preserve the caret for error reporting and read object - SerdCaret orig_caret = reader->source->caret; + SerdCaret orig_caret = reader->source.caret; if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) || (st = skip_horizontal_whitespace(reader))) { return st; @@ -52,7 +51,7 @@ read_nquads_statement(SerdReader* const reader) if (!ate_dot) { if (peek_byte(reader) == '.') { - eat_byte(reader); + skip_byte(reader, '.'); } else { TRY(st, read_graphLabel(reader, &ctx.graph)); skip_horizontal_whitespace(reader); @@ -60,7 +59,6 @@ read_nquads_statement(SerdReader* const reader) } } - serd_node_zero_pad(ctx.object); const SerdStatement statement = { {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret}; diff --git a/src/read_ntriples.c b/src/read_ntriples.c index 5c02abfe..e5101522 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -42,23 +42,29 @@ read_LANGTAG(SerdReader* const reader) return r_err(reader, SERD_BAD_SYNTAX, "expected A-Z or a-z"); } - SerdNode* node = push_node(reader, SERD_LITERAL, "", 0); + SerdNode* const node = push_node_head(reader, SERD_LITERAL); if (!node) { return SERD_BAD_STACK; } SerdStatus st = SERD_SUCCESS; - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); + TRY(st, skip_byte(reader, c)); + TRY(st, push_byte(reader, node, c)); while ((c = peek_byte(reader)) && is_alpha(c)) { TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); } while (peek_byte(reader) == '-') { TRY(st, push_byte(reader, node, eat_byte_safe(reader, '-'))); - while ((c = peek_byte(reader)) && (is_alpha(c) || is_digit(c))) { - TRY(st, push_byte(reader, node, eat_byte_safe(reader, c))); + + c = peek_byte(reader); + while (is_alpha(c) || is_digit(c)) { + TRY(st, push_byte(reader, node, c)); + TRY(st, skip_byte(reader, c)); + c = peek_byte(reader); } } - return SERD_SUCCESS; + + return push_node_tail(reader); } static bool @@ -71,13 +77,16 @@ is_EOL(const int c) SerdStatus read_EOL(SerdReader* const reader) { - if (!is_EOL(peek_byte(reader))) { + int c = peek_byte(reader); + + if (!is_EOL(c)) { return r_err(reader, SERD_BAD_SYNTAX, "expected a line ending"); } - while (is_EOL(peek_byte(reader))) { - eat_byte(reader); - } + do { + skip_byte(reader, c); + c = peek_byte(reader); + } while (is_EOL(c)); return SERD_SUCCESS; } @@ -176,7 +185,9 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) uint32_t code = 0U; while (st <= SERD_FAILURE) { - const int c = eat_byte(reader); + const int c = peek_byte(reader); + skip_byte(reader, c); + switch (c) { case ' ': case '"': @@ -239,9 +250,10 @@ static SerdStatus read_IRI(SerdReader* const reader, SerdNode** const dest) { SerdStatus st = SERD_SUCCESS; - TRY(st, eat_byte_check(reader, '<')); - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + TRY(st, skip_byte(reader, '<')); + + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -249,7 +261,8 @@ read_IRI(SerdReader* const reader, SerdNode** const dest) return r_err(reader, st, "expected IRI scheme"); } - return read_IRIREF_suffix(reader, *dest); + TRY(st, read_IRIREF_suffix(reader, *dest)); + return push_node_tail(reader); } SerdStatus @@ -287,7 +300,7 @@ read_STRING_LITERAL(SerdReader* const reader, case '\r': return r_err(reader, SERD_BAD_SYNTAX, "line end in short string"); case '\\': - skip_byte(reader, c); + TRY(st, skip_byte(reader, c)); TRY(st, read_string_escape(reader, ref)); break; default: @@ -330,7 +343,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, { SerdStatus st = SERD_SUCCESS; - skip_byte(reader, '_'); + TRY(st, skip_byte(reader, '_')); TRY(st, eat_byte_check(reader, ':')); int c = peek_byte(reader); @@ -339,8 +352,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, return r_err(reader, SERD_BAD_SYNTAX, "expected blank node label"); } - if (!(*dest = push_node( - reader, SERD_BLANK, reader->bprefix, reader->bprefix_len))) { + if (!(*dest = push_node_head(reader, SERD_BLANK))) { return SERD_BAD_STACK; } @@ -373,7 +385,7 @@ read_BLANK_NODE_LABEL(SerdReader* const reader, // Adjust ID to avoid clashes with generated IDs if necessary st = adjust_blank_id(reader, buf); - return tolerate_status(reader, st) ? SERD_SUCCESS : st; + return tolerate_status(reader, st) ? push_node_tail(reader) : st; } static unsigned @@ -592,19 +604,22 @@ read_VARNAME(SerdReader* const reader, SerdNode** const dest) SerdStatus read_Var(SerdReader* const reader, SerdNode** const dest) { + SerdStatus st = SERD_SUCCESS; + const int c = peek_byte(reader); + assert(c == '$' || c == '?'); + if (!(reader->flags & SERD_READ_VARIABLES)) { return r_err(reader, SERD_BAD_SYNTAX, "syntax does not support variables"); } - const int c = peek_byte(reader); - assert(c == '$' || c == '?'); - skip_byte(reader, c); - - if (!(*dest = push_node(reader, SERD_VARIABLE, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_VARIABLE))) { return SERD_BAD_STACK; } - return read_VARNAME(reader, dest); + TRY(st, skip_byte(reader, c)); + TRY(st, read_VARNAME(reader, dest)); + + return st ? st : push_node_tail(reader); } // Nonterminals @@ -613,14 +628,16 @@ read_Var(SerdReader* const reader, SerdNode** const dest) SerdStatus read_comment(SerdReader* const reader) { - skip_byte(reader, '#'); + SerdStatus st = SERD_SUCCESS; + + TRY(st, skip_byte(reader, '#')); for (int c = peek_byte(reader); c && c != '\n' && c != '\r' && c != EOF;) { - skip_byte(reader, c); + TRY(st, skip_byte(reader, c)); c = peek_byte(reader); } - return SERD_SUCCESS; + return st; } /// [6] literal @@ -629,22 +646,23 @@ read_literal(SerdReader* const reader, SerdNode** const dest) { SerdStatus st = SERD_SUCCESS; - if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } - skip_byte(reader, '"'); + TRY(st, skip_byte(reader, '"')); TRY(st, read_STRING_LITERAL(reader, *dest, '"')); + TRY(st, push_node_tail(reader)); SerdNode* datatype = NULL; switch (peek_byte(reader)) { case '@': - skip_byte(reader, '@'); + TRY(st, skip_byte(reader, '@')); TRY(st, read_LANGTAG(reader)); (*dest)->flags |= SERD_HAS_LANGUAGE; break; case '^': - skip_byte(reader, '^'); + TRY(st, skip_byte(reader, '^')); TRY(st, eat_byte_check(reader, '^')); TRY(st, read_IRI(reader, &datatype)); (*dest)->flags |= SERD_HAS_DATATYPE; @@ -724,7 +742,7 @@ read_triple(SerdReader* const reader) } // Preserve the caret for error reporting and read object - SerdCaret orig_caret = reader->source->caret; + SerdCaret orig_caret = reader->source.caret; if ((st = read_nt_object(reader, &ctx.object, &ate_dot)) || (st = skip_horizontal_whitespace(reader))) { return st; @@ -734,10 +752,6 @@ read_triple(SerdReader* const reader) return st; } - if (ctx.object) { - serd_node_zero_pad(ctx.object); - } - const SerdStatement statement = { {ctx.subject, ctx.predicate, ctx.object, ctx.graph}, &orig_caret}; diff --git a/src/read_trig.c b/src/read_trig.c index e3d7a7e9..76a693ec 100644 --- a/src/read_trig.c +++ b/src/read_trig.c @@ -83,16 +83,20 @@ read_sparql_directive(SerdReader* const reader, ReadContext* const ctx, const SerdNode* const token) { + SerdStatus st = SERD_SUCCESS; + if (!tokcmp(token, "base", 4)) { + TRY(st, push_node_tail(reader)); return read_turtle_base(reader, true, false); } if (!tokcmp(token, "prefix", 6)) { + TRY(st, push_node_tail(reader)); return read_turtle_prefixID(reader, true, false); } if (!tokcmp(token, "graph", 5)) { - SerdStatus st = SERD_SUCCESS; + TRY(st, push_node_tail(reader)); read_turtle_ws_star(reader); TRY(st, read_labelOrSubject(reader, &ctx->graph)); read_turtle_ws_star(reader); @@ -165,7 +169,7 @@ read_trig_statement(SerdReader* const reader) return SERD_FAILURE; case '\0': - eat_byte(reader); + skip_byte(reader, '\0'); return SERD_FAILURE; case '@': @@ -185,7 +189,7 @@ read_trig_statement(SerdReader* const reader) SerdStatus read_trigDoc(SerdReader* const reader) { - while (!reader->source->eof) { + while (!reader->source.eof) { const size_t orig_stack_size = reader->stack.size; const SerdStatus st = read_trig_statement(reader); diff --git a/src/read_turtle.c b/src/read_turtle.c index c6982327..a041e873 100644 --- a/src/read_turtle.c +++ b/src/read_turtle.c @@ -2,7 +2,8 @@ // SPDX-License-Identifier: ISC #include "read_turtle.h" -#include "byte_source.h" + +#include "caret.h" #include "env.h" #include "namespaces.h" #include "node.h" @@ -14,7 +15,6 @@ #include "try.h" #include "turtle.h" -#include "serd/caret.h" #include "serd/env.h" #include "serd/node.h" #include "serd/reader.h" @@ -49,12 +49,14 @@ read_predicateObjectList(SerdReader* reader, ReadContext ctx, bool* ate_dot); static SerdStatus read_whitespace(SerdReader* const reader) { - switch (peek_byte(reader)) { + const int c = peek_byte(reader); + + switch (c) { case '\t': case '\n': case '\r': case ' ': - return serd_byte_source_advance(reader->source); + return skip_byte(reader, c); case '#': return read_comment(reader); default: @@ -338,7 +340,7 @@ resolve_IRIREF(SerdReader* const reader, } // Push a new temporary node for constructing the resolved URI - SerdNode* const temp = push_node(reader, SERD_URI, "", 0); + SerdNode* const temp = push_node_head(reader, SERD_URI); if (!temp) { return SERD_BAD_STACK; } @@ -346,10 +348,12 @@ resolve_IRIREF(SerdReader* const reader, // Write resolved URI to the temporary node WriteNodeContext ctx = {reader, temp, SERD_SUCCESS}; temp->length = serd_write_uri(uri, write_to_stack, &ctx); + if (!ctx.status) { // Replace the destination with the new expanded node memmove(dest, temp, sizeof(SerdNode) + serd_node_pad_length(temp->length)); serd_stack_pop_to(&reader->stack, string_start_offset + dest->length); + TRY(ctx.status, push_node_tail(reader)); } return ctx.status; @@ -361,7 +365,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) SerdStatus st = SERD_SUCCESS; TRY(st, eat_byte_check(reader, '<')); - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -372,8 +376,10 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) return st; } + TRY(st, push_node_tail(reader)); + return (reader->flags & SERD_READ_RELATIVE) - ? SERD_SUCCESS + ? st : resolve_IRIREF(reader, *dest, string_start_offset); } @@ -411,7 +417,7 @@ read_PrefixedName(SerdReader* const reader, return st; } - return SERD_SUCCESS; + return push_node_tail(reader); } static SerdStatus @@ -439,12 +445,11 @@ read_number(SerdReader* const reader, #define XSD_DOUBLE NS_XSD "double" #define XSD_INTEGER NS_XSD "integer" - *dest = push_node(reader, SERD_LITERAL, "", 0); - SerdStatus st = SERD_SUCCESS; int c = peek_byte(reader); bool has_decimal = false; - if (!*dest) { + + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } @@ -467,8 +472,8 @@ read_number(SerdReader* const reader, skip_byte(reader, c); c = peek_byte(reader); if (!is_digit(c) && c != 'e' && c != 'E') { - *ate_dot = true; // Force caller to deal with stupid grammar - return SERD_SUCCESS; // Next byte is not a number character + *ate_dot = true; // Force caller to deal with stupid grammar + return push_node_tail(reader); // Next byte is not a number character } TRY(st, push_byte(reader, *dest, '.')); @@ -491,12 +496,13 @@ read_number(SerdReader* const reader, break; } TRY(st, read_0_9(reader, *dest, true)); + TRY(st, push_node_tail(reader)); meta = push_node(reader, SERD_URI, XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1); - (*dest)->flags |= SERD_HAS_DATATYPE; } else if (has_decimal) { + TRY(st, push_node_tail(reader)); meta = push_node(reader, SERD_URI, XSD_DECIMAL, sizeof(XSD_DECIMAL) - 1); - (*dest)->flags |= SERD_HAS_DATATYPE; } else { + TRY(st, push_node_tail(reader)); meta = push_node(reader, SERD_URI, XSD_INTEGER, sizeof(XSD_INTEGER) - 1); } @@ -513,11 +519,14 @@ read_turtle_iri(SerdReader* const reader, return read_IRIREF(reader, dest); } - if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } - return read_PrefixedName(reader, *dest, true, ate_dot, reader->stack.size); + const SerdStatus st = + read_PrefixedName(reader, *dest, true, ate_dot, reader->stack.size); + + return st; } static SerdStatus @@ -525,14 +534,14 @@ read_literal(SerdReader* const reader, SerdNode** const dest, bool* const ate_dot) { - if (!(*dest = push_node(reader, SERD_LITERAL, "", 0))) { + SerdStatus st = SERD_SUCCESS; + + if (!(*dest = push_node_head(reader, SERD_LITERAL))) { return SERD_BAD_STACK; } - SerdStatus st = read_String(reader, *dest); - if (st) { - return st; - } + TRY(st, read_String(reader, *dest)); + TRY(st, push_node_tail(reader)); SerdNode* datatype = NULL; switch (peek_byte(reader)) { @@ -546,9 +555,11 @@ read_literal(SerdReader* const reader, TRY(st, eat_byte_check(reader, '^')); (*dest)->flags |= SERD_HAS_DATATYPE; TRY(st, read_turtle_iri(reader, &datatype, ate_dot)); + assert(datatype == serd_node_meta_c(*dest)); break; } - return SERD_SUCCESS; + + return st; } static SerdStatus @@ -567,7 +578,7 @@ read_verb(SerdReader* reader, SerdNode** const dest) /* Either a qname, or "a". Read the prefix first, and if it is in fact "a", produce that instead. */ - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -593,7 +604,7 @@ read_verb(SerdReader* reader, SerdNode** const dest) reader, st > SERD_FAILURE ? st : SERD_BAD_SYNTAX, "expected verb"); } - return SERD_SUCCESS; + return SERD_SUCCESS; // push_node_tail(reader); } static SerdStatus @@ -670,7 +681,7 @@ read_named_object(SerdReader* const reader, Deal with this here by trying to read a prefixed node, then if it turns out to actually be "true" or "false", switch it to a boolean literal. */ - if (!(*dest = push_node(reader, SERD_URI, "", 0))) { + if (!(*dest = push_node_head(reader, SERD_URI))) { return SERD_BAD_STACK; } @@ -685,6 +696,7 @@ read_named_object(SerdReader* const reader, node_has_string(node, false_string))) { node->flags = SERD_HAS_DATATYPE; node->type = SERD_LITERAL; + TRY(st, push_node_tail(reader)); return push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN) ? SERD_SUCCESS : SERD_BAD_STACK; @@ -696,7 +708,7 @@ read_named_object(SerdReader* const reader, return r_err(reader, st, "expected prefixed name or boolean"); } - return SERD_SUCCESS; + return SERD_SUCCESS; // push_node_tail(reader); } // Read an object and emit statements, possibly recursively @@ -705,8 +717,8 @@ read_object(SerdReader* const reader, ReadContext* const ctx, bool* const ate_dot) { - const size_t orig_stack_size = reader->stack.size; - SerdCaret orig_caret = reader->source->caret; + const size_t orig_stack_size = reader->stack.size; + struct SerdCaretImpl orig_caret = reader->source.caret; assert(ctx->subject); @@ -809,7 +821,7 @@ read_predicateObjectList(SerdReader* const reader, int c = 0; do { read_turtle_ws_star(reader); - switch (c = peek_byte(reader)) { + switch ((c = peek_byte(reader))) { case EOF: serd_stack_pop_to(&reader->stack, orig_stack_size); return r_err(reader, SERD_BAD_SYNTAX, "unexpected end of file"); @@ -869,11 +881,9 @@ read_collection(SerdReader* const reader, /* The order of node allocation here is necessarily not in stack order, so we create two nodes and recycle them throughout. */ - SerdNode* n1 = - push_node_padded(reader, genid_length(reader), SERD_BLANK, "", 0); - + SerdNode* n1 = push_node_padding(reader, SERD_BLANK, genid_length(reader)); SerdNode* node = n1; - SerdNode* rest = 0; + SerdNode* rest = NULL; if (!n1) { return SERD_BAD_STACK; @@ -985,7 +995,6 @@ read_turtle_base(SerdReader* const reader, const bool sparql, const bool token) return SERD_BAD_STACK; } - serd_node_zero_pad(uri); TRY(st, serd_env_set_base_uri(reader->env, serd_node_string_view(uri))); TRY(st, serd_sink_write_base(reader->sink, uri)); @@ -1012,25 +1021,22 @@ read_turtle_prefixID(SerdReader* const reader, } read_turtle_ws_star(reader); - SerdNode* name = push_node(reader, SERD_LITERAL, "", 0); + + SerdNode* const name = push_node_head(reader, SERD_LITERAL); if (!name) { return SERD_BAD_STACK; } + // Read (possibly empty) name node TRY_LAX(st, read_PN_PREFIX(reader, name)); + TRY(st, push_node_tail(reader)); TRY(st, eat_byte_check(reader, ':')); read_turtle_ws_star(reader); + // Read URI node SerdNode* uri = NULL; TRY(st, read_IRIREF(reader, &uri)); - if (reader->stack.size + sizeof(SerdNode) > reader->stack.buf_size) { - return SERD_BAD_STACK; - } - - serd_node_zero_pad(name); - serd_node_zero_pad(uri); - TRY(st, serd_env_set_prefix( reader->env, serd_node_string_view(name), serd_node_string_view(uri))); @@ -1064,11 +1070,15 @@ read_turtle_directive(SerdReader* const reader) static SerdStatus read_sparql_directive(SerdReader* const reader, const SerdNode* const token) { + SerdStatus st = SERD_SUCCESS; + if (!tokcmp(token, "base", 4)) { + TRY(st, push_node_tail(reader)); return read_turtle_base(reader, true, false); } if (!tokcmp(token, "prefix", 6)) { + TRY(st, push_node_tail(reader)); return read_turtle_prefixID(reader, true, false); } @@ -1078,11 +1088,10 @@ read_sparql_directive(SerdReader* const reader, const SerdNode* const token) static SerdStatus read_block(SerdReader* const reader, ReadContext* const ctx) { - SerdStatus st = SERD_SUCCESS; - // Try to read a subject, though it may actually be a directive or graph name - SerdNode* token = NULL; - int s_type = 0; + SerdNode* token = NULL; + SerdStatus st = SERD_SUCCESS; + int s_type = 0; TRY_LAX(st, read_turtle_subject(reader, *ctx, &token, &s_type)); // Try to interpret as a SPARQL "PREFIX" or "BASE" directive @@ -1121,7 +1130,7 @@ read_turtle_statement(SerdReader* const reader) return SERD_FAILURE; case '\0': - eat_byte(reader); + skip_byte(reader, '\0'); return SERD_FAILURE; case '@': @@ -1138,7 +1147,7 @@ read_turtle_statement(SerdReader* const reader) SerdStatus read_turtleDoc(SerdReader* const reader) { - while (!reader->source->eof) { + while (!reader->source.eof) { const size_t orig_stack_size = reader->stack.size; const SerdStatus st = read_turtle_statement(reader); diff --git a/src/read_utf8.c b/src/read_utf8.c index 4639c34e..5b1f737c 100644 --- a/src/read_utf8.c +++ b/src/read_utf8.c @@ -70,15 +70,15 @@ read_utf8_code_point(SerdReader* const reader, uint32_t* const code, const uint8_t lead) { - uint8_t size = 0U; - uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U}; + SerdStatus st = SERD_SUCCESS; + uint8_t size = 0U; + uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U}; *code = 0U; skip_byte(reader, lead); - SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead); - if (st) { + if ((st = read_utf8_continuation_bytes(reader, bytes, &size, lead))) { return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3); } diff --git a/src/reader.c b/src/reader.c index cfeaf0c0..10d298ea 100644 --- a/src/reader.c +++ b/src/reader.c @@ -34,7 +34,7 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) va_start(args, fmt); serd_vlogf_at( - reader->world, SERD_LOG_LEVEL_ERROR, &reader->source->caret, fmt, args); + reader->world, SERD_LOG_LEVEL_ERROR, &reader->source.caret, fmt, args); va_end(args); return st; @@ -43,8 +43,10 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) SerdStatus skip_horizontal_whitespace(SerdReader* const reader) { - while (peek_byte(reader) == '\t' || peek_byte(reader) == ' ') { - eat_byte(reader); + int c = peek_byte(reader); + while (c == '\t' || c == ' ') { + skip_byte(reader, c); + c = peek_byte(reader); } return SERD_SUCCESS; @@ -105,47 +107,94 @@ tolerate_status(const SerdReader* const reader, const SerdStatus status) SerdNode* blank_id(SerdReader* const reader) { - SerdNode* const ref = - push_node_padded(reader, genid_length(reader), SERD_BLANK, "", 0); + const size_t length = genid_length(reader); + SerdNode* const ref = push_node_padding(reader, SERD_BLANK, length); if (ref) { - set_blank_id(reader, ref, genid_length(reader) + 1); + set_blank_id(reader, ref, length + 1U); } return ref; } -SerdNode* -push_node_padded(SerdReader* const reader, - const size_t max_length, - const SerdNodeType type, - const char* const str, - const size_t length) +static SerdNode* +push_node_start(SerdReader* const reader, + const SerdNodeType type, + const size_t body_size) { - // Push a null byte to ensure the previous node was null terminated - char* terminator = (char*)serd_stack_push(&reader->stack, 1); - if (!terminator) { - return NULL; + /* The top of the stack should already be aligned, because the previous node + must be terminated before starting a new one. This is statically + assumed/enforced here to ensure that it's done earlier, usually right + after writing the node body. That way is less error-prone, because nodes + are terminated earlier which reduces the risk of accidentally using a + non-terminated node. It's also faster, for two reasons: + + - Nodes, including termination, are written to the stack in a single + sweep, as "tightly" as possible (avoiding the need to re-load that + section of the stack into the cache for writing). + + - Pushing a new node header (this function) doesn't need to do any + alignment calculations. + */ + + assert(!(reader->stack.size % sizeof(SerdNode))); + + const size_t size = sizeof(SerdNode) + body_size; + SerdNode* const node = (SerdNode*)serd_stack_push(&reader->stack, size); + + if (node) { + node->length = 0U; + node->flags = 0U; + node->type = type; } - *terminator = 0; - void* mem = serd_stack_push_aligned( - &reader->stack, sizeof(SerdNode) + max_length + 1, sizeof(SerdNode)); + return node; +} - if (!mem) { - return NULL; +/// Push a null byte to ensure the previous node was null terminated +static char* +push_node_end(SerdReader* const reader) +{ + char* const terminator = (char*)serd_stack_push(&reader->stack, 1U); + + if (terminator) { + *terminator = 0; } - SerdNode* const node = (SerdNode*)mem; + return terminator; +} - node->length = length; - node->flags = 0; - node->type = type; +SerdNode* +push_node_head(SerdReader* const reader, const SerdNodeType type) +{ + return push_node_start(reader, type, 0U); +} - char* buf = (char*)(node + 1); - memcpy(buf, str, length + 1); +SerdStatus +push_node_tail(SerdReader* const reader) +{ + if (!push_node_end(reader) || + !serd_stack_push_pad(&reader->stack, sizeof(SerdNode))) { + return SERD_BAD_STACK; + } - return node; + assert(!(reader->stack.size % sizeof(SerdNode))); + return SERD_SUCCESS; +} + +SerdNode* +push_node_padding(SerdReader* const reader, + const SerdNodeType type, + const size_t max_length) +{ + SerdNode* const node = push_node_start(reader, type, max_length); + if (!node) { + return NULL; + } + + memset(serd_node_buffer(node), 0, max_length); + + return !push_node_tail(reader) ? node : NULL; } SerdNode* @@ -154,7 +203,15 @@ push_node(SerdReader* const reader, const char* const str, const size_t length) { - return push_node_padded(reader, length, type, str, length); + SerdNode* const node = push_node_start(reader, type, length); + if (!node) { + return NULL; + } + + node->length = length; + memcpy(serd_node_buffer(node), str, length); + + return !push_node_tail(reader) ? node : NULL; } int @@ -175,10 +232,6 @@ emit_statement_at(SerdReader* const reader, return SERD_BAD_STACK; } - /* Zero the pad of the object node on the top of the stack. Lower nodes - (subject and predicate) were already zeroed by subsequent pushes. */ - serd_node_zero_pad(o); - const SerdStatement statement = {{ctx.subject, ctx.predicate, o, ctx.graph}, caret}; @@ -194,7 +247,7 @@ emit_statement(SerdReader* const reader, const ReadContext ctx, SerdNode* const o) { - return emit_statement_at(reader, ctx, o, &reader->source->caret); + return emit_statement_at(reader, ctx, o, &reader->source.caret); } SerdStatus @@ -202,7 +255,7 @@ serd_reader_read_document(SerdReader* const reader) { assert(reader); - if (!reader->source) { + if (!reader->source.read_buf) { return SERD_BAD_CALL; } @@ -213,7 +266,7 @@ serd_reader_read_document(SerdReader* const reader) ++reader->world->next_document_id); } - if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source->prepared) { + if (reader->syntax != SERD_SYNTAX_EMPTY && !reader->source.prepared) { SerdStatus st = serd_reader_prepare(reader); if (st) { return st; @@ -301,7 +354,7 @@ serd_reader_free(SerdReader* const reader) return; } - if (reader->source) { + if (reader->source.in) { serd_reader_finish(reader); } @@ -318,30 +371,28 @@ serd_reader_start(SerdReader* const reader, assert(reader); assert(input); - if (!block_size || !input->stream) { + if (!block_size || block_size > UINT32_MAX || !input->stream) { return SERD_BAD_ARG; } - if (reader->source) { + if (reader->source.in) { return SERD_BAD_CALL; } - reader->source = serd_byte_source_new_input( - reader->world->allocator, input, input_name, block_size); - - return reader->source ? SERD_SUCCESS : SERD_BAD_ALLOC; + return serd_byte_source_init( + reader->world->allocator, &reader->source, input, input_name, block_size); } static SerdStatus serd_reader_prepare(SerdReader* const reader) { - SerdStatus st = serd_byte_source_prepare(reader->source); + SerdStatus st = serd_byte_source_prepare(&reader->source); if (st == SERD_SUCCESS) { - if ((st = serd_byte_source_skip_bom(reader->source))) { + if ((st = serd_byte_source_skip_bom(&reader->source))) { r_err(reader, SERD_BAD_SYNTAX, "corrupt byte order mark"); } } else if (st == SERD_FAILURE) { - reader->source->eof = true; + reader->source.eof = true; } return st; } @@ -351,22 +402,11 @@ serd_reader_read_chunk(SerdReader* const reader) { assert(reader); - SerdStatus st = SERD_SUCCESS; - if (!reader->source) { - return SERD_BAD_CALL; - } - - if (!reader->source->prepared) { - st = serd_reader_prepare(reader); - } else if (reader->source->eof) { - st = serd_byte_source_advance(reader->source); - } - - if (peek_byte(reader) == 0) { - // Skip leading null byte, for reading from a null-delimited socket - serd_byte_source_advance(reader->source); - return SERD_FAILURE; - } + const SerdStatus st = + (!reader->source.in) ? SERD_BAD_CALL + : (!reader->source.prepared) ? serd_reader_prepare(reader) + : (reader->source.eof) ? serd_byte_source_page(&reader->source) + : SERD_SUCCESS; if (st) { return st; @@ -392,7 +432,7 @@ serd_reader_finish(SerdReader* const reader) { assert(reader); - serd_byte_source_free(reader->world->allocator, reader->source); - reader->source = NULL; + serd_byte_source_destroy(reader->world->allocator, &reader->source); + return SERD_SUCCESS; } diff --git a/src/reader.h b/src/reader.h index 9ec2e4ac..837bf969 100644 --- a/src/reader.h +++ b/src/reader.h @@ -25,6 +25,7 @@ #include <stdbool.h> #include <stdint.h> #include <stdio.h> +#include <string.h> typedef struct { SerdNode* graph; @@ -37,13 +38,13 @@ typedef struct { struct SerdReaderImpl { SerdWorld* world; const SerdSink* sink; + SerdByteSource source; + SerdStack stack; SerdNode* rdf_first; SerdNode* rdf_rest; SerdNode* rdf_nil; SerdNode* rdf_type; - SerdByteSource* source; SerdEnv* env; - SerdStack stack; SerdSyntax syntax; SerdReaderFlags flags; unsigned next_id; @@ -60,13 +61,34 @@ SERD_LOG_FUNC(3, 4) SerdStatus r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); +/** + Push the SerdNode header of a node with zero flags and length. + + If this is called, push_node_tail() must eventually be called before + starting a new node. +*/ SerdNode* -push_node_padded(SerdReader* reader, - size_t max_length, - SerdNodeType type, - const char* str, - size_t length); +push_node_head(SerdReader* reader, SerdNodeType type); + +/** + Push the end of a node, a null terminator and any necessary padding. + + This must be called to close the scope opened with push_node_head(). +*/ +SerdStatus +push_node_tail(SerdReader* reader); +/** + Push a node with reserved space for a body. + + The body is initially all zero, as are the node's length and flags. +*/ +SerdNode* +push_node_padding(SerdReader* reader, SerdNodeType type, size_t max_length); + +/** + Push a complete node with a given string body. +*/ SerdNode* push_node(SerdReader* reader, SerdNodeType type, @@ -98,11 +120,9 @@ SerdStatus emit_statement(SerdReader* reader, ReadContext ctx, SerdNode* o); static inline int -peek_byte(SerdReader* reader) +peek_byte(const SerdReader* const reader) { - SerdByteSource* source = reader->source; - - return source->eof ? EOF : (int)source->read_buf[source->read_head]; + return serd_byte_source_peek(&reader->source); } static inline SerdStatus @@ -112,19 +132,7 @@ skip_byte(SerdReader* reader, const int byte) assert(peek_byte(reader) == byte); - return serd_byte_source_advance(reader->source); -} - -static inline int -eat_byte(SerdReader* const reader) -{ - const int c = peek_byte(reader); - - if (c != EOF) { - serd_byte_source_advance(reader->source); - } - - return c; + return serd_byte_source_advance_past(&reader->source, byte); } static inline int SERD_NODISCARD @@ -134,7 +142,7 @@ eat_byte_safe(SerdReader* reader, const int byte) assert(peek_byte(reader) == byte); - serd_byte_source_advance(reader->source); + serd_byte_source_advance_past(&reader->source, byte); return byte; } @@ -167,32 +175,27 @@ push_byte(SerdReader* reader, SerdNode* node, const int c) { assert(c != EOF); - if (reader->stack.size + 1 > reader->stack.buf_size) { + const size_t old_size = reader->stack.size; + if (old_size >= reader->stack.buf_size) { return SERD_BAD_STACK; } - ((uint8_t*)reader->stack.buf)[reader->stack.size - 1] = (uint8_t)c; ++reader->stack.size; ++node->length; + reader->stack.buf[old_size] = (char)c; + return SERD_SUCCESS; } static inline SerdStatus -push_bytes(SerdReader* const reader, - SerdNode* const node, - const uint8_t* const bytes, - const size_t len) +push_bytes(SerdReader* reader, SerdNode* node, const uint8_t* bytes, size_t len) { if (reader->stack.buf_size < reader->stack.size + len) { return SERD_BAD_STACK; } - const size_t begin = reader->stack.size - 1U; - for (unsigned i = 0U; i < len; ++i) { - reader->stack.buf[begin + i] = (char)bytes[i]; - } - + memcpy(reader->stack.buf + reader->stack.size, bytes, len); reader->stack.size += len; node->length += len; return SERD_SUCCESS; diff --git a/src/stack.h b/src/stack.h index 94e091a1..c11970c1 100644 --- a/src/stack.h +++ b/src/stack.h @@ -47,14 +47,14 @@ serd_stack_free(ZixAllocator* const allocator, SerdStack* stack) static inline void* serd_stack_push(SerdStack* stack, size_t n_bytes) { - const size_t new_size = stack->size + n_bytes; + const size_t old_size = stack->size; + const size_t new_size = old_size + n_bytes; if (stack->buf_size < new_size) { return NULL; } - char* const ret = (stack->buf + stack->size); - stack->size = new_size; - return ret; + stack->size = new_size; + return stack->buf + old_size; } static inline void @@ -73,20 +73,21 @@ serd_stack_pop_to(SerdStack* stack, size_t n_bytes) } static inline void* -serd_stack_push_aligned(SerdStack* stack, size_t n_bytes, size_t align) +serd_stack_push_pad(SerdStack* stack, size_t align) { // Push padding if necessary - const size_t pad = align - stack->size % align; - if (pad > 0) { - void* padding = serd_stack_push(stack, pad); + const size_t leftovers = stack->size % align; + if (leftovers) { + const size_t pad = align - leftovers; + void* const padding = serd_stack_push(stack, pad); if (!padding) { return NULL; } memset(padding, 0, pad); + return padding; } - // Push requested space at aligned location - return serd_stack_push(stack, n_bytes); + return stack->buf + stack->size; } #endif // SERD_SRC_STACK_H |