diff options
author | David Robillard <d@drobilla.net> | 2021-04-15 17:52:44 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 8c67f9eba47d30913749e607c440b170a5cbd804 (patch) | |
tree | 01d9750c6e646c76519e3a00bb200d6312e65ffc | |
parent | 7ffa2c0488fcd96c3c12713e5650633eb03e91f7 (diff) | |
download | serd-8c67f9eba47d30913749e607c440b170a5cbd804.tar.gz serd-8c67f9eba47d30913749e607c440b170a5cbd804.tar.bz2 serd-8c67f9eba47d30913749e607c440b170a5cbd804.zip |
[WIP] Expand URIs in reader
This expands relative and prefixed URIs in the reader on the stack, rather than
passing them to the caller to be dealt with. This pushes these context-dependent
forms to the edge of the system as much as possible to minimise the headaches
they can cause.
This is a step towards having stricter guarantees about nodes and eliminating
the CURIE node type altogether.
-rw-r--r-- | include/serd/reader.h | 2 | ||||
-rw-r--r-- | src/env.c | 15 | ||||
-rw-r--r-- | src/env.h | 8 | ||||
-rw-r--r-- | src/node.c | 2 | ||||
-rw-r--r-- | src/node.h | 3 | ||||
-rw-r--r-- | src/read_turtle.c | 119 | ||||
-rw-r--r-- | src/reader.h | 24 | ||||
-rw-r--r-- | test/extra/bad/bad-prefix-dot.ttl | 1 | ||||
-rw-r--r-- | test/extra/bad/manifest.ttl | 8 | ||||
-rw-r--r-- | test/extra/full/full-uris.ttl | 6 | ||||
-rw-r--r-- | test/extra/good/manifest.ttl | 14 | ||||
-rw-r--r-- | test/extra/good/test-local-name-ends-with-dot.nt | 1 | ||||
-rw-r--r-- | test/extra/good/test-local-name-escapes.nt | 17 | ||||
-rw-r--r-- | test/extra/good/test-local-name-escapes.ttl | 19 | ||||
-rw-r--r-- | test/meson.build | 1 | ||||
-rw-r--r-- | test/test_overflow.c | 44 | ||||
-rw-r--r-- | test/test_reader_writer.c | 2 | ||||
-rw-r--r-- | test/test_writer.c | 31 |
18 files changed, 282 insertions, 35 deletions
diff --git a/include/serd/reader.h b/include/serd/reader.h index 658193fb..b6b9cac3 100644 --- a/include/serd/reader.h +++ b/include/serd/reader.h @@ -32,6 +32,8 @@ typedef struct SerdReaderImpl SerdReader; typedef enum { SERD_READ_LAX = 1U << 0U, ///< Tolerate invalid input where possible SERD_READ_VARIABLES = 1U << 1U, ///< Support variable nodes + SERD_READ_PREFIXED = 1U << 2U, ///< Do not expand prefixed names + SERD_READ_RELATIVE = 1U << 3U, ///< Do not expand relative URI references } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values @@ -139,6 +139,21 @@ serd_env_set_base_uri(SerdEnv* const env, const SerdStringView uri) return SERD_SUCCESS; } +SerdStringView +serd_env_find_prefix(const SerdEnv* const env, const SerdStringView name) +{ + for (size_t i = 0; i < env->n_prefixes; ++i) { + const SerdNode* const prefix_name = env->prefixes[i].name; + if (prefix_name->length == name.length) { + if (!memcmp(serd_node_string(prefix_name), name.data, name.length)) { + return serd_node_string_view(env->prefixes[i].uri); + } + } + } + + return serd_empty_string(); +} + ZIX_PURE_FUNC static SerdPrefix* serd_env_find(const SerdEnv* const env, const char* const name, @@ -21,6 +21,14 @@ serd_env_qualify_in_place(const SerdEnv* env, SerdStringView* suffix); /** + Return the URI for the prefix with the given name. + + If no such prefix is known, returns an empty string view. +*/ +ZIX_PURE_FUNC SerdStringView +serd_env_find_prefix(const SerdEnv* env, SerdStringView name); + +/** Expand `curie`. Errors: SERD_BAD_ARG if `curie` is not valid, or SERD_BAD_CURIE if prefix is @@ -104,7 +104,7 @@ serd_node_check_padding(const SerdNode* node) #endif } -static ZIX_PURE_FUNC size_t +size_t serd_node_total_size(const SerdNode* const node) { return node ? 
(sizeof(SerdNode) + serd_node_pad_length(node->length) + @@ -54,6 +54,9 @@ void serd_node_set(SerdNode* ZIX_NONNULL* ZIX_NONNULL dst, const SerdNode* ZIX_NONNULL src); +ZIX_PURE_FUNC size_t +serd_node_total_size(const SerdNode* ZIX_NULLABLE node); + void serd_node_zero_pad(SerdNode* ZIX_NONNULL node); diff --git a/src/read_turtle.c b/src/read_turtle.c index 8d9ec78a..fa7b9731 100644 --- a/src/read_turtle.c +++ b/src/read_turtle.c @@ -3,6 +3,7 @@ #include "read_turtle.h" #include "byte_source.h" +#include "env.h" #include "namespaces.h" #include "node.h" #include "ntriples.h" @@ -21,6 +22,8 @@ #include "serd/statement.h" #include "serd/status.h" #include "serd/string_view.h" +#include "serd/uri.h" +#include "zix/attributes.h" #include <assert.h> #include <stdbool.h> @@ -292,6 +295,66 @@ read_PN_PREFIX(SerdReader* const reader, SerdNode* const dest) return st ? st : read_PN_PREFIX_tail(reader, dest); } +typedef struct { + SerdReader* reader; + SerdNode* node; + SerdStatus status; +} WriteNodeContext; + +static size_t +write_to_stack(const void* const ZIX_NONNULL buf, + const size_t size, + const size_t nmemb, + void* const ZIX_NONNULL stream) +{ + WriteNodeContext* const ctx = (WriteNodeContext*)stream; + const uint8_t* const utf8 = (const uint8_t*)buf; + + ctx->status = push_bytes(ctx->reader, ctx->node, utf8, nmemb * size); + + return nmemb; +} + +static SerdStatus +resolve_IRIREF(SerdReader* const reader, + SerdNode* const dest, + const size_t string_start_offset) +{ + // If the URI is already absolute, we don't need to do anything + if (serd_uri_string_has_scheme(serd_node_string(dest))) { + return SERD_SUCCESS; + } + + // Parse the URI reference so we can resolve it + SerdURIView uri = serd_parse_uri(serd_node_string(dest)); + + // Resolve relative URI reference to a full URI + uri = serd_resolve_uri(uri, serd_env_base_uri_view(reader->env)); + if (!uri.scheme.length) { + return r_err(reader, + SERD_BAD_SYNTAX, + "failed to resolve relative URI reference <%s>", 
+ serd_node_string(dest)); + } + + // Push a new temporary node for constructing the resolved URI + SerdNode* const temp = push_node(reader, SERD_URI, "", 0); + if (!temp) { + return SERD_BAD_STACK; + } + + // Write resolved URI to the temporary node + WriteNodeContext ctx = {reader, temp, SERD_SUCCESS}; + temp->length = serd_write_uri(uri, write_to_stack, &ctx); + if (!ctx.status) { + // Replace the destination with the new expanded node + memmove(dest, temp, serd_node_total_size(temp)); + serd_stack_pop_to(&reader->stack, string_start_offset + dest->length); + } + + return ctx.status; +} + static SerdStatus read_IRIREF(SerdReader* const reader, SerdNode** const dest) { @@ -302,14 +365,24 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) return SERD_BAD_STACK; } - return read_IRIREF_suffix(reader, *dest); + const size_t string_start_offset = reader->stack.size; + + st = read_IRIREF_suffix(reader, *dest); + if (!tolerate_status(reader, st)) { + return st; + } + + return (reader->flags & SERD_READ_RELATIVE) + ? 
SERD_SUCCESS + : resolve_IRIREF(reader, *dest, string_start_offset); } static SerdStatus read_PrefixedName(SerdReader* const reader, SerdNode* const dest, const bool read_prefix, - bool* const ate_dot) + bool* const ate_dot, + const size_t string_start_offset) { SerdStatus st = SERD_SUCCESS; if (read_prefix) { @@ -320,8 +393,24 @@ read_PrefixedName(SerdReader* const reader, return SERD_FAILURE; } - TRY(st, push_byte(reader, dest, eat_byte_safe(reader, ':'))); - TRY_FAILING(st, read_PN_LOCAL(reader, dest, ate_dot)); + skip_byte(reader, ':'); + + // Search environment for the prefix URI + const SerdStringView prefix = serd_node_string_view(dest); + const SerdStringView prefix_uri = serd_env_find_prefix(reader->env, prefix); + if (!prefix_uri.length) { + return r_err(reader, st, "unknown prefix \"%s\"", prefix.data); + } + + // Pop back to the start of the string + serd_stack_pop_to(&reader->stack, string_start_offset); + dest->length = 0U; + dest->type = SERD_URI; + push_bytes(reader, dest, (const uint8_t*)prefix_uri.data, prefix_uri.length); + if ((st = read_PN_LOCAL(reader, dest, ate_dot)) > SERD_FAILURE) { + return st; + } + return SERD_SUCCESS; } @@ -420,14 +509,15 @@ read_turtle_iri(SerdReader* const reader, SerdNode** const dest, bool* const ate_dot) { - switch (peek_byte(reader)) { - case '<': + if (peek_byte(reader) == '<') { return read_IRIREF(reader, dest); - default: - *dest = push_node(reader, SERD_CURIE, "", 0); - return *dest ? 
read_PrefixedName(reader, *dest, true, ate_dot) - : SERD_BAD_STACK; } + + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_BAD_STACK; + } + + return read_PrefixedName(reader, *dest, true, ate_dot, reader->stack.size); } static SerdStatus @@ -481,7 +571,8 @@ read_verb(SerdReader* reader, SerdNode** const dest) return SERD_BAD_STACK; } - SerdStatus st = SERD_SUCCESS; + const size_t string_start_offset = reader->stack.size; + SerdStatus st = SERD_SUCCESS; TRY_LAX(st, read_PN_PREFIX(reader, *dest)); bool ate_dot = false; @@ -495,7 +586,9 @@ read_verb(SerdReader* reader, SerdNode** const dest) : SERD_BAD_STACK); } - if ((st = read_PrefixedName(reader, *dest, false, &ate_dot)) || ate_dot) { + if ((st = read_PrefixedName( + reader, *dest, false, &ate_dot, string_start_offset)) || + ate_dot) { *dest = NULL; return r_err( reader, st > SERD_FAILURE ? st : SERD_BAD_SYNTAX, "expected verb"); @@ -586,7 +679,7 @@ read_named_object(SerdReader* const reader, SerdStatus st = SERD_SUCCESS; // Attempt to read a prefixed name - st = read_PrefixedName(reader, node, true, ate_dot); + st = read_PrefixedName(reader, node, true, ate_dot, reader->stack.size); // Check if this is actually a special boolean node if (st == SERD_FAILURE && (node_has_string(node, true_string) || diff --git a/src/reader.h b/src/reader.h index 302f8c6f..a98d5ef8 100644 --- a/src/reader.h +++ b/src/reader.h @@ -178,19 +178,23 @@ push_byte(SerdReader* reader, SerdNode* node, const int c) } static inline SerdStatus -push_bytes(SerdReader* reader, - SerdNode* ref, - const uint8_t* bytes, - unsigned len) +push_bytes(SerdReader* const reader, + SerdNode* const node, + const uint8_t* const bytes, + const size_t len) { - const bool has_space = reader->stack.buf_size >= reader->stack.size + len; - if (has_space) { - for (unsigned i = 0; i < len; ++i) { - push_byte(reader, ref, bytes[i]); - } + if (reader->stack.buf_size < reader->stack.size + len) { + return SERD_BAD_STACK; + } + + const size_t begin = 
reader->stack.size - 1U; + for (unsigned i = 0U; i < len; ++i) { + reader->stack.buf[begin + i] = (char)bytes[i]; } - return has_space ? SERD_SUCCESS : SERD_BAD_STACK; + reader->stack.size += len; + node->length += len; + return SERD_SUCCESS; } #endif // SERD_SRC_READER_H diff --git a/test/extra/bad/bad-prefix-dot.ttl b/test/extra/bad/bad-prefix-dot.ttl new file mode 100644 index 00000000..7b02211f --- /dev/null +++ b/test/extra/bad/bad-prefix-dot.ttl @@ -0,0 +1 @@ +@prefix dotted.: <http://example.org/> . diff --git a/test/extra/bad/manifest.ttl b/test/extra/bad/manifest.ttl index cd4aee24..f930b812 100644 --- a/test/extra/bad/manifest.ttl +++ b/test/extra/bad/manifest.ttl @@ -74,8 +74,9 @@ <#bad-object2> <#bad-paths> <#bad-pn-escape> - <#bad-prefix-missing-colon> <#bad-prefix> + <#bad-prefix-dot> + <#bad-prefix-missing-colon> <#bad-quote-in-uri> <#bad-semicolon-after-subject> <#bad-string> @@ -436,6 +437,11 @@ mf:action <bad-prefix-missing-colon.ttl> ; mf:name "bad-prefix-missing-colon" . +<#bad-prefix-dot> + a rdft:TestTurtleNegativeSyntax ; + mf:action <bad-prefix-dot.ttl> ; + mf:name "bad-prefix-dot" . + <#bad-quote-in-uri> a rdft:TestTurtleNegativeSyntax ; mf:action <bad-quote-in-uri.ttl> ; diff --git a/test/extra/full/full-uris.ttl b/test/extra/full/full-uris.ttl index cfb048df..dd6c5144 100644 --- a/test/extra/full/full-uris.ttl +++ b/test/extra/full/full-uris.ttl @@ -1,7 +1,7 @@ @prefix eg: <http://example.org/> . <http://example.org/s1> - eg:prefixed false . + <http://example.org/prefixed> false . -eg:s2 - eg:prefixed true . +<http://example.org/s2> + <http://example.org/prefixed> true . 
diff --git a/test/extra/good/manifest.ttl b/test/extra/good/manifest.ttl index 7c356285..11a7ec5e 100644 --- a/test/extra/good/manifest.ttl +++ b/test/extra/good/manifest.ttl @@ -27,6 +27,8 @@ <#test-id> <#test-list-in-blank> <#test-list-subject> + <#test-local-name-ends-with-dot> + <#test-local-name-escapes> <#test-long-backspace-escape> <#test-long-delete-escape> <#test-long-form-feed-escape> @@ -169,6 +171,18 @@ mf:name "test-list-subject" ; mf:result <test-list-subject.nt> . +<#test-local-name-ends-with-dot> + a rdft:TestTurtleEval ; + mf:action <test-local-name-ends-with-dot.ttl> ; + mf:name "test-local-name-ends-with-dot" ; + mf:result <test-local-name-ends-with-dot.nt> . + +<#test-local-name-escapes> + a rdft:TestTurtleEval ; + mf:action <test-local-name-escapes.ttl> ; + mf:name "test-local-name-escapes" ; + mf:result <test-local-name-escapes.nt> . + <#test-long-backspace-escape> a rdft:TestTurtleEval ; mf:action <test-long-backspace-escape.ttl> ; diff --git a/test/extra/good/test-local-name-ends-with-dot.nt b/test/extra/good/test-local-name-ends-with-dot.nt new file mode 100644 index 00000000..3285348a --- /dev/null +++ b/test/extra/good/test-local-name-ends-with-dot.nt @@ -0,0 +1 @@ +<http://example.org/eg#s> <http://example.org/eg#p> <http://example.org/eg#foo.> . diff --git a/test/extra/good/test-local-name-escapes.nt b/test/extra/good/test-local-name-escapes.nt new file mode 100644 index 00000000..a6362d7a --- /dev/null +++ b/test/extra/good/test-local-name-escapes.nt @@ -0,0 +1,17 @@ +<http://example.org/s> <http://example.org/p> <http://example.org/o'> . +<http://example.org/s> <http://example.org/p> <http://example.org/o!> . +<http://example.org/s> <http://example.org/p> <http://example.org/o#> . +<http://example.org/s> <http://example.org/p> <http://example.org/o$> . +<http://example.org/s> <http://example.org/p> <http://example.org/o%> . +<http://example.org/s> <http://example.org/p> <http://example.org/o&> . 
+<http://example.org/s> <http://example.org/p> <http://example.org/o(> . +<http://example.org/s> <http://example.org/p> <http://example.org/o)> . +<http://example.org/s> <http://example.org/p> <http://example.org/o*> . +<http://example.org/s> <http://example.org/p> <http://example.org/o+> . +<http://example.org/s> <http://example.org/p> <http://example.org/o,> . +<http://example.org/s> <http://example.org/p> <http://example.org/o/> . +<http://example.org/s> <http://example.org/p> <http://example.org/o;> . +<http://example.org/s> <http://example.org/p> <http://example.org/o=> . +<http://example.org/s> <http://example.org/p> <http://example.org/o?> . +<http://example.org/s> <http://example.org/p> <http://example.org/o@> . +<http://example.org/s> <http://example.org/p> <http://example.org/o~> . diff --git a/test/extra/good/test-local-name-escapes.ttl b/test/extra/good/test-local-name-escapes.ttl new file mode 100644 index 00000000..8c5fce37 --- /dev/null +++ b/test/extra/good/test-local-name-escapes.ttl @@ -0,0 +1,19 @@ +@prefix eg: <http://example.org/> . + +eg:s eg:p eg:o\' . +eg:s eg:p eg:o\! . +eg:s eg:p eg:o\# . +eg:s eg:p eg:o\$ . +eg:s eg:p eg:o\% . +eg:s eg:p eg:o\& . +eg:s eg:p eg:o\( . +eg:s eg:p eg:o\) . +eg:s eg:p eg:o\* . +eg:s eg:p eg:o\+ . +eg:s eg:p eg:o\, . +eg:s eg:p eg:o\/ . +eg:s eg:p eg:o\; . +eg:s eg:p eg:o\= . +eg:s eg:p eg:o\? . +eg:s eg:p eg:o\@ . +eg:s eg:p eg:o\~ . 
diff --git a/test/meson.build b/test/meson.build index 01f75b5a..b01dde81 100644 --- a/test/meson.build +++ b/test/meson.build @@ -195,6 +195,7 @@ simple_command_tests = { ['-o'], ['-p'], ['-r'], + ['-s', '<foo> a <Bar> .'], ['-s'], ['-z'], ], diff --git a/test/test_overflow.c b/test/test_overflow.c index db20f734..9b1a6fb7 100644 --- a/test/test_overflow.c +++ b/test/test_overflow.c @@ -8,7 +8,7 @@ #include <assert.h> #include <stdio.h> -static const size_t min_stack_size = 4U * sizeof(size_t) + 240U; +static const size_t min_stack_size = 4U * sizeof(size_t) + 238U; static const size_t max_stack_size = 1024U; static SerdStatus @@ -89,11 +89,9 @@ static void test_turtle_overflow(void) { static const char* const test_strings[] = { - "<http://example.org/s> <http://example.org/p> :%99 .", "<http://example.org/s> <http://example.org/p> <http://example.org/> .", "<http://example.org/s> <http://example.org/p> " "<thisisanabsurdlylongurischeme://because/testing/> .", - "<http://example.org/s> <http://example.org/p> eg:foo .", "<http://example.org/s> <http://example.org/p> 1234 .", "<http://example.org/s> <http://example.org/p> (1 2 3 4) .", "<http://example.org/s> <http://example.org/p> (((((((42))))))) .", @@ -111,7 +109,41 @@ test_turtle_overflow(void) "@prefix ug.dot: <http://example.org/> . \nug.dot:s ug.dot:p ug.dot:o .\n", // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) - "@prefix øøøøøøøøø: <http://example.org/long> . 
\n" + "<http://example.org/subject/with/a/long/path> " + "<http://example.org/predicate/with/a/long/path> " + "<http://example.org/object/with/a/long/path> .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "<http://example.org/s> <http://example.org/p> " + "\"typed\"^^<http://example.org/Datatype> .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/ns/test> .\n" + "<http://example.org/s> <http://example.org/p> " + "\"typed\"^^eg:Datatype .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/ns/test> .\n" + "<http://example.org/s> <http://example.org/p> eg:foo .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix prefix: <http://example.org/testing/curies> .\n" + "prefix:subject prefix:predicate prefix:object .\n", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/> .\n" + "eg:s eg:p [ eg:p [ eg:p [ eg:p [ eg:p []]]]] .\n", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/> .\n" + "eg:s eg:p ( 1 2 3 ( 4 5 6 ( 7 8 9 ) ) ) .\n", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/ns/test> .\n" + "<http://example.org/s> <http://example.org/p> eg:%99 .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix øøøøøøøøø: <http://example.org/long> .\n" "<http://example.org/somewhatlongsubjecttooffsetthepredicate> øøøøøøøøø:p " "øøøøøøøøø:o .\n", @@ -139,8 +171,8 @@ test_turtle_overflow(void) // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) "@prefix prefix: <http://example.org/testing/curies> .\n" - "prefix:subjectthatwillcomearoundtobeingfinishedanycharacternow " - "prefix:predicate prefix:object .\n", + "<http://example.org/very/long/uri/subject/to/overflow/the/predicate> " + "prefix:predicate prefix:object ; prefix:p prefix:o .\n", // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) "@prefix eg: 
<http://example.org/> .\n" diff --git a/test/test_reader_writer.c b/test/test_reader_writer.c index 3a20bb7a..55c4b584 100644 --- a/test/test_reader_writer.c +++ b/test/test_reader_writer.c @@ -100,7 +100,7 @@ test_write_errors(void) SerdWorld* const world = serd_world_new(); ErrorContext ctx = {0U, 0U}; - const size_t max_offsets[] = {0, 373, 1911, 2003, 414}; + const size_t max_offsets[] = {0, 368, 1900, 1992, 413}; // Test errors at different offsets to hit different code paths for (unsigned s = 1; s <= (unsigned)SERD_TRIG; ++s) { diff --git a/test/test_writer.c b/test/test_writer.c index f76cc800..ee602044 100644 --- a/test/test_writer.c +++ b/test/test_writer.c @@ -406,6 +406,36 @@ test_write_pname_escapes(void) check_pname_escape((const char*)last_escape, "eg:s\n\teg:p eg:wx%C3%B7 .\n"); } +static void +test_write_bad_uri(void) +{ + SerdWorld* world = serd_world_new(); + SerdEnv* env = serd_env_new(serd_empty_string()); + SerdNode* s = serd_new_uri(serd_string("http://example.org/s")); + SerdNode* p = serd_new_uri(serd_string("http://example.org/p")); + SerdNode* rel = serd_new_uri(serd_string("rel")); + SerdBuffer buffer = {NULL, 0}; + SerdOutputStream output = serd_open_output_buffer(&buffer); + SerdWriter* writer = + serd_writer_new(world, SERD_NTRIPLES, 0U, env, &output, 1); + + assert(writer); + + const SerdStatus st = + serd_sink_write(serd_writer_sink(writer), 0U, s, p, rel, NULL); + assert(st); + assert(st == SERD_BAD_ARG); + + serd_writer_free(writer); + serd_close_output(&output); + serd_free(buffer.buf); + serd_node_free(rel); + serd_node_free(p); + serd_node_free(s); + serd_env_free(env); + serd_world_free(world); +} + int main(void) { @@ -418,6 +448,7 @@ main(void) test_writer_stack_overflow(); test_write_empty_syntax(); test_write_pname_escapes(); + test_write_bad_uri(); return 0; } |