diff options
author | David Robillard <d@drobilla.net> | 2021-04-15 17:52:44 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2022-01-14 01:13:45 -0500 |
commit | 7269c12bbe4532f5f5844aaec16cf1bb75a1c71c (patch) | |
tree | 4fe7f6b0545cb05b2491c70fa8f13d78645fd56b | |
parent | 6c31c9d159424d1a6ba2e64b8d271743b9710f32 (diff) | |
download | serd-7269c12bbe4532f5f5844aaec16cf1bb75a1c71c.tar.gz serd-7269c12bbe4532f5f5844aaec16cf1bb75a1c71c.tar.bz2 serd-7269c12bbe4532f5f5844aaec16cf1bb75a1c71c.zip |
Expand URIs in reader
This expands relative and prefixed URIs in the reader, on its stack, rather than
passing them to the caller to be dealt with. This pushes these context-dependent
forms to the edge of the system as much as possible to minimise the headaches
they can cause.
This is a step towards having stricter guarantees about nodes and eliminating
the CURIE node type altogether.
-rw-r--r-- | include/serd/serd.h | 3 | ||||
-rw-r--r-- | src/n3.c | 123 | ||||
-rw-r--r-- | src/reader.h | 5 | ||||
-rw-r--r-- | src/string.c | 2 | ||||
-rw-r--r-- | test/test_overflow.c | 44 | ||||
-rw-r--r-- | test/test_string.c | 2 | ||||
-rw-r--r-- | test/test_writer.c | 36 |
7 files changed, 191 insertions, 24 deletions
diff --git a/include/serd/serd.h b/include/serd/serd.h index 0db6205f..7be9395c 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -215,6 +215,7 @@ typedef enum { SERD_ERR_BAD_WRITE, ///< Error writing to file/stream SERD_ERR_NO_DATA, ///< Unexpected end of input SERD_ERR_BAD_CALL, ///< Invalid call + SERD_ERR_BAD_URI, ///< Invalid or unresolved URI } SerdStatus; /** @@ -1739,6 +1740,8 @@ typedef enum { SERD_READ_LAX = 1u << 0u, ///< Tolerate invalid input where possible SERD_READ_VARIABLES = 1u << 1u, ///< Support variable nodes SERD_READ_EXACT_BLANKS = 1u << 2u, ///< Allow clashes with generated blanks + SERD_READ_PREFIXED = 1u << 3u, ///< Do not expand prefixed names + SERD_READ_RELATIVE = 1u << 4u, ///< Do not expand relative URI references } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values @@ -15,6 +15,7 @@ */ #include "byte_source.h" +#include "env.h" #include "namespaces.h" #include "node.h" #include "reader.h" @@ -732,6 +733,64 @@ read_IRIREF_scheme(SerdReader* const reader, SerdNode* const dest) return SERD_FAILURE; } +typedef struct { + SerdReader* reader; + SerdNode* node; + SerdStatus status; +} WriteNodeContext; + +static size_t +write_to_stack(const void* const SERD_NONNULL buf, + const size_t size, + const size_t nmemb, + void* const SERD_NONNULL stream) +{ + WriteNodeContext* const ctx = (WriteNodeContext*)stream; + const uint8_t* const utf8 = (const uint8_t*)buf; + + ctx->status = push_bytes(ctx->reader, ctx->node, utf8, nmemb * size); + + return nmemb; +} + +static SerdStatus +resolve_IRIREF(SerdReader* const reader, + SerdNode* const dest, + const size_t string_start_offset) +{ + // If the URI is already absolute, we don't need to do anything + SerdURIView uri = serd_parse_uri(serd_node_string(dest)); + if (uri.scheme.len) { + return SERD_SUCCESS; + } + + // Resolve relative URI reference to a full URI + uri = serd_resolve_uri(uri, serd_env_base_uri_view(reader->env)); + if (!uri.scheme.len) { + return r_err(reader, + 
SERD_ERR_BAD_SYNTAX, + "failed to resolve relative URI reference <%s>", + serd_node_string(dest)); + } + + // Push a new temporary node for constructing the resolved URI + SerdNode* const temp = push_node(reader, SERD_URI, "", 0); + if (!temp) { + return SERD_ERR_OVERFLOW; + } + + // Write resolved URI to the temporary node + WriteNodeContext ctx = {reader, temp, SERD_SUCCESS}; + temp->length = serd_write_uri(uri, write_to_stack, &ctx); + if (!ctx.status) { + // Replace the destination with the new expanded node + memmove(dest, temp, serd_node_total_size(temp)); + serd_stack_pop_to(&reader->stack, string_start_offset + dest->length); + } + + return ctx.status; +} + static SerdStatus read_IRIREF(SerdReader* const reader, SerdNode** const dest) { @@ -744,6 +803,8 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) return SERD_ERR_OVERFLOW; } + const size_t string_start_offset = reader->stack.size; + if (!fancy_syntax(reader) && (st = read_IRIREF_scheme(reader, *dest))) { return r_err(reader, st, "expected IRI scheme"); } @@ -757,7 +818,9 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) return r_err( reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character `%c'", c); case '>': - return SERD_SUCCESS; + return (reader->flags & SERD_READ_RELATIVE) + ? 
SERD_SUCCESS + : resolve_IRIREF(reader, *dest, string_start_offset); case '\\': if (read_UCHAR(reader, *dest, &code)) { return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape"); @@ -810,7 +873,8 @@ static SerdStatus read_PrefixedName(SerdReader* const reader, SerdNode* const dest, const bool read_prefix, - bool* const ate_dot) + bool* const ate_dot, + const size_t string_start_offset) { SerdStatus st = SERD_SUCCESS; if (read_prefix && ((st = read_PN_PREFIX(reader, dest)) > SERD_FAILURE)) { @@ -822,10 +886,35 @@ read_PrefixedName(SerdReader* const reader, } if ((st = push_byte(reader, dest, eat_byte_safe(reader, ':'))) || - (st = read_PN_LOCAL(reader, dest, ate_dot)) > SERD_FAILURE) { + (st = read_PN_LOCAL(reader, dest, ate_dot)) > SERD_FAILURE || + (reader->flags & SERD_READ_PREFIXED)) { return st; } + // Expand to absolute URI + const SerdStringView curie = serd_node_string_view(dest); + SerdStringView prefix; + SerdStringView suffix; + if ((st = serd_env_expand_in_place(reader->env, curie, &prefix, &suffix))) { + return r_err( + reader, st, "failed to expand URI \"%s\"", serd_node_string(dest)); + } + + // Push a new temporary node for constructing the full URI + SerdNode* const temp = push_node(reader, SERD_URI, "", 0); + if ((st = push_bytes(reader, temp, (const uint8_t*)prefix.buf, prefix.len)) || + (st = push_bytes(reader, temp, (const uint8_t*)suffix.buf, suffix.len))) { + return st; + } + + // Replace the destination with the new expanded node + const size_t total_size = serd_node_total_size(temp); + + memmove(dest, temp, total_size); + + serd_stack_pop_to(&reader->stack, + string_start_offset + serd_node_length(dest)); + return SERD_SUCCESS; } @@ -919,14 +1008,15 @@ read_number(SerdReader* const reader, static SerdStatus read_iri(SerdReader* const reader, SerdNode** const dest, bool* const ate_dot) { - switch (peek_byte(reader)) { - case '<': + if (peek_byte(reader) == '<') { return read_IRIREF(reader, dest); - default: - *dest = push_node(reader, 
SERD_CURIE, "", 0); - return *dest ? read_PrefixedName(reader, *dest, true, ate_dot) - : SERD_ERR_OVERFLOW; } + + if (!(*dest = push_node(reader, SERD_CURIE, "", 0))) { + return SERD_ERR_OVERFLOW; + } + + return read_PrefixedName(reader, *dest, true, ate_dot, reader->stack.size); } static SerdStatus @@ -1018,7 +1108,8 @@ read_verb(SerdReader* reader, SerdNode** dest) return SERD_ERR_OVERFLOW; } - SerdStatus st = read_PN_PREFIX(reader, *dest); + const size_t string_start_offset = reader->stack.size; + SerdStatus st = read_PN_PREFIX(reader, *dest); if (st > SERD_FAILURE) { return st; } @@ -1034,7 +1125,9 @@ read_verb(SerdReader* reader, SerdNode** dest) : SERD_ERR_OVERFLOW); } - if ((st = read_PrefixedName(reader, *dest, false, &ate_dot)) || ate_dot) { + if ((st = read_PrefixedName( + reader, *dest, false, &ate_dot, string_start_offset)) || + ate_dot) { *dest = NULL; return r_err( reader, st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX, "expected verb"); @@ -1231,7 +1324,7 @@ read_object(SerdReader* const reader, case '\'': ret = read_literal(reader, &o, ate_dot); break; - default: + default: { /* Either a boolean literal, or a qname. Read the prefix first, and if it is in fact a "true" or "false" literal, produce that instead. */ @@ -1239,6 +1332,7 @@ read_object(SerdReader* const reader, return SERD_ERR_OVERFLOW; } + const size_t string_start_offset = reader->stack.size; while (!(ret = read_PN_CHARS_BASE(reader, o))) { } @@ -1256,11 +1350,13 @@ read_object(SerdReader* const reader, ret = SERD_SUCCESS; } } else if ((ret = read_PN_PREFIX_tail(reader, o)) > SERD_FAILURE || - (ret = read_PrefixedName(reader, o, false, ate_dot))) { + (ret = read_PrefixedName( + reader, o, false, ate_dot, string_start_offset))) { ret = (ret > SERD_FAILURE) ? 
ret : SERD_ERR_BAD_SYNTAX; return r_err(reader, ret, "expected prefixed name"); } } + } if (!ret && emit && simple && o) { ret = emit_statement(reader, *ctx, o); @@ -1569,6 +1665,7 @@ read_prefixID(SerdReader* const reader, const bool sparql, const bool token) read_ws_star(reader); st = eat_byte_check(reader, '.'); } + return st; } diff --git a/src/reader.h b/src/reader.h index 2374d7de..acef8ce2 100644 --- a/src/reader.h +++ b/src/reader.h @@ -164,10 +164,7 @@ push_byte(SerdReader* reader, SerdNode* node, const int c) } static inline SerdStatus -push_bytes(SerdReader* reader, - SerdNode* ref, - const uint8_t* bytes, - unsigned len) +push_bytes(SerdReader* reader, SerdNode* ref, const uint8_t* bytes, size_t len) { const bool has_space = reader->stack.buf_size >= reader->stack.size + len; if (has_space) { diff --git a/src/string.c b/src/string.c index 13fb9263..97c1432b 100644 --- a/src/string.c +++ b/src/string.c @@ -60,6 +60,8 @@ serd_strerror(const SerdStatus status) return "Unexpected end of input"; case SERD_ERR_BAD_CALL: return "Invalid call"; + case SERD_ERR_BAD_URI: + return "Invalid or unresolved URI"; } return "Unknown error"; diff --git a/test/test_overflow.c b/test/test_overflow.c index 7f08112d..02b71008 100644 --- a/test/test_overflow.c +++ b/test/test_overflow.c @@ -21,7 +21,7 @@ #include <assert.h> #include <stdio.h> -static const size_t min_stack_size = 4 * sizeof(size_t) + 256u; +static const size_t min_stack_size = 4 * sizeof(size_t) + 230u; static const size_t max_stack_size = 1024u; static SerdStatus @@ -90,11 +90,9 @@ static void test_turtle_overflow(void) { static const char* const test_strings[] = { - "<http://example.org/s> <http://example.org/p> :%99 .", "<http://example.org/s> <http://example.org/p> <http://example.org/> .", "<http://example.org/s> <http://example.org/p> " "<thisisanabsurdlylongurischeme://because/testing/> .", - "<http://example.org/s> <http://example.org/p> eg:foo .", "<http://example.org/s> <http://example.org/p> 1234 
.", "<http://example.org/s> <http://example.org/p> (1 2 3 4) .", "<http://example.org/s> <http://example.org/p> (((((((42))))))) .", @@ -112,7 +110,41 @@ test_turtle_overflow(void) "@prefix ug.dot: <http://example.org/> . \nug.dot:s ug.dot:p ug.dot:o .\n", // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) - "@prefix øøøøøøøøø: <http://example.org/long> . \n" + "<http://example.org/subject/with/a/long/path> " + "<http://example.org/predicate/with/a/long/path> " + "<http://example.org/object/with/a/long/path> .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "<http://example.org/s> <http://example.org/p> " + "\"typed\"^^<http://example.org/Datatype> .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/ns/test> .\n" + "<http://example.org/s> <http://example.org/p> " + "\"typed\"^^eg:Datatype .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/ns/test> .\n" + "<http://example.org/s> <http://example.org/p> eg:foo .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix prefix: <http://example.org/testing/curies> .\n" + "prefix:subject prefix:predicate prefix:object .\n", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/> .\n" + "eg:s eg:p [ eg:p [ eg:p [ eg:p [ eg:p []]]]] .\n", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/> .\n" + "eg:s eg:p ( 1 2 3 ( 4 5 6 ( 7 8 9 ) ) ) .\n", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix eg: <http://example.org/ns/test> .\n" + "<http://example.org/s> <http://example.org/p> eg:%99 .", + + // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) + "@prefix øøøøøøøøø: <http://example.org/long> .\n" "<http://example.org/somewhatlongsubjecttooffsetthepredicate> øøøøøøøøø:p " "øøøøøøøøø:o .\n", @@ -140,8 +172,8 @@ test_turtle_overflow(void) // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) "@prefix prefix: 
<http://example.org/testing/curies> .\n" - "prefix:subjectthatwillcomearoundtobeingfinishedanycharacternow " - "prefix:predicate prefix:object .\n", + "<http://example.org/very/long/uri/subject/to/overflow/the/predicate> " + "prefix:predicate prefix:object ; prefix:p prefix:o .\n", // NOLINTNEXTLINE(bugprone-suspicious-missing-comma) "@prefix eg: <http://example.org/> .\n" diff --git a/test/test_string.c b/test/test_string.c index 495138d8..2f805015 100644 --- a/test/test_string.c +++ b/test/test_string.c @@ -44,7 +44,7 @@ test_strerror(void) { const char* msg = serd_strerror(SERD_SUCCESS); assert(!strcmp(msg, "Success")); - for (int i = SERD_FAILURE; i <= SERD_ERR_BAD_CALL; ++i) { + for (int i = SERD_FAILURE; i <= SERD_ERR_BAD_URI; ++i) { msg = serd_strerror((SerdStatus)i); assert(strcmp(msg, "Success")); } diff --git a/test/test_writer.c b/test/test_writer.c index dc1ebfcb..0ced87d6 100644 --- a/test/test_writer.c +++ b/test/test_writer.c @@ -286,6 +286,41 @@ test_write_empty_syntax(void) serd_world_free(world); } +static void +test_write_bad_uri(void) +{ + SerdWorld* world = serd_world_new(); + SerdNodes* nodes = serd_world_nodes(world); + SerdEnv* env = serd_env_new(SERD_EMPTY_STRING()); + + const SerdNode* s = + serd_nodes_uri(nodes, SERD_STRING("http://example.org/s")); + + const SerdNode* p = + serd_nodes_uri(nodes, SERD_STRING("http://example.org/p")); + + const SerdNode* rel = serd_nodes_uri(nodes, SERD_STRING("rel")); + + SerdBuffer buffer = {NULL, 0}; + SerdByteSink* byte_sink = serd_byte_sink_new_buffer(&buffer); + + SerdWriter* writer = + serd_writer_new(world, SERD_NTRIPLES, 0u, env, byte_sink); + + assert(writer); + + const SerdStatus st = + serd_sink_write(serd_writer_sink(writer), 0u, s, p, rel, NULL); + assert(st); + assert(st == SERD_ERR_BAD_ARG); + + serd_free(serd_buffer_sink_finish(&buffer)); + serd_writer_free(writer); + serd_byte_sink_free(byte_sink); + serd_env_free(env); + serd_world_free(world); +} + int main(void) { @@ -295,6 +330,7 @@ 
main(void) test_strict_write(); test_write_error(); test_write_empty_syntax(); + test_write_bad_uri(); return 0; } |