From fff826f406e0b9975fd8672041e50dd1a342339f Mon Sep 17 00:00:00 2001 From: David Robillard Date: Wed, 29 Mar 2023 07:28:19 -0400 Subject: Simplify URI API and implementation --- include/serd/node.h | 43 +------- include/serd/uri.h | 125 ++++++++++++++------- src/env.c | 44 ++++---- src/node.c | 122 ++++++++------------- src/node.h | 6 + src/reader.c | 2 +- src/serdi.c | 5 +- src/uri.c | 307 ++++++++++++++++++++++++++-------------------------- src/uri_utils.h | 12 +- src/writer.c | 28 +++-- test/test_env.c | 11 ++ test/test_uri.c | 162 +++++++++++++++++++++------ 12 files changed, 490 insertions(+), 377 deletions(-) diff --git a/include/serd/node.h b/include/serd/node.h index 1c64b6b9..b95891c9 100644 --- a/include/serd/node.h +++ b/include/serd/node.h @@ -112,20 +112,16 @@ serd_new_substring(SerdNodeType type, size_t len); /** - Create a new URI node from a node. + Create a new URI node from a parsed URI. */ SERD_API SerdNode* SERD_ALLOCATED -serd_new_uri_from_node(const SerdNode* SERD_NONNULL uri_node, - const SerdURIView* SERD_NULLABLE base, - SerdURIView* SERD_NULLABLE out); +serd_new_parsed_uri(SerdURIView uri); /** Create a new URI node from a string. */ SERD_API SerdNode* SERD_ALLOCATED -serd_new_uri_from_string(const char* SERD_NULLABLE str, - const SerdURIView* SERD_NULLABLE base, - SerdURIView* SERD_NULLABLE out); +serd_new_uri(const char* SERD_NONNULL str); /** Create a new file URI node from a file system path and optional hostname. @@ -141,39 +137,6 @@ serd_new_file_uri(const char* SERD_NONNULL path, const char* SERD_NULLABLE hostname, SerdURIView* SERD_NULLABLE out); -/** - Create a new node by serialising `uri` into a new string. - - @param uri The URI to serialise. - - @param base Base URI to resolve `uri` against (or NULL for no resolution). - - @param out Set to the parsing of the new URI (i.e. points only to - memory owned by the new returned node). 
-*/ -SERD_API SerdNode* SERD_ALLOCATED -serd_new_uri(const SerdURIView* SERD_NONNULL uri, - const SerdURIView* SERD_NULLABLE base, - SerdURIView* SERD_NULLABLE out); - -/** - Create a new node by serialising `uri` into a new relative URI. - - @param uri The URI to serialise. - - @param base Base URI to make `uri` relative to, if possible. - - @param root Root URI for resolution (see serd_uri_serialise_relative()). - - @param out Set to the parsing of the new URI (i.e. points only to - memory owned by the new returned node). -*/ -SERD_API SerdNode* SERD_ALLOCATED -serd_new_relative_uri(const SerdURIView* SERD_NONNULL uri, - const SerdURIView* SERD_NULLABLE base, - const SerdURIView* SERD_NULLABLE root, - SerdURIView* SERD_NULLABLE out); - /** Create a new node by serialising `d` into an xsd:decimal string. diff --git a/include/serd/uri.h b/include/serd/uri.h index 06cb97fb..7b5529dc 100644 --- a/include/serd/uri.h +++ b/include/serd/uri.h @@ -5,7 +5,6 @@ #define SERD_URI_H #include "serd/attributes.h" -#include "serd/status.h" #include "serd/stream.h" #include "serd/string_view.h" @@ -21,19 +20,30 @@ SERD_BEGIN_DECLS */ /** - A parsed URI. - - This struct directly refers to slices in other strings, it does not own any - memory itself. This allows some URI operations like resolution to be done - in-place without allocating memory. + A parsed view of a URI. + + This representation is designed for fast streaming. It makes it possible to + create relative URI references or resolve them into absolute URIs in-place + without any string allocation. + + Each component refers to slices in other strings, so a URI view must not outlive + any strings it was parsed from. Note that the components are not + necessarily null-terminated. + + The scheme, authority, path, query, and fragment simply point to the string + value of those components, not including any delimiters. The path_prefix is + a special component for storing relative or resolved paths. 
If it points to + a string (usually a base URI the URI was resolved against), then this string + is prepended to the path. Otherwise, the length is interpreted as the + number of up-references ("../") that must be prepended to the path. */ typedef struct { - SerdStringView scheme; ///< Scheme - SerdStringView authority; ///< Authority - SerdStringView path_base; ///< Path prefix if relative - SerdStringView path; ///< Path suffix - SerdStringView query; ///< Query - SerdStringView fragment; ///< Fragment + SerdStringView scheme; ///< Scheme + SerdStringView authority; ///< Authority + SerdStringView path_prefix; ///< Path prefix for relative/resolved paths + SerdStringView path; ///< Path suffix + SerdStringView query; ///< Query + SerdStringView fragment; ///< Fragment } SerdURIView; static const SerdURIView SERD_URI_NULL = @@ -46,49 +56,86 @@ static const SerdURIView SERD_URI_NULL = @param uri A file URI. @param hostname If non-NULL, set to the hostname, if present. - @return The path component of the URI. + @return A newly-allocated filesystem path. */ -SERD_API char* SERD_NULLABLE -serd_file_uri_parse(const char* SERD_NONNULL uri, +SERD_API char* SERD_ALLOCATED +serd_parse_file_uri(const char* SERD_NONNULL uri, char* SERD_NONNULL* SERD_NULLABLE hostname); -/// Return true iff `utf8` starts with a valid URI scheme +/// Return true iff `string` starts with a valid URI scheme SERD_PURE_API bool -serd_uri_string_has_scheme(const char* SERD_NULLABLE utf8); +serd_uri_string_has_scheme(const char* SERD_NONNULL string); -/// Parse `utf8`, writing result to `out` -SERD_API SerdStatus -serd_uri_parse(const char* SERD_NONNULL utf8, SerdURIView* SERD_NONNULL out); +/// Parse `string` and return a URI view that points into it +SERD_PURE_API SerdURIView +serd_parse_uri(const char* SERD_NONNULL string); /** - Set target `t` to reference `r` resolved against `base`. + Return reference `r` resolved against `base`. + + This will make `r` an absolute URI if possible. 
@see [RFC3986 5.2.2](http://tools.ietf.org/html/rfc3986#section-5.2.2) + + @param r URI reference to make absolute, for example "child/path". + + @param base Base URI, for example "http://example.org/base/". + + @return An absolute URI, for example "http://example.org/base/child/path", + or `r` if it is not a URI reference that can be resolved against `base`. */ -SERD_API void -serd_uri_resolve(const SerdURIView* SERD_NONNULL r, - const SerdURIView* SERD_NONNULL base, - SerdURIView* SERD_NONNULL t); +SERD_PURE_API SerdURIView +serd_resolve_uri(SerdURIView r, SerdURIView base); -/// Serialise `uri` with a series of calls to `sink` -SERD_API size_t -serd_uri_serialise(const SerdURIView* SERD_NONNULL uri, - SerdSink SERD_NONNULL sink, - void* SERD_NONNULL stream); +/** + Return `r` as a reference relative to `base` if possible. + + @see [RFC3986 5.2.2](http://tools.ietf.org/html/rfc3986#section-5.2.2) + + @param r URI to make relative, for example + "http://example.org/base/child/path". + + @param base Base URI, for example "http://example.org/base/". + + @return A relative URI reference, for example "child/path", `r` if it can + not be made relative to `base`, or a null URI if `r` could be made relative + to base, but the path prefix is already being used (most likely because `r` + was previously a relative URI reference that was resolved against some + base). +*/ +SERD_PURE_API SerdURIView +serd_relative_uri(SerdURIView r, SerdURIView base); + +/** + Return whether `r` can be written as a reference relative to `base`. + + For example, with `base` "http://example.org/base/", this returns true if + `r` is also "http://example.org/base/", or something like + "http://example.org/base/child" ("child"), + "http://example.org/base/child/grandchild#fragment" + ("child/grandchild#fragment"), + "http://example.org/base/child/grandchild?query" ("child/grandchild?query"), + and so on. + + @return True if `r` and `base` are equal or if `r` is a child of `base`. 
+*/ +SERD_PURE_API bool +serd_uri_is_within(SerdURIView r, SerdURIView base); /** - Serialise `uri` relative to `base` with a series of calls to `sink` + Write `uri` as a string to `sink`. + + This will call `sink` several times to emit the URI. - The `uri` is written as a relative URI iff if it a child of `base` and - `root`. The optional `root` parameter must be a prefix of `base` and can be - used keep up-references ("../") within a certain namespace. + @param uri URI to write as a string. + @param sink Sink to write string output to. + @param stream Opaque user argument to pass to `sink`. + @return The number of bytes written. */ SERD_API size_t -serd_uri_serialise_relative(const SerdURIView* SERD_NONNULL uri, - const SerdURIView* SERD_NULLABLE base, - const SerdURIView* SERD_NULLABLE root, - SerdSink SERD_NONNULL sink, - void* SERD_NONNULL stream); +serd_write_uri(SerdURIView uri, + SerdSink SERD_NONNULL sink, + void* SERD_NONNULL stream); /** @} diff --git a/src/env.c b/src/env.c index 516eaf32..78680a0f 100644 --- a/src/env.c +++ b/src/env.c @@ -6,6 +6,7 @@ #include "node.h" #include "serd/node.h" +#include "serd/uri.h" #include #include @@ -78,15 +79,16 @@ serd_env_set_base_uri(SerdEnv* const env, const SerdNode* const uri) return SERD_SUCCESS; } - // Resolve base URI and create a new node and URI for it - SerdURIView base_uri; - SerdNode* base_uri_node = - serd_new_uri_from_node(uri, &env->base_uri, &base_uri); + // Resolve the new base against the current base in case it is relative + const SerdURIView new_base_uri = + serd_resolve_uri(serd_parse_uri(serd_node_string(uri)), env->base_uri); + + SerdNode* const new_base_node = serd_new_parsed_uri(new_base_uri); // Replace the current base URI serd_node_free(env->base_uri_node); - env->base_uri_node = base_uri_node; - env->base_uri = base_uri; + env->base_uri_node = new_base_node; + env->base_uri = serd_node_uri_view(env->base_uri_node); return SERD_SUCCESS; } @@ -143,17 +145,22 @@ 
serd_env_set_prefix(SerdEnv* const env, if (serd_uri_string_has_scheme(serd_node_string(uri))) { // Set prefix to absolute URI serd_env_add(env, name, uri); - } else { - // Resolve relative URI and create a new node and URI for it - SerdURIView abs_uri; - SerdNode* abs_uri_node = - serd_new_uri_from_node(uri, &env->base_uri, &abs_uri); - - // Set prefix to resolved (absolute) URI - serd_env_add(env, name, abs_uri_node); - serd_node_free(abs_uri_node); + return SERD_SUCCESS; + } + + if (!env->base_uri_node) { + return SERD_BAD_ARG; } + // Resolve relative URI and create a new node and URI for it + SerdNode* const abs_uri = + serd_new_resolved_uri(serd_node_string_view(uri), env->base_uri); + + // Set prefix to resolved (absolute) URI + serd_env_add(env, name, abs_uri); + + serd_node_free(abs_uri); + return SERD_SUCCESS; } @@ -237,16 +244,15 @@ serd_env_expand_node(const SerdEnv* const env, const SerdNode* const node) switch (node->type) { case SERD_LITERAL: break; - case SERD_URI: { - SerdURIView ignored; - return serd_new_uri_from_node(node, &env->base_uri, &ignored); - } + case SERD_URI: + return serd_new_resolved_uri(serd_node_string_view(node), env->base_uri); case SERD_CURIE: { SerdStringView prefix; SerdStringView suffix; if (serd_env_expand(env, node, &prefix, &suffix)) { return NULL; } + const size_t len = prefix.length + suffix.length; SerdNode* ret = serd_node_malloc(len, 0, SERD_URI); char* buf = serd_node_buffer(ret); diff --git a/src/node.c b/src/node.c index 9acbb8be..de42075e 100644 --- a/src/node.c +++ b/src/node.c @@ -37,7 +37,7 @@ static const size_t serd_node_align = 2 * sizeof(uint64_t); static size_t serd_uri_string_length(const SerdURIView* const uri) { - size_t len = uri->path_base.length; + size_t len = uri->path_prefix.length; #define ADD_LEN(field, n_delims) \ if ((field).length) { \ @@ -152,29 +152,56 @@ serd_node_equals(const SerdNode* const a, const SerdNode* const b) } SerdNode* -serd_new_uri_from_node(const SerdNode* const uri_node, - 
const SerdURIView* const base, - SerdURIView* const out) +serd_new_uri(const char* const str) { - const char* uri_str = serd_node_string(uri_node); - return (uri_node && uri_node->type == SERD_URI && uri_str) - ? serd_new_uri_from_string(uri_str, base, out) - : NULL; + const size_t length = strlen(str); + SerdNode* node = serd_node_malloc(length, 0, SERD_URI); + memcpy(serd_node_buffer(node), str, length); + node->length = length; + return node; } SerdNode* -serd_new_uri_from_string(const char* const str, - const SerdURIView* const base, - SerdURIView* const out) +serd_new_parsed_uri(const SerdURIView uri) +{ + const size_t len = serd_uri_string_length(&uri); + SerdNode* const node = serd_node_malloc(len, 0, SERD_URI); + char* ptr = serd_node_buffer(node); + const size_t actual_len = serd_write_uri(uri, string_sink, &ptr); + + serd_node_buffer(node)[actual_len] = '\0'; + node->length = actual_len; + + return node; +} + +static SerdNode* +serd_new_from_uri(const SerdURIView uri, const SerdURIView base) { - if (!str || str[0] == '\0') { - // Empty URI => Base URI, or nothing if no base is given - return base ? 
serd_new_uri(base, NULL, out) : NULL; + const SerdURIView abs_uri = serd_resolve_uri(uri, base); + const size_t len = serd_uri_string_length(&abs_uri); + SerdNode* node = serd_node_malloc(len, 0, SERD_URI); + char* ptr = serd_node_buffer(node); + const size_t actual_len = serd_write_uri(abs_uri, string_sink, &ptr); + + serd_node_buffer(node)[actual_len] = '\0'; + node->length = actual_len; + + return node; +} + +SerdNode* +serd_new_resolved_uri(const SerdStringView string, const SerdURIView base) +{ + const SerdURIView uri = serd_parse_uri(string.data); + SerdNode* const result = serd_new_from_uri(uri, base); + + if (!serd_uri_string_has_scheme(serd_node_string(result))) { + serd_node_free(result); + return NULL; } - SerdURIView uri; - serd_uri_parse(str, &uri); - return serd_new_uri(&uri, base, out); // Resolve/Serialise + return result; } static bool @@ -270,61 +297,13 @@ serd_new_file_uri(const char* const path, const char* const string = serd_buffer_sink_finish(&buffer); SerdNode* const node = serd_new_substring(SERD_URI, string, length); if (out) { - serd_uri_parse(serd_node_buffer(node), out); + *out = serd_parse_uri(serd_node_buffer(node)); } free(buffer.buf); return node; } -SerdNode* -serd_new_uri(const SerdURIView* const uri, - const SerdURIView* const base, - SerdURIView* const out) -{ - SerdURIView abs_uri = *uri; - if (base) { - serd_uri_resolve(uri, base, &abs_uri); - } - - const size_t len = serd_uri_string_length(&abs_uri); - SerdNode* node = serd_node_malloc(len, 0, SERD_URI); - char* ptr = serd_node_buffer(node); - const size_t actual_len = serd_uri_serialise(&abs_uri, string_sink, &ptr); - - serd_node_buffer(node)[actual_len] = '\0'; - node->length = actual_len; - - if (out) { - serd_uri_parse(serd_node_buffer(node), out); // TODO: avoid double parse - } - - return node; -} - -SerdNode* -serd_new_relative_uri(const SerdURIView* const uri, - const SerdURIView* const base, - const SerdURIView* const root, - SerdURIView* const out) -{ - const size_t 
uri_len = serd_uri_string_length(uri); - const size_t base_len = serd_uri_string_length(base); - SerdNode* node = serd_node_malloc(uri_len + base_len, 0, SERD_URI); - char* ptr = serd_node_buffer(node); - const size_t actual_len = - serd_uri_serialise_relative(uri, base, root, string_sink, &ptr); - - serd_node_buffer(node)[actual_len] = '\0'; - node->length = actual_len; - - if (out) { - serd_uri_parse(serd_node_buffer(node), out); // TODO: avoid double parse - } - - return node; -} - static unsigned serd_digits(const double abs) { @@ -458,16 +437,11 @@ serd_node_string_view(const SerdNode* const node) return r; } -SerdURIView +SERD_PURE_FUNC SerdURIView serd_node_uri_view(const SerdNode* const node) { - SerdURIView result = SERD_URI_NULL; - - if (node->type == SERD_URI) { - serd_uri_parse(serd_node_string(node), &result); - } - - return result; + return (node->type == SERD_URI) ? serd_parse_uri(serd_node_string(node)) + : SERD_URI_NULL; } SerdNodeFlags diff --git a/src/node.h b/src/node.h index 60e0d2ff..41cf0d82 100644 --- a/src/node.h +++ b/src/node.h @@ -6,6 +6,8 @@ #include "serd/attributes.h" #include "serd/node.h" +#include "serd/string_view.h" +#include "serd/uri.h" #include @@ -34,4 +36,8 @@ void serd_node_set(SerdNode* SERD_NONNULL* SERD_NONNULL dst, const SerdNode* SERD_NONNULL src); +/// Create a new URI from a string, resolved against a base URI +SerdNode* SERD_ALLOCATED +serd_new_resolved_uri(SerdStringView string, SerdURIView base_uri); + #endif // SERD_SRC_NODE_H diff --git a/src/reader.c b/src/reader.c index 9a9fe744..140c07ba 100644 --- a/src/reader.c +++ b/src/reader.c @@ -268,7 +268,7 @@ serd_reader_set_default_graph(SerdReader* const reader, SerdStatus serd_reader_read_file(SerdReader* const reader, const char* const uri) { - char* const path = serd_file_uri_parse(uri, NULL); + char* const path = serd_parse_file_uri(uri, NULL); if (!path) { return SERD_BAD_ARG; } diff --git a/src/serdi.c b/src/serdi.c index afae904c..85462532 100644 --- 
a/src/serdi.c +++ b/src/serdi.c @@ -308,7 +308,7 @@ main(int argc, char** argv) in_name = in_name ? in_name : input; if (!in_fd) { if (!strncmp(input, "file:", 5)) { - input_path = serd_file_uri_parse(input, NULL); + input_path = serd_parse_file_uri(input, NULL); input = input_path; } if (!input || !(in_fd = serd_fopen(input, "rb"))) { @@ -334,7 +334,8 @@ main(int argc, char** argv) SerdURIView base_uri = SERD_URI_NULL; SerdNode* base = NULL; if (a < argc) { // Base URI given on command line - base = serd_new_uri_from_string((const char*)argv[a], NULL, &base_uri); + base_uri = serd_parse_uri(argv[a]); + base = serd_new_parsed_uri(base_uri); } else if (from_file && in_fd != stdin) { // Use input file URI base = serd_new_file_uri(input, NULL, &base_uri); } diff --git a/src/uri.c b/src/uri.c index a5d10877..eec9c7fc 100644 --- a/src/uri.c +++ b/src/uri.c @@ -5,7 +5,6 @@ #include "uri_utils.h" #include "serd/buffer.h" -#include "serd/status.h" #include "serd/stream.h" #include "serd/string_view.h" #include "serd/uri.h" @@ -17,12 +16,13 @@ #include char* -serd_file_uri_parse(const char* const uri, char** const hostname) +serd_parse_file_uri(const char* const uri, char** const hostname) { const char* path = uri; if (hostname) { *hostname = NULL; } + if (!strncmp(uri, "file://", 7)) { const char* auth = uri + 7; if (*auth == '/') { // No hostname @@ -31,6 +31,7 @@ serd_file_uri_parse(const char* const uri, char** const hostname) if (!(path = strchr(auth, '/'))) { return NULL; } + if (hostname) { const size_t len = (size_t)(path - auth); *hostname = (char*)calloc(len + 1, 1); @@ -62,36 +63,34 @@ serd_file_uri_parse(const char* const uri, char** const hostname) serd_buffer_sink(s, 1, &buffer); } } + return serd_buffer_sink_finish(&buffer); } +/// RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." 
) bool -serd_uri_string_has_scheme(const char* utf8) +serd_uri_string_has_scheme(const char* const string) { - // RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) - if (!utf8 || !is_alpha(utf8[0])) { - return false; // Invalid scheme initial character, URI is relative - } - - for (char c = 0; (c = *++utf8) != '\0';) { - if (!is_uri_scheme_char(c)) { - return false; - } + if (is_alpha(string[0])) { + for (size_t i = 1; string[i]; ++i) { + if (!is_uri_scheme_char(string[i])) { + return false; // Non-scheme character before a ':' + } - if (c == ':') { - return true; // End of scheme + if (string[i] == ':') { + return true; // Valid scheme terminated by a ':' + } } } return false; } -SerdStatus -serd_uri_parse(const char* const utf8, SerdURIView* const out) +SerdURIView +serd_parse_uri(const char* const string) { - *out = SERD_URI_NULL; - - const char* ptr = utf8; + SerdURIView result = SERD_URI_NULL; + const char* ptr = string; /* See http://tools.ietf.org/html/rfc3986#section-3 URI = scheme ":" hier-part [ "?" 
query ] [ "#" fragment ] @@ -105,11 +104,11 @@ serd_uri_parse(const char* const utf8, SerdURIView* const out) case '/': case '?': case '#': - ptr = utf8; + ptr = string; goto path; // Relative URI (starts with path by definition) case ':': - out->scheme.data = utf8; - out->scheme.length = (size_t)((ptr++) - utf8); + result.scheme.data = string; + result.scheme.length = (size_t)((ptr++) - string); goto maybe_authority; // URI with scheme case '+': case '-': @@ -130,7 +129,7 @@ serd_uri_parse(const char* const utf8, SerdURIView* const out) maybe_authority: if (*ptr == '/' && *(ptr + 1) == '/') { ptr += 2; - out->authority.data = ptr; + result.authority.data = ptr; for (char c = 0; (c = *ptr) != '\0'; ++ptr) { switch (c) { case '/': @@ -140,7 +139,7 @@ maybe_authority: case '#': goto fragment; default: - ++out->authority.length; + ++result.authority.length; } } } @@ -159,8 +158,8 @@ path: default: break; } - out->path.data = ptr; - out->path.length = 0; + result.path.data = ptr; + result.path.length = 0; for (char c = 0; (c = *ptr) != '\0'; ++ptr) { switch (c) { case '?': @@ -168,7 +167,7 @@ path: case '#': goto fragment; default: - ++out->path.length; + ++result.path.length; } } @@ -178,12 +177,12 @@ path: */ query: if (*ptr == '?') { - out->query.data = ++ptr; + result.query.data = ++ptr; for (char c = 0; (c = *ptr) != '\0'; ++ptr) { if (c == '#') { goto fragment; } - ++out->query.length; + ++result.query.length; } } @@ -193,14 +192,14 @@ query: */ fragment: if (*ptr == '#') { - out->fragment.data = ptr; + result.fragment.data = ptr; while (*ptr++ != '\0') { - ++out->fragment.length; + ++result.fragment.length; } } end: - return SERD_SUCCESS; + return result; } /** @@ -262,179 +261,185 @@ merge(SerdStringView* const base, SerdStringView* const path) } /// See http://tools.ietf.org/html/rfc3986#section-5.2.2 -void -serd_uri_resolve(const SerdURIView* const r, - const SerdURIView* const base, - SerdURIView* const t) +SerdURIView +serd_resolve_uri(const SerdURIView r, 
const SerdURIView base) { - if (!base->scheme.length) { - *t = *r; // Don't resolve against non-absolute URIs - return; + if (r.scheme.length || !base.scheme.length) { + return r; // No resolution necessary || possible (respectively) } - t->path_base.data = ""; - t->path_base.length = 0; - if (r->scheme.length) { - *t = *r; + SerdURIView t = SERD_URI_NULL; + + if (r.authority.length) { + t.authority = r.authority; + t.path = r.path; + t.query = r.query; } else { - if (r->authority.length) { - t->authority = r->authority; - t->path = r->path; - t->query = r->query; + t.path = r.path; + if (!r.path.length) { + t.path_prefix = base.path; + t.query = r.query.length ? r.query : base.query; } else { - t->path = r->path; - if (!r->path.length) { - t->path_base = base->path; - if (r->query.length) { - t->query = r->query; - } else { - t->query = base->query; - } - } else { - if (r->path.data[0] != '/') { - t->path_base = base->path; - } - merge(&t->path_base, &t->path); - t->query = r->query; + if (r.path.data[0] != '/') { + t.path_prefix = base.path; } - t->authority = base->authority; + + merge(&t.path_prefix, &t.path); + t.query = r.query; } - t->scheme = base->scheme; - t->fragment = r->fragment; + + t.authority = base.authority; } + + t.scheme = base.scheme; + t.fragment = r.fragment; + + return t; } -/** Write the path of `uri` starting at index `i` */ -static size_t -write_path_tail(SerdSink sink, - void* const stream, - const SerdURIView* const uri, - const size_t i) +SerdURIView +serd_relative_uri(const SerdURIView uri, const SerdURIView base) { - size_t len = 0; - if (i < uri->path_base.length) { - len += sink(uri->path_base.data + i, uri->path_base.length - i, stream); + if (!uri_is_related(&uri, &base)) { + return uri; } - if (uri->path.data) { - if (i < uri->path_base.length) { - len += sink(uri->path.data, uri->path.length, stream); - } else { - const size_t j = (i - uri->path_base.length); - len += sink(uri->path.data + j, uri->path.length - j, stream); - } 
- } + SerdURIView result = SERD_URI_NULL; - return len; -} + // Regardless of the path, the query and/or fragment come along + result.query = uri.query; + result.fragment = uri.fragment; -/** Write the path of `uri` relative to the path of `base`. */ -static size_t -write_rel_path(SerdSink sink, - void* const stream, - const SerdURIView* const uri, - const SerdURIView* const base) -{ - const size_t path_len = uri_path_len(uri); - const size_t base_len = uri_path_len(base); + const size_t path_len = uri_path_len(&uri); + const size_t base_len = uri_path_len(&base); const size_t min_len = (path_len < base_len) ? path_len : base_len; // Find the last separator common to both paths size_t last_shared_sep = 0; size_t i = 0; - for (; i < min_len && uri_path_at(uri, i) == uri_path_at(base, i); ++i) { - if (uri_path_at(uri, i) == '/') { + for (; i < min_len && uri_path_at(&uri, i) == uri_path_at(&base, i); ++i) { + if (uri_path_at(&uri, i) == '/') { last_shared_sep = i; } } - if (i == path_len && i == base_len) { // Paths are identical - return 0; + // If the URI and base URI have identical paths, the relative path is empty + if (i == path_len && i == base_len) { + result.path.data = uri.path.data; + result.path.length = 0; + return result; } + // Otherwise, we need to build the relative path out of string slices + // Find the number of up references ("..") required size_t up = 0; for (size_t s = last_shared_sep + 1; s < base_len; ++s) { - if (uri_path_at(base, s) == '/') { + if (uri_path_at(&base, s) == '/') { ++up; } } - // Write up references - size_t len = 0; - for (size_t u = 0; u < up; ++u) { - len += sink("../", 3, stream); + if (up > 0) { + if (last_shared_sep < uri.path_prefix.length) { + return SERD_URI_NULL; + } + + // Special representation: NULL buffer and len set to the depth + result.path_prefix.length = up; } - // Write suffix - return len + write_path_tail(sink, stream, uri, last_shared_sep + 1); + if (last_shared_sep < uri.path_prefix.length) { + 
result.path_prefix.data = uri.path_prefix.data + last_shared_sep + 1; + result.path_prefix.length = uri.path_prefix.length - last_shared_sep - 1; + result.path = uri.path; + } else { + result.path.data = uri.path.data + last_shared_sep + 1; + result.path.length = uri.path.length - last_shared_sep - 1; + } + + return result; } -static uint8_t -serd_uri_path_starts_without_slash(const SerdURIView* uri) +bool +serd_uri_is_within(const SerdURIView uri, const SerdURIView base) { - return ((uri->path_base.length || uri->path.length) && - ((!uri->path_base.length || uri->path_base.data[0] != '/') && - (!uri->path.length || uri->path.data[0] != '/'))); + if (!base.scheme.length || !slice_equals(&base.scheme, &uri.scheme) || + !slice_equals(&base.authority, &uri.authority)) { + return false; + } + + bool differ = false; + const size_t path_len = uri_path_len(&uri); + const size_t base_len = uri_path_len(&base); + + size_t last_base_slash = 0; + for (size_t i = 0; i < path_len && i < base_len; ++i) { + const char u = uri_path_at(&uri, i); + const char b = uri_path_at(&base, i); + + differ = differ || u != b; + if (b == '/') { + last_base_slash = i; + if (differ) { + return false; + } + } + } + + for (size_t i = last_base_slash + 1; i < base_len; ++i) { + if (uri_path_at(&base, i) == '/') { + return false; + } + } + + return true; } /// See http://tools.ietf.org/html/rfc3986#section-5.3 size_t -serd_uri_serialise_relative(const SerdURIView* const uri, - const SerdURIView* const base, - const SerdURIView* const root, - SerdSink sink, - void* const stream) +serd_write_uri(const SerdURIView uri, SerdSink sink, void* const stream) { - size_t len = 0; - const bool relative = - root ? 
uri_is_under(uri, root) : uri_is_related(uri, base); + size_t len = 0; - if (relative) { - len = write_rel_path(sink, stream, uri, base); + if (uri.scheme.data) { + len += sink(uri.scheme.data, uri.scheme.length, stream); + len += sink(":", 1, stream); } - if (!relative || (!len && base->query.data)) { - if (uri->scheme.data) { - len += sink(uri->scheme.data, uri->scheme.length, stream); - len += sink(":", 1, stream); + if (uri.authority.data) { + len += sink("//", 2, stream); + len += sink(uri.authority.data, uri.authority.length, stream); + + if (uri.authority.length > 0 && uri_path_len(&uri) > 0 && + uri_path_at(&uri, 0) != '/') { + // Special case: ensure path begins with a slash + // https://tools.ietf.org/html/rfc3986#section-3.2 + len += sink("/", 1, stream); } - if (uri->authority.data) { - len += sink("//", 2, stream); - len += sink(uri->authority.data, uri->authority.length, stream); - - const bool authority_ends_with_slash = - (uri->authority.length > 0 && - uri->authority.data[uri->authority.length - 1] == '/'); - - if (!authority_ends_with_slash && - serd_uri_path_starts_without_slash(uri)) { - // Special case: ensure path begins with a slash - // https://tools.ietf.org/html/rfc3986#section-3.2 - len += sink("/", 1, stream); - } + } + + if (uri.path_prefix.data) { + len += sink(uri.path_prefix.data, uri.path_prefix.length, stream); + } else if (uri.path_prefix.length) { + for (size_t i = 0; i < uri.path_prefix.length; ++i) { + len += sink("../", 3, stream); } - len += write_path_tail(sink, stream, uri, 0); } - if (uri->query.data) { + if (uri.path.data) { + len += sink(uri.path.data, uri.path.length, stream); + } + + if (uri.query.data) { len += sink("?", 1, stream); - len += sink(uri->query.data, uri->query.length, stream); + len += sink(uri.query.data, uri.query.length, stream); } - if (uri->fragment.data) { - // Note uri->fragment.data includes the leading '#' - len += sink(uri->fragment.data, uri->fragment.length, stream); + if (uri.fragment.data) 
{ + // Note that uri.fragment.data includes the leading '#' + len += sink(uri.fragment.data, uri.fragment.length, stream); } return len; } - -/// See http://tools.ietf.org/html/rfc3986#section-5.3 -size_t -serd_uri_serialise(const SerdURIView* const uri, - SerdSink sink, - void* const stream) -{ - return serd_uri_serialise_relative(uri, NULL, NULL, sink, stream); -} diff --git a/src/uri_utils.h b/src/uri_utils.h index ebc32b78..36db5fb5 100644 --- a/src/uri_utils.h +++ b/src/uri_utils.h @@ -25,17 +25,17 @@ slice_equals(const SerdStringView* a, const SerdStringView* b) static inline size_t uri_path_len(const SerdURIView* uri) { - return uri->path_base.length + uri->path.length; + return uri->path_prefix.length + uri->path.length; } static inline char uri_path_at(const SerdURIView* uri, size_t i) { - if (i < uri->path_base.length) { - return uri->path_base.data[i]; + if (i < uri->path_prefix.length) { + return uri->path_prefix.data[i]; } - return uri->path.data[i - uri->path_base.length]; + return uri->path.data[i - uri->path_prefix.length]; } /** @@ -87,7 +87,9 @@ uri_rooted_index(const SerdURIView* uri, const SerdURIView* root) static inline SERD_PURE_FUNC bool uri_is_related(const SerdURIView* uri, const SerdURIView* root) { - return uri_rooted_index(uri, root).shared != SIZE_MAX; + return root && root->scheme.length && + slice_equals(&root->scheme, &uri->scheme) && + slice_equals(&root->authority, &uri->authority); } /** Return true iff `uri` is within the base of `root` */ diff --git a/src/writer.c b/src/writer.c index 2372f712..a0325243 100644 --- a/src/writer.c +++ b/src/writer.c @@ -770,21 +770,19 @@ write_uri_node(SerdWriter* const writer, TRY(st, esink("<", 1, writer)); - if (writer->flags & SERD_WRITE_RESOLVED) { - SerdURIView in_base_uri; - SerdURIView uri; - SerdURIView abs_uri; - serd_env_base_uri(writer->env, &in_base_uri); - serd_uri_parse(node_str, &uri); - serd_uri_resolve(&uri, &in_base_uri, &abs_uri); - bool rooted = uri_is_under(&in_base_uri, 
&writer->root_uri); - SerdURIView* root = rooted ? &writer->root_uri : &in_base_uri; - UriSinkContext ctx = {writer, SERD_SUCCESS}; - if (!uri_is_under(&abs_uri, root) || writer->syntax == SERD_NTRIPLES || - writer->syntax == SERD_NQUADS) { - serd_uri_serialise(&abs_uri, uri_sink, &ctx); + SerdURIView base_uri = SERD_URI_NULL; + if ((writer->flags & SERD_WRITE_RESOLVED) && + serd_env_base_uri(writer->env, &base_uri)) { + SerdURIView uri = serd_parse_uri(node_str); + SerdURIView abs_uri = serd_resolve_uri(uri, base_uri); + bool rooted = uri_is_under(&base_uri, &writer->root_uri); + const SerdURIView* root = rooted ? &writer->root_uri : &base_uri; + UriSinkContext ctx = {writer, SERD_SUCCESS}; + + if (!supports_abbrev(writer) || !uri_is_under(&abs_uri, root)) { + serd_write_uri(abs_uri, uri_sink, &ctx); } else { - serd_uri_serialise_relative(&uri, &in_base_uri, root, uri_sink, &ctx); + serd_write_uri(serd_relative_uri(uri, base_uri), uri_sink, &ctx); } } else { TRY(st, write_uri_from_node(writer, node)); @@ -1211,7 +1209,7 @@ serd_writer_set_root_uri(SerdWriter* writer, const SerdNode* uri) if (uri) { writer->root_node = serd_node_copy(uri); - serd_uri_parse(serd_node_string(writer->root_node), &writer->root_uri); + writer->root_uri = serd_parse_uri(serd_node_string(writer->root_node)); } return SERD_SUCCESS; diff --git a/test/test_env.c b/test/test_env.c index 198c7e9b..b1371d16 100644 --- a/test/test_env.c +++ b/test/test_env.c @@ -26,6 +26,7 @@ test_env(void) { SerdNode* u = serd_new_string(SERD_URI, "http://example.org/foo"); SerdNode* b = serd_new_string(SERD_CURIE, "invalid"); + SerdNode* e = serd_new_string(SERD_URI, ""); SerdNode* c = serd_new_string(SERD_CURIE, "eg.2:b"); SerdNode* s = serd_new_string(SERD_LITERAL, "hello"); SerdEnv* env = serd_env_new(NULL); @@ -41,9 +42,12 @@ test_env(void) assert(serd_env_expand(env, NULL, &prefix, &suffix) == SERD_BAD_CURIE); + assert(serd_env_set_prefix_from_strings(env, "eg.3", "rel") == SERD_BAD_ARG); + 
assert(!serd_env_expand_node(NULL, u)); assert(!serd_env_expand_node(env, b)); assert(!serd_env_expand_node(env, s)); + assert(!serd_env_expand_node(env, e)); assert(!serd_env_set_base_uri(env, NULL)); @@ -76,6 +80,12 @@ test_env(void) assert(!serd_env_set_base_uri(env, u)); assert(serd_node_equals(serd_env_base_uri(env, NULL), u)); + + SerdNode* xe = serd_env_expand_node(env, e); + assert(xe); + assert(!strcmp(serd_node_string(xe), "http://example.org/foo")); + serd_node_free(xe); + assert(!serd_env_set_base_uri(env, NULL)); assert(!serd_env_base_uri(env, NULL)); @@ -84,6 +94,7 @@ test_env(void) serd_node_free(badpre); serd_node_free(s); serd_node_free(c); + serd_node_free(e); serd_node_free(b); serd_node_free(u); diff --git a/test/test_uri.c b/test/test_uri.c index 0942439f..3fe0ebd5 100644 --- a/test/test_uri.c +++ b/test/test_uri.c @@ -47,7 +47,7 @@ test_file_uri(const char* const hostname, SerdNode* node = serd_new_file_uri(path, hostname, 0); const char* node_str = serd_node_string(node); char* out_hostname = NULL; - char* out_path = serd_file_uri_parse(node_str, &out_hostname); + char* out_path = serd_parse_file_uri(node_str, &out_hostname); assert(!strcmp(node_str, expected_uri)); assert((hostname && out_hostname) || (!hostname && !out_hostname)); assert(!hostname || !strcmp(hostname, out_hostname)); @@ -105,30 +105,74 @@ test_uri_parsing(void) "/C:\\Pointless Space"); #endif - // Test tolerance of parsing junk URI escapes + // Missing trailing '/' after authority + assert(!serd_parse_file_uri("file://truncated", NULL)); - char* out_path = serd_file_uri_parse("file:///foo/%0Xbar", NULL); + // Check that NULL hostname doesn't crash + char* out_path = serd_parse_file_uri("file://me/path", NULL); + assert(!strcmp(out_path, "/path")); + serd_free(out_path); + + // Invalid first escape character + out_path = serd_parse_file_uri("file:///foo/%0Xbar", NULL); + assert(!strcmp(out_path, "/foo/bar")); + serd_free(out_path); + + // Invalid second escape character + 
out_path = serd_parse_file_uri("file:///foo/%X0bar", NULL); assert(!strcmp(out_path, "/foo/bar")); serd_free(out_path); } static void -test_uri_from_string(void) +test_parse_uri(void) { - assert(!serd_new_uri_from_string(NULL, NULL, NULL)); + const SerdStringView base = serd_string("http://example.org/a/b/c/"); + + const SerdURIView base_uri = serd_parse_uri(base.data); + const SerdURIView empty_uri = serd_parse_uri(""); + + SerdNode* const nil = + serd_new_parsed_uri(serd_resolve_uri(empty_uri, base_uri)); - SerdURIView base_uri; - SerdNode* base = - serd_new_uri_from_string("http://example.org/", NULL, &base_uri); - SerdNode* nil = serd_new_uri_from_string(NULL, &base_uri, NULL); - SerdNode* nil2 = serd_new_uri_from_string("", &base_uri, NULL); assert(serd_node_type(nil) == SERD_URI); - assert(!strcmp(serd_node_string(nil), serd_node_string(base))); - assert(serd_node_type(nil2) == SERD_URI); - assert(!strcmp(serd_node_string(nil2), serd_node_string(base))); + assert(!strcmp(serd_node_string(nil), base.data)); + serd_node_free(nil); - serd_node_free(nil2); - serd_node_free(base); +} + +static void +check_is_within(const char* const uri_string, + const char* const base_uri_string, + const bool expected) +{ + const SerdURIView uri = serd_parse_uri(uri_string); + const SerdURIView base_uri = serd_parse_uri(base_uri_string); + + assert(serd_uri_is_within(uri, base_uri) == expected); +} + +static void +test_is_within(void) +{ + static const char* const base = "http://example.org/base/"; + + check_is_within("http://example.org/base/", base, true); + check_is_within("http://example.org/base/kid?q", base, true); + check_is_within("http://example.org/base/kid", base, true); + check_is_within("http://example.org/base/kid#f", base, true); + check_is_within("http://example.org/base/kid?q#f", base, true); + check_is_within("http://example.org/base/kid/grandkid", base, true); + + check_is_within("http://example.org/base", base, false); + 
check_is_within("http://example.org/based", base, false); + check_is_within("http://example.org/bose", base, false); + check_is_within("http://example.org/", base, false); + check_is_within("http://other.org/base", base, false); + check_is_within("ftp://other.org/base", base, false); + check_is_within("base", base, false); + + check_is_within("http://example.org/", "rel", false); } static inline bool @@ -149,31 +193,32 @@ check_relative_uri(const char* const uri_string, assert(base_string); assert(expected_string); - SerdURIView uri = SERD_URI_NULL; - SerdURIView base = SERD_URI_NULL; - SerdURIView result = SERD_URI_NULL; - - SerdNode* uri_node = serd_new_uri_from_string(uri_string, NULL, &uri); - SerdNode* base_node = serd_new_uri_from_string(base_string, NULL, &base); + SerdNode* const uri_node = serd_new_uri(uri_string); + const SerdURIView uri = serd_node_uri_view(uri_node); + SerdNode* const base_node = serd_new_uri(base_string); + const SerdURIView base = serd_node_uri_view(base_node); SerdNode* result_node = NULL; - if (root_string) { - SerdURIView root = SERD_URI_NULL; - SerdNode* root_node = serd_new_uri_from_string(root_string, NULL, &root); + if (!root_string) { + result_node = serd_new_parsed_uri(serd_relative_uri(uri, base)); + } else { + SerdNode* const root_node = serd_new_uri(root_string); + const SerdURIView root = serd_node_uri_view(root_node); + + result_node = serd_uri_is_within(uri, root) + ? 
serd_new_parsed_uri(serd_relative_uri(uri, base)) + : serd_new_uri(uri_string); - result_node = serd_new_relative_uri(&uri, &base, &root, &result); serd_node_free(root_node); - } else { - result_node = serd_new_relative_uri(&uri, &base, NULL, &result); } assert(!strcmp(serd_node_string(result_node), expected_string)); - SerdURIView expected = SERD_URI_NULL; - assert(!serd_uri_parse(expected_string, &expected)); + const SerdURIView result = serd_node_uri_view(result_node); + const SerdURIView expected = serd_parse_uri(expected_string); assert(chunk_equals(&result.scheme, &expected.scheme)); assert(chunk_equals(&result.authority, &expected.authority)); - assert(chunk_equals(&result.path_base, &expected.path_base)); + assert(chunk_equals(&result.path_prefix, &expected.path_prefix)); assert(chunk_equals(&result.path, &expected.path)); assert(chunk_equals(&result.query, &expected.query)); assert(chunk_equals(&result.fragment, &expected.fragment)); @@ -278,13 +323,68 @@ test_relative_uri(void) "http://example.org/a"); } +static void +check_uri_string(const SerdURIView uri, const char* const expected) +{ + SerdNode* const node = serd_new_parsed_uri(uri); + assert(!strcmp(serd_node_string(node), expected)); + serd_node_free(node); +} + +static void +test_uri_resolution(void) +{ + const SerdStringView top = serd_string("http://example.org/t/"); + const SerdStringView base = serd_string("http://example.org/t/b/"); + const SerdStringView sub = serd_string("http://example.org/t/b/s"); + const SerdStringView deep = serd_string("http://example.org/t/b/s/d"); + const SerdStringView other = serd_string("http://example.org/o"); + + const SerdURIView top_uri = serd_parse_uri(top.data); + const SerdURIView base_uri = serd_parse_uri(base.data); + const SerdURIView sub_uri = serd_parse_uri(sub.data); + const SerdURIView deep_uri = serd_parse_uri(deep.data); + const SerdURIView other_uri = serd_parse_uri(other.data); + const SerdURIView rel_sub_uri = serd_relative_uri(sub_uri, 
base_uri); + const SerdURIView resolved_sub_uri = serd_resolve_uri(rel_sub_uri, base_uri); + + check_uri_string(top_uri, top.data); + check_uri_string(base_uri, base.data); + check_uri_string(sub_uri, sub.data); + check_uri_string(deep_uri, deep.data); + check_uri_string(other_uri, other.data); + check_uri_string(rel_sub_uri, "s"); + check_uri_string(resolved_sub_uri, sub.data); + + // Failure to resolve because up-reference escapes path prefix + const SerdURIView up_uri = serd_relative_uri(resolved_sub_uri, deep_uri); + assert(!up_uri.scheme.data); + assert(!up_uri.scheme.length); + assert(!up_uri.authority.data); + assert(!up_uri.authority.length); + assert(!up_uri.path_prefix.data); + assert(!up_uri.path_prefix.length); + assert(!up_uri.path.data); + assert(!up_uri.path.length); + assert(!up_uri.query.data); + assert(!up_uri.query.length); + assert(!up_uri.fragment.data); + assert(!up_uri.fragment.length); + + // Shared path prefix is within URI path prefix + const SerdURIView prefix_uri = serd_relative_uri(resolved_sub_uri, other_uri); + check_uri_string(prefix_uri, "t/b/s"); +} + int main(void) { test_uri_string_has_scheme(); test_uri_parsing(); - test_uri_from_string(); + test_parse_uri(); + test_is_within(); test_relative_uri(); + test_uri_resolution(); printf("Success\n"); return 0; -- cgit v1.2.1