diff options
-rw-r--r-- | ChangeLog | 3 | ||||
-rw-r--r-- | doc/serdi.1 | 4 | ||||
-rw-r--r-- | serd/serd.h | 20 | ||||
-rw-r--r-- | src/node.c | 2 | ||||
-rw-r--r-- | src/serd_internal.h | 51 | ||||
-rw-r--r-- | src/serdi.c | 41 | ||||
-rw-r--r-- | src/uri.c | 150 | ||||
-rw-r--r-- | src/writer.c | 59 | ||||
-rw-r--r-- | tests/test-base-query.out | 1 | ||||
-rw-r--r-- | tests/test-base-query.ttl | 3 | ||||
-rw-r--r-- | tests/test-rel.out | 6 | ||||
-rw-r--r-- | tests/test-rel.ttl | 6 | ||||
-rw-r--r-- | wscript | 5 |
13 files changed, 235 insertions, 116 deletions
@@ -36,6 +36,9 @@ serd (UNRELEASED) unstable; urgency=low * Add serd_uri_serialise_relative() for making URIs relative to a base where possible (by chopping a common prefix and adding dot segments). * Make URIs serialised by the writer properly escape characters. + * Add serd_writer_set_root_uri() and corresponding -r option to serdi to + enable writing URIs with up references (../). + * Resolve dot segments in serd_uri_resolve() instead of at write time. -- David Robillard <d@drobilla.net> (UNRELEASED) diff --git a/doc/serdi.1 b/doc/serdi.1 index 22c32892..04a3b5e8 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -37,6 +37,10 @@ Write output in SYNTAX (`turtle' or `ntriples'). Add PREFIX to blank node IDs. .TP +\fB\-r ROOT_URI\fR +Keep relative URIs within ROOT_URI. + +.TP \fB\-s INPUT\fR Parse INPUT as a string (terminates options). diff --git a/serd/serd.h b/serd/serd.h index 94679a46..40e47499 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -362,11 +362,16 @@ serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream); /** Serialise @c uri relative to @c base with a series of calls to @c sink. + + The @c uri is written as a relative URI iff it is a child of @c base and @c + root. The optional @c root parameter must be a prefix of @c base and can be + used to keep up-references ("../") within a certain namespace. */ SERD_API size_t serd_uri_serialise_relative(const SerdURI* uri, const SerdURI* base, + const SerdURI* root, SerdSink sink, void* stream); @@ -793,6 +798,21 @@ serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri); /** + Set the current root URI. + + The root URI should be a prefix of the base URI. The path of the root URI + is the highest path any relative up-reference can refer to. For example, + with root <file:///foo/root> and base <file:///foo/root/base>, + <file:///foo/root> will be written as <../>, but <file:///foo> will be + written non-relatively as <file:///foo>. 
If the root is not explicitly set, + it defaults to the base URI, so no up-references will be created at all. +*/ +SERD_API +SerdStatus +serd_writer_set_root_uri(SerdWriter* writer, + const SerdNode* uri); + +/** Set a namespace prefix (and emit directive if applicable). Note this function can be safely casted to SerdPrefixSink. @@ -41,7 +41,7 @@ SERD_API SerdNode serd_node_copy(const SerdNode* node) { - if (!node) { + if (!node || !node->buf) { return SERD_NODE_NULL; } diff --git a/src/serd_internal.h b/src/serd_internal.h index f0137f28..08d68fd5 100644 --- a/src/serd_internal.h +++ b/src/serd_internal.h @@ -240,4 +240,55 @@ is_windows_path(const uint8_t* path) && (path[2] == '/' || path[2] == '\\'); } +/* URI utilities */ + +static inline bool +chunk_equals(const SerdChunk* a, const SerdChunk* b) +{ + return a->len == b->len + && !strncmp((const char*)a->buf, (const char*)b->buf, a->len); +} + +static inline size_t +uri_path_len(const SerdURI* uri) +{ + return uri->path_base.len + uri->path.len; +} + +static inline uint8_t +uri_path_at(const SerdURI* uri, size_t i) +{ + if (i < uri->path_base.len) { + return uri->path_base.buf[i]; + } else { + return uri->path.buf[i - uri->path_base.len]; + } +} + +/** Return true iff @p uri is within the base of @p root */ +static inline bool +uri_is_under(const SerdURI* uri, const SerdURI* root) +{ + if (!root || !uri || !root->scheme.len || + !chunk_equals(&root->scheme, &uri->scheme) || + !chunk_equals(&root->authority, &uri->authority)) { + return false; + } + + bool differ = false; + const size_t path_len = uri_path_len(uri); + const size_t root_len = uri_path_len(root); + for (size_t i = 0; i < path_len && i < root_len; ++i) { + if (uri_path_at(uri, i) != uri_path_at(root, i)) { + differ = true; + } + if (differ && uri_path_at(root, i) == '/') { + return false; + } + } + + return true; +} + + #endif // SERD_INTERNAL_H diff --git a/src/serdi.c b/src/serdi.c index ff1f8d51..0236156e 100644 --- a/src/serdi.c +++ 
b/src/serdi.c @@ -45,14 +45,15 @@ print_usage(const char* name, bool error) fprintf(os, "Read and write RDF syntax.\n"); fprintf(os, "Use - for INPUT to read from standard input.\n\n"); fprintf(os, " -b Fast bulk output for large serialisations.\n"); - fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs\n"); + fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs.\n"); fprintf(os, " -f Keep full URIs in input (don't qualify).\n"); - fprintf(os, " -h Display this help and exit\n"); - fprintf(os, " -i SYNTAX Input syntax (`turtle' or `ntriples')\n"); - fprintf(os, " -o SYNTAX Output syntax (`turtle' or `ntriples')\n"); - fprintf(os, " -p PREFIX Add PREFIX to blank node IDs\n"); - fprintf(os, " -s INPUT Parse INPUT as string (terminates options)\n"); - fprintf(os, " -v Display version information and exit\n"); + fprintf(os, " -h Display this help and exit.\n"); + fprintf(os, " -i SYNTAX Input syntax (`turtle' or `ntriples').\n"); + fprintf(os, " -o SYNTAX Output syntax (`turtle' or `ntriples').\n"); + fprintf(os, " -p PREFIX Add PREFIX to blank node IDs.\n"); + fprintf(os, " -r ROOT_URI Keep relative URIs within ROOT_URI.\n"); + fprintf(os, " -s INPUT Parse INPUT as string (terminates options).\n"); + fprintf(os, " -v Display version information and exit.\n"); return error ? 
1 : 0; } @@ -93,6 +94,7 @@ main(int argc, char** argv) const uint8_t* in_name = NULL; const uint8_t* add_prefix = NULL; const uint8_t* chop_prefix = NULL; + const uint8_t* root_uri = NULL; int a = 1; for (; a < argc && argv[a][0] == '-'; ++a) { if (argv[a][1] == '\0') { @@ -130,6 +132,11 @@ main(int argc, char** argv) return bad_arg(argv[0], 'c'); } chop_prefix = (const uint8_t*)argv[a]; + } else if (argv[a][1] == 'r') { + if (++a == argc) { + return bad_arg(argv[0], 'r'); + } + root_uri = (const uint8_t*)argv[a]; } else { fprintf(stderr, "%s: Unknown option `%s'\n", argv[0], argv[a]); return print_usage(argv[0], true); @@ -152,21 +159,17 @@ main(int argc, char** argv) } } - const uint8_t* base_uri_str = NULL; + SerdURI base_uri = SERD_URI_NULL; + SerdNode base = SERD_NODE_NULL; if (a < argc) { // Base URI given on command line - base_uri_str = (const uint8_t*)argv[a]; + base = serd_node_new_uri_from_string( + (const uint8_t*)argv[a], NULL, &base_uri); } else if (from_file) { // Use input file URI - base_uri_str = input; - } else { - base_uri_str = (const uint8_t*)""; + base = serd_node_new_file_uri(input, NULL, &base_uri, false); } - SerdURI base_uri = SERD_URI_NULL; - SerdNode base_uri_node = serd_node_new_uri_from_string( - base_uri_str, &base_uri, &base_uri); - FILE* out_fd = stdout; - SerdEnv* env = serd_env_new(&base_uri_node); + SerdEnv* env = serd_env_new(&base); int output_style = 0; if (output_syntax == SERD_NTRIPLES) { @@ -198,6 +201,8 @@ main(int argc, char** argv) (SerdStatementSink)serd_writer_write_statement, (SerdEndSink)serd_writer_end_anon); + SerdNode root = serd_node_from_string(SERD_URI, root_uri); + serd_writer_set_root_uri(writer, &root); serd_writer_chop_blank_prefix(writer, chop_prefix); serd_reader_add_blank_prefix(reader, add_prefix); @@ -214,7 +219,7 @@ main(int argc, char** argv) serd_writer_finish(writer); serd_writer_free(writer); serd_env_free(env); - serd_node_free(&base_uri_node); + serd_node_free(&base); return (status > 
SERD_FAILURE) ? 1 : 0; } @@ -130,12 +130,12 @@ serd_uri_dump(const SerdURI* uri, FILE* file) fprintf(stderr, "\n"); \ } - PRINT_PART(uri->scheme, "scheme"); + PRINT_PART(uri->scheme, "scheme "); PRINT_PART(uri->authority, "authority"); PRINT_PART(uri->path_base, "path_base"); - PRINT_PART(uri->path, "path"); - PRINT_PART(uri->query, "query"); - PRINT_PART(uri->fragment, "fragment"); + PRINT_PART(uri->path, "path "); + PRINT_PART(uri->query, "query "); + PRINT_PART(uri->fragment, "fragment "); } #endif @@ -317,6 +317,39 @@ remove_dot_segments(const uint8_t* path, size_t len, size_t* up) return begin; } +/// Merge @p base and @p path in-place +static void +merge(SerdChunk* base, SerdChunk* path) +{ + size_t up; + const uint8_t* begin = remove_dot_segments(path->buf, path->len, &up); + const uint8_t* end = path->buf + path->len; + + if (base->buf) { + assert(base->len > 0); + // Find the up'th last slash + const uint8_t* base_last = (base->buf + base->len - 1); + ++up; + do { + if (*base_last == '/') { + --up; + } + } while (up > 0 && (--base_last > base->buf)); + + // Set path prefix + if (*base_last == '/') { + base->len = base_last - base->buf + 1; + } else { + base->len = 0; + base->buf = NULL; + } + } + + // Set path suffix + path->buf = begin; + path->len = end - begin; +} + /// See http://tools.ietf.org/html/rfc3986#section-5.2.2 SERD_API void @@ -344,6 +377,7 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) if (r->path.buf[0] != '/') { t->path_base = base->path; } + merge(&t->path_base, &t->path); t->query = r->query; } t->authority = base->authority; @@ -353,110 +387,77 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) } #ifdef URI_DEBUG - fprintf(stderr, "RESOLVE URI\nBASE:\n"); + fprintf(stderr, "## RESOLVE URI\n# BASE\n"); serd_uri_dump(base, stderr); - fprintf(stderr, "URI:\n"); + fprintf(stderr, "# URI\n"); serd_uri_dump(r, stderr); - fprintf(stderr, "RESULT:\n"); + fprintf(stderr, "# RESULT\n"); 
serd_uri_dump(t, stderr); fprintf(stderr, "\n"); #endif } -/** Write a relative path relative to a base path. */ +/** Write the path of @p uri starting at index @p i */ static size_t -write_rel_path(SerdSink sink, - void* stream, - const SerdChunk* base, - const SerdChunk* path) +write_path_tail(SerdSink sink, void* stream, const SerdURI* uri, size_t i) { - size_t up; - size_t len = 0; - const uint8_t* begin = remove_dot_segments(path->buf, path->len, &up); - const uint8_t* end = path->buf + path->len; - - if (base && base->buf) { - // Find the up'th last slash - const uint8_t* base_last = (base->buf + base->len - 1); - ++up; - do { - if (*base_last == '/') { - --up; - } - } while (up > 0 && (--base_last > base->buf)); - - // Write base URI prefix - if (*base_last == '/') { - const size_t base_len = base_last - base->buf + 1; - len += sink(base->buf, base_len, stream); + size_t len = 0; + if (i < uri->path_base.len) { + len += sink(uri->path_base.buf + i, uri->path_base.len - i, stream); + } + if (uri->path.buf) { + if (i < uri->path_base.len) { + len += sink(uri->path.buf, uri->path.len, stream); + } else { + const size_t j = (i - uri->path_base.len); + len += sink(uri->path.buf + j, uri->path.len - j, stream); } } - - // Write URI suffix - len += sink(begin, end - begin, stream); - return len; } -/** Write an absolute path relative to a base path. */ +/** Write the path of @p uri relative to the path of @p base. */ static size_t -write_abs_path(SerdSink sink, - void* stream, - const SerdChunk* base, - const SerdChunk* path) +write_rel_path(SerdSink sink, + void* stream, + const SerdURI* uri, + const SerdURI* base) { - size_t len = 0; - const size_t min_len = (path->len < base->len) ? path->len : base->len; + const size_t path_len = uri_path_len(uri); + const size_t base_len = uri_path_len(base); + const size_t min_len = (path_len < base_len) ? 
path_len : base_len; // Find the last separator common to both paths size_t last_shared_sep = 0; size_t i = 0; - for (; i < min_len && path->buf[i] == base->buf[i]; ++i) { - if (path->buf[i] == '/') { + for (; i < min_len && uri_path_at(uri, i) == uri_path_at(base, i); ++i) { + if (uri_path_at(uri, i) == '/') { last_shared_sep = i; } } - if (i == path->len && i == base->len) { // Paths are identical + if (i == path_len && i == base_len) { // Paths are identical return 0; } else if (last_shared_sep == 0) { // No common components - return sink(path->buf, path->len, stream); + return write_path_tail(sink, stream, uri, 0); } // Find the number of up references ("..") required size_t up = 0; - for (size_t i = last_shared_sep + 1; i < base->len; ++i) { - if (base->buf[i] == '/') { + for (size_t i = last_shared_sep + 1; i < base_len; ++i) { + if (uri_path_at(base, i) == '/') { ++up; } } // Write up references + size_t len = 0; for (size_t i = 0; i < up; ++i) { len += sink("../", 3, stream); } // Write suffix - const size_t suffix_len = path->len - last_shared_sep - 1; - len += sink(path->buf + last_shared_sep + 1, suffix_len, stream); - - return len; -} - -static inline bool -chunk_equals(const SerdChunk* a, const SerdChunk* b) -{ - return a->len == b->len - && !strncmp((const char*)a->buf, (const char*)b->buf, a->len); -} - -/** Return true iff both are absolute URIs on the same host. 
*/ -static inline bool -same_host(const SerdURI* base, const SerdURI* uri) -{ - return base && uri && base->scheme.len - && chunk_equals(&base->scheme, &uri->scheme) - && chunk_equals(&base->authority, &uri->authority); + return len += write_path_tail(sink, stream, uri, last_shared_sep + 1); } /// See http://tools.ietf.org/html/rfc3986#section-5.3 @@ -464,13 +465,14 @@ SERD_API size_t serd_uri_serialise_relative(const SerdURI* uri, const SerdURI* base, + const SerdURI* root, SerdSink sink, void* stream) { size_t len = 0; - const bool relative = same_host(base, uri); + const bool relative = uri_is_under(uri, root ? root : base); if (relative) { - len = write_abs_path(sink, stream, base ? &base->path : 0, &uri->path); + len = write_rel_path(sink, stream, uri, base); } if (!relative || (!len && base->query.buf)) { if (uri->scheme.buf) { @@ -481,11 +483,7 @@ serd_uri_serialise_relative(const SerdURI* uri, len += sink("//", 2, stream); len += sink(uri->authority.buf, uri->authority.len, stream); } - if (uri->path.buf) { - len += write_rel_path(sink, stream, &uri->path_base, &uri->path); - } else { - len += sink(uri->path_base.buf, uri->path_base.len, stream); - } + len += write_path_tail(sink, stream, uri, 0); } if (uri->query.buf) { len += sink("?", 1, stream); @@ -503,5 +501,5 @@ SERD_API size_t serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream) { - return serd_uri_serialise_relative(uri, NULL, sink, stream); + return serd_uri_serialise_relative(uri, NULL, NULL, sink, stream); } diff --git a/src/writer.c b/src/writer.c index f538e486..0870c785 100644 --- a/src/writer.c +++ b/src/writer.c @@ -77,6 +77,8 @@ struct SerdWriterImpl { SerdSyntax syntax; SerdStyle style; SerdEnv* env; + SerdNode root_node; + SerdURI root_uri; SerdURI base_uri; SerdStack anon_stack; SerdBulkSink bulk_sink; @@ -411,28 +413,24 @@ write_node(SerdWriter* writer, break; } } - if (!has_scheme && (writer->style & SERD_STYLE_RESOLVED)) { - SerdURI uri; - serd_uri_parse(node->buf, 
&uri); - SerdURI abs_uri; - serd_uri_resolve(&uri, &writer->base_uri, &abs_uri); - sink("<", 1, writer); - serd_uri_serialise(&abs_uri, uri_sink, writer); - sink(">", 1, writer); - break; - } else if (has_scheme && (writer->syntax == SERD_TURTLE) - && (writer->style & SERD_STYLE_RESOLVED)) { - SerdURI uri; + sink("<", 1, writer); + if (writer->style & SERD_STYLE_RESOLVED) { + SerdURI in_base_uri, uri, abs_uri; + serd_env_get_base_uri(writer->env, &in_base_uri); serd_uri_parse(node->buf, &uri); - sink("<", 1, writer); - serd_uri_serialise_relative( - &uri, &writer->base_uri, uri_sink, writer); - sink(">", 1, writer); - break; + serd_uri_resolve(&uri, &in_base_uri, &abs_uri); + bool rooted = uri_is_under(&writer->base_uri, &writer->root_uri); + SerdURI* root = rooted ? &writer->root_uri : & writer->base_uri; + if (!uri_is_under(&abs_uri, root) || + writer->syntax == SERD_NTRIPLES) { + serd_uri_serialise(&abs_uri, uri_sink, writer); + } else { + serd_uri_serialise_relative( + &uri, &writer->base_uri, root, uri_sink, writer); + } + } else { + write_text(writer, WRITE_URI, node->buf, node->n_bytes); } - - sink("<", 1, writer); - write_text(writer, WRITE_URI, node->buf, node->n_bytes); sink(">", 1, writer); default: break; @@ -637,6 +635,8 @@ serd_writer_new(SerdSyntax syntax, writer->syntax = syntax; writer->style = style; writer->env = env; + writer->root_node = SERD_NODE_NULL; + writer->root_uri = SERD_URI_NULL; writer->base_uri = base_uri ? 
*base_uri : SERD_URI_NULL; writer->anon_stack = serd_stack_new(sizeof(WriteContext)); writer->sink = sink; @@ -687,6 +687,7 @@ serd_writer_set_base_uri(SerdWriter* writer, sink(uri->buf, uri->n_bytes, writer); sink("> .\n", 4, writer); } + writer->indent = 0; return reset_context(writer, false); } return SERD_ERR_UNKNOWN; @@ -694,6 +695,22 @@ serd_writer_set_base_uri(SerdWriter* writer, SERD_API SerdStatus +serd_writer_set_root_uri(SerdWriter* writer, + const SerdNode* uri) +{ + serd_node_free(&writer->root_node); + if (uri && uri->buf) { + writer->root_node = serd_node_copy(uri); + serd_uri_parse(uri->buf, &writer->root_uri); + } else { + writer->root_node = SERD_NODE_NULL; + writer->root_uri = SERD_URI_NULL; + } + return SERD_SUCCESS; +} + +SERD_API +SerdStatus serd_writer_set_prefix(SerdWriter* writer, const SerdNode* name, const SerdNode* uri) @@ -710,6 +727,7 @@ serd_writer_set_prefix(SerdWriter* writer, write_text(writer, WRITE_URI, uri->buf, uri->n_bytes); sink("> .\n", 4, writer); } + writer->indent = 0; return reset_context(writer, false); } return SERD_ERR_UNKNOWN; @@ -725,6 +743,7 @@ serd_writer_free(SerdWriter* writer) if (writer->style & SERD_STYLE_BULK) { serd_bulk_sink_free(&writer->bulk_sink); } + serd_node_free(&writer->root_node); free(writer); } diff --git a/tests/test-base-query.out b/tests/test-base-query.out new file mode 100644 index 00000000..d40c2f6b --- /dev/null +++ b/tests/test-base-query.out @@ -0,0 +1 @@ +<http://example.org/a/b/c/d;p?q> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . diff --git a/tests/test-base-query.ttl b/tests/test-base-query.ttl new file mode 100644 index 00000000..77638817 --- /dev/null +++ b/tests/test-base-query.ttl @@ -0,0 +1,3 @@ +@base <http://example.org/a/b/c/d;p?q> . + +<> a <http://example.org/Thing> .
\ No newline at end of file diff --git a/tests/test-rel.out b/tests/test-rel.out new file mode 100644 index 00000000..01c18c2f --- /dev/null +++ b/tests/test-rel.out @@ -0,0 +1,6 @@ +<http://www.w3.org/2001/sw/DataAccess/df1/tests/a/b/c> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1/tests/test-rel.ttl> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1/tests/test-not.ttl> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1/tests/a/b/c> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . +<http://www.w3.org/> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . diff --git a/tests/test-rel.ttl b/tests/test-rel.ttl new file mode 100644 index 00000000..a5927cd3 --- /dev/null +++ b/tests/test-rel.ttl @@ -0,0 +1,6 @@ +<http://www.w3.org/2001/sw/DataAccess/df1/tests/a/b/c> a <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1> a <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1/tests/test-rel.ttl> a <http://example.org/Thing> . +<http://www.w3.org/2001/sw/DataAccess/df1/tests/test-not.ttl> a <http://example.org/Thing> . +<a/b/c> a <http://example.org/Thing> . +<http://www.w3.org/> a <http://example.org/Thing> . 
@@ -9,7 +9,7 @@ from waflib.extras import autowaf as autowaf import waflib.Logs as Logs, waflib.Options as Options # Version of this package (even if built as a child) -SERD_VERSION = '0.12.0' +SERD_VERSION = '0.13.0' SERD_MAJOR_VERSION = '0' # Library version (UNIX style major, minor, micro) @@ -341,6 +341,7 @@ def test(ctx): 'serdi_static -z > %s' % nul, 'serdi_static -p > %s' % nul, 'serdi_static -c > %s' % nul, + 'serdi_static -r > %s' % nul, 'serdi_static -i illegal > %s' % nul, 'serdi_static -o illegal > %s' % nul, 'serdi_static -i turtle > %s' % nul, @@ -386,6 +387,8 @@ def test(ctx): flags += '-b' if (num % 5 == 0): flags += ' -f' + if (num % 3 == 0): + flags += ' -r http://www.w3.org/' base_uri = 'http://www.w3.org/2001/sw/DataAccess/df1/' + test.replace('\\', '/') out_filename = test + '.thru' commands += [ |