diff options
author | David Robillard <d@drobilla.net> | 2012-03-08 15:57:20 +0000 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2012-03-08 15:57:20 +0000 |
commit | f4365012b555699b916dbeec4d81425bf663579c (patch) | |
tree | 27807b4430269cdb279d32e9f734f119b8bb7191 /src | |
parent | 7b022006c47586dc00ed8bca85fcb0bdf5f9465d (diff) | |
download | serd-f4365012b555699b916dbeec4d81425bf663579c.tar.gz serd-f4365012b555699b916dbeec4d81425bf663579c.tar.bz2 serd-f4365012b555699b916dbeec4d81425bf663579c.zip |
Add serd_writer_get_env().
Add serd_node_new_uri_from_path() and serd_file_uri_parse(), and implement
proper hex escaping when converting between URIs and file paths.
Add serd_uri_serialise_relative() for making URIs relative to a base where
possible (by chopping a common prefix and adding dot segments).
Make the writer properly escape characters in the URIs it serialises.
git-svn-id: http://svn.drobilla.net/serd/trunk@330 490d8e77-9747-427b-9fa3-0b8f29cee8a0
Diffstat (limited to 'src')
-rw-r--r-- | src/node.c | 62 | ||||
-rw-r--r-- | src/serd_internal.h | 7 | ||||
-rw-r--r-- | src/serdi.c | 3 | ||||
-rw-r--r-- | src/uri.c | 235 | ||||
-rw-r--r-- | src/writer.c | 102 |
5 files changed, 313 insertions, 96 deletions
@@ -112,6 +112,68 @@ serd_node_new_uri_from_string(const uint8_t* str, return serd_node_new_uri(&uri, base, out); // Resolve/Serialise } +static inline bool +is_uri_path_char(const uint8_t c) +{ + if (is_alpha(c) || is_digit(c)) { + return true; + } + switch (c) { + case '-': case '.': case '_': case '~': // unreserved + case ':': case '@': // pchar + case '/': // separator + // sub-delims + case '!': case '$': case '&': case '\'': case '(': case ')': + case '*': case '+': case ',': case ';': case '=': + return true; + default: + return false; + } +} + +SERD_API +SerdNode +serd_node_new_uri_from_path(const uint8_t* path, + const uint8_t* hostname, + SerdURI* out) +{ + const size_t path_len = strlen((const char*)path); + const size_t hostname_len = hostname ? strlen((const char*)hostname) : 0; + const bool evil = is_windows_path(path); + size_t uri_len = 0; + uint8_t* uri = NULL; + + if (path[0] == '/' || is_windows_path(path)) { + uri_len = strlen("file://") + hostname_len + evil; + uri = (uint8_t*)malloc(uri_len + 1); + snprintf((char*)uri, uri_len + 1, "file://%s%s", + hostname ? (const char*)hostname : "", + evil ? 
"/" : ""); + } + + SerdChunk chunk = { uri, uri_len }; + for (size_t i = 0; i < path_len; ++i) { + if (evil && path[i] == '\\') { + serd_chunk_sink("/", 1, &chunk); + } else if (path[i] == '%') { + serd_chunk_sink("%%", 2, &chunk); + } else if (is_uri_path_char(path[i])) { + serd_chunk_sink(path + i, 1, &chunk); + } else { + char escape[4] = { '%', 0, 0, 0 }; + snprintf(escape + 1, sizeof(escape) - 1, "%X", path[i]); + serd_chunk_sink(escape, 3, &chunk); + } + } + serd_chunk_sink_finish(&chunk); + + if (out) { + serd_uri_parse(chunk.buf, out); + } + + return serd_node_from_string(SERD_URI, chunk.buf); +} + SERD_API SerdNode serd_node_new_uri(const SerdURI* uri, const SerdURI* base, SerdURI* out) diff --git a/src/serd_internal.h b/src/serd_internal.h index 6e535402..f0137f28 100644 --- a/src/serd_internal.h +++ b/src/serd_internal.h @@ -233,4 +233,11 @@ is_base64(const uint8_t c) return is_alpha(c) || is_digit(c) || c == '+' || c == '/' || c == '='; } +static inline bool +is_windows_path(const uint8_t* path) +{ + return is_alpha(path[0]) && (path[1] == ':' || path[1] == '|') + && (path[2] == '/' || path[2] == '\\'); +} + #endif // SERD_INTERNAL_H diff --git a/src/serdi.c b/src/serdi.c index 661b60e0..ff1f8d51 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -178,7 +178,8 @@ main(int argc, char** argv) } } - if (input_syntax != SERD_NTRIPLES) { // Base URI may change (@base) + if (input_syntax != SERD_NTRIPLES // Base URI may change (@base) + || (output_syntax == SERD_TURTLE)) { output_style |= SERD_STYLE_RESOLVED; } @@ -21,21 +21,12 @@ // #define URI_DEBUG 1 -static inline bool -is_windows_path(const uint8_t* path) -{ - return is_alpha(path[0]) && (path[1] == ':' || path[1] == '|') - && (path[2] == '/' || path[2] == '\\'); -} - SERD_API const uint8_t* serd_uri_to_path(const uint8_t* uri) { const uint8_t* path = uri; - if (uri[0] == '/' || is_windows_path(uri)) { - return uri; - } else if (serd_uri_string_has_scheme(uri)) { + if (!is_windows_path(uri) && 
serd_uri_string_has_scheme(uri)) { if (strncmp((const char*)uri, "file:", 5)) { fprintf(stderr, "Non-file URI `%s'\n", uri); return NULL; @@ -55,6 +46,56 @@ serd_uri_to_path(const uint8_t* uri) } SERD_API +uint8_t* +serd_file_uri_parse(const uint8_t* uri, uint8_t** hostname) +{ + const uint8_t* path = uri; + if (hostname) { + *hostname = NULL; + } + if (!strncmp((const char*)uri, "file://", 7)) { + const uint8_t* auth = uri + 7; + if (*auth == '/') { // No hostname + path = auth; + } else { // Has hostname + if (!(path = (const uint8_t*)strchr((const char*)auth, '/'))) { + return NULL; + } + if (hostname) { + *hostname = (uint8_t*)calloc(1, path - auth + 1); + memcpy(*hostname, auth, path - auth); + } + } + } + + if (is_windows_path(path + 1)) { + ++path; + } + + SerdChunk chunk = { NULL, 0 }; + for (const uint8_t* s = path; *s; ++s) { + if (*s == '%') { + if (*(s + 1) == '%') { + serd_chunk_sink("%", 1, &chunk); + ++s; + } else if (is_digit(*(s + 1)) && is_digit(*(s + 2))) { + const uint8_t code[3] = { *(s + 1), *(s + 2), 0 }; + uint32_t num; + sscanf((const char*)code, "%X", &num); + const uint8_t c = num; + serd_chunk_sink(&c, 1, &chunk); + s += 2; + } else { + s += 2; // Junk escape, ignore + } + } else { + serd_chunk_sink(s, 1, &chunk); + } + } + return serd_chunk_sink_finish(&chunk); +} + +SERD_API bool serd_uri_string_has_scheme(const uint8_t* utf8) { @@ -276,12 +317,11 @@ remove_dot_segments(const uint8_t* path, size_t len, size_t* up) return begin; } +/// See http://tools.ietf.org/html/rfc3986#section-5.2.2 SERD_API void serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) { - // See http://tools.ietf.org/html/rfc3986#section-5.2.2 - t->path_base.buf = NULL; t->path_base.len = 0; if (r->scheme.len) { @@ -323,66 +363,147 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t) #endif } -SERD_API -size_t -serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream) +/** Write a relative path relative to a base path. 
*/ +static size_t +write_rel_path(SerdSink sink, + void* stream, + const SerdChunk* base, + const SerdChunk* path) { - // See http://tools.ietf.org/html/rfc3986#section-5.3 - - size_t write_size = 0; -#define WRITE(buf, len) \ - write_size += len; \ - sink((const uint8_t*)buf, len, stream); + size_t up; + size_t len = 0; + const uint8_t* begin = remove_dot_segments(path->buf, path->len, &up); + const uint8_t* end = path->buf + path->len; + + if (base && base->buf) { + // Find the up'th last slash + const uint8_t* base_last = (base->buf + base->len - 1); + ++up; + do { + if (*base_last == '/') { + --up; + } + } while (up > 0 && (--base_last > base->buf)); - if (uri->scheme.buf) { - WRITE(uri->scheme.buf, uri->scheme.len); - WRITE(":", 1); + // Write base URI prefix + if (*base_last == '/') { + const size_t base_len = base_last - base->buf + 1; + len += sink(base->buf, base_len, stream); + } } - if (uri->authority.buf) { - WRITE("//", 2); - WRITE(uri->authority.buf, uri->authority.len); + + // Write URI suffix + len += sink(begin, end - begin, stream); + + return len; +} + +/** Write an absolute path relative to a base path. */ +static size_t +write_abs_path(SerdSink sink, + void* stream, + const SerdChunk* base, + const SerdChunk* path) +{ + size_t len = 0; + const size_t min_len = (path->len < base->len) ? 
path->len : base->len; + + // Find the last separator common to both paths + size_t last_shared_sep = 0; + size_t i = 0; + for (; i < min_len && path->buf[i] == base->buf[i]; ++i) { + if (path->buf[i] == '/') { + last_shared_sep = i; + } } - if (!uri->path.buf) { - WRITE(uri->path_base.buf, uri->path_base.len); - } else { - const uint8_t* begin = uri->path.buf; - const uint8_t* const end = uri->path.buf + uri->path.len; - size_t up; - begin = remove_dot_segments(uri->path.buf, uri->path.len, &up); + if (i == path->len && i == base->len) { // Paths are identical + return 0; + } else if (last_shared_sep == 0) { // No common components + return sink(path->buf, path->len, stream); + } - if (uri->path_base.buf) { - // Find the up'th last slash - const uint8_t* base_last = (uri->path_base.buf - + uri->path_base.len - 1); + // Find the number of up references ("..") required + size_t up = 0; + for (size_t i = last_shared_sep + 1; i < base->len; ++i) { + if (base->buf[i] == '/') { ++up; - do { - if (*base_last == '/') { - --up; - } - } while (up > 0 && (--base_last > uri->path_base.buf)); + } + } - // Write base URI prefix - if (*base_last == '/') { - const size_t base_len = base_last - uri->path_base.buf + 1; - WRITE(uri->path_base.buf, base_len); - } + // Write up references + for (size_t i = 0; i < up; ++i) { + len += sink("../", 3, stream); + } + + // Write suffix + const size_t suffix_len = path->len - last_shared_sep - 1; + len += sink(path->buf + last_shared_sep + 1, suffix_len, stream); + + return len; +} + +static inline bool +chunk_equals(const SerdChunk* a, const SerdChunk* b) +{ + return a->len == b->len + && !strncmp((const char*)a->buf, (const char*)b->buf, a->len); +} +/** Return true iff both are absolute URIs on the same host. 
*/ +static inline bool +same_host(const SerdURI* base, const SerdURI* uri) +{ + return base && uri && base->scheme.len + && chunk_equals(&base->scheme, &uri->scheme) + && chunk_equals(&base->authority, &uri->authority); +} + +/// See http://tools.ietf.org/html/rfc3986#section-5.3 +SERD_API +size_t +serd_uri_serialise_relative(const SerdURI* uri, + const SerdURI* base, + SerdSink sink, + void* stream) +{ + size_t len = 0; + const bool relative = same_host(base, uri); + if (relative) { + len = write_abs_path(sink, stream, base ? &base->path : 0, &uri->path); + } + if (!relative || (!len && base->query.buf)) { + if (uri->scheme.buf) { + len += sink(uri->scheme.buf, uri->scheme.len, stream); + len += sink(":", 1, stream); + } + if (uri->authority.buf) { + len += sink("//", 2, stream); + len += sink(uri->authority.buf, uri->authority.len, stream); + } + if (uri->path.buf && uri->path_base.buf) { + len += write_rel_path(sink, stream, &uri->path_base, &uri->path); + } else if (uri->path.buf) { + len += write_rel_path(sink, stream, NULL, &uri->path); } else { - // Relative path is just query or fragment, append to base URI - WRITE(uri->path_base.buf, uri->path_base.len); + len += sink(uri->path_base.buf, uri->path_base.len, stream); } - - // Write URI suffix - WRITE(begin, end - begin); } if (uri->query.buf) { - WRITE("?", 1); - WRITE(uri->query.buf, uri->query.len); + len += sink("?", 1, stream); + len += sink(uri->query.buf, uri->query.len, stream); } if (uri->fragment.buf) { // Note uri->fragment.buf includes the leading `#' - WRITE(uri->fragment.buf, uri->fragment.len); + len += sink(uri->fragment.buf, uri->fragment.len, stream); } - return write_size; + return len; +} + +/// See http://tools.ietf.org/html/rfc3986#section-5.3 +SERD_API +size_t +serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream) +{ + return serd_uri_serialise_relative(uri, NULL, sink, stream); } diff --git a/src/writer.c b/src/writer.c index eb9c5b2d..849d4e7c 100644 --- a/src/writer.c 
+++ b/src/writer.c @@ -131,23 +131,24 @@ sink(const void* buf, size_t len, SerdWriter* writer) } } -static bool +static size_t write_text(SerdWriter* writer, TextContext ctx, - const uint8_t* utf8, size_t n_bytes, uint8_t terminator) + const uint8_t* utf8, size_t n_bytes) { - char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + size_t len = 0; + char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (size_t i = 0; i < n_bytes;) { // Fast bulk write for long strings of printable ASCII size_t j = i; for (; j < n_bytes; ++j) { - if (utf8[j] == terminator || utf8[j] == '\\' || utf8[j] == '"' + if (utf8[j] == '>' || utf8[j] == '\\' || utf8[j] == '"' || (!in_range(utf8[j], 0x20, 0x7E))) { break; } } if (j > i) { - sink(&utf8[i], j - i, writer); + len += sink(&utf8[i], j - i, writer); i = j; continue; } @@ -155,27 +156,29 @@ write_text(SerdWriter* writer, TextContext ctx, uint8_t in = utf8[i++]; if (ctx == WRITE_LONG_STRING) { if (in == '\\') { - sink("\\\\", 2, writer); continue; + len += sink("\\\\", 2, writer); continue; } else if (in == '\"' && i == n_bytes) { - sink("\\\"", 2, writer); continue; // '"' at end of string + len += sink("\\\"", 2, writer); continue; // '"' at string end } } else { switch (in) { - case '\\': sink("\\\\", 2, writer); continue; - case '\n': sink("\\n", 2, writer); continue; - case '\r': sink("\\r", 2, writer); continue; - case '\t': sink("\\t", 2, writer); continue; + case '\\': len += sink("\\\\", 2, writer); continue; + case '\n': len += sink("\\n", 2, writer); continue; + case '\r': len += sink("\\r", 2, writer); continue; + case '\t': len += sink("\\t", 2, writer); continue; case '"': - if (terminator == '"') { - sink("\\\"", 2, writer); + if (ctx == WRITE_STRING) { + len += sink("\\\"", 2, writer); continue; } // else fall-through default: break; } - if (in == terminator) { - snprintf(escape, sizeof(escape), "\\u%04X", terminator); - sink(escape, 6, writer); + if ((ctx == WRITE_STRING && in == '"') || + (ctx == WRITE_URI && in 
== '>')) { + snprintf(escape, sizeof(escape), "\\u%04X", + ctx == WRITE_STRING ? '"' : '>'); + len += sink(escape, 6, writer); continue; } } @@ -186,10 +189,10 @@ write_text(SerdWriter* writer, TextContext ctx, c = in & 0x7F; if (in_range(c, 0x20, 0x7E) || (is_space(c) && ctx == WRITE_LONG_STRING)) { - sink(&in, 1, writer); // Print ASCII character + len += sink(&in, 1, writer); // Print ASCII character } else { snprintf(escape, sizeof(escape), "\\u%04X", c); - sink(escape, 6, writer); // Escape ASCII control character + len += sink(escape, 6, writer); // ASCII control character } continue; } else if ((in & 0xE0) == 0xC0) { // Starts with `110' @@ -204,14 +207,14 @@ write_text(SerdWriter* writer, TextContext ctx, } else { fprintf(stderr, "Invalid UTF-8: %X\n", in); const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD }; - sink(replacement_char, sizeof(replacement_char), writer); - return false; + len += sink(replacement_char, sizeof(replacement_char), writer); + return 0; } if (ctx != WRITE_URI && !(writer->style & SERD_STYLE_ASCII)) { // Write UTF-8 character directly to UTF-8 output // TODO: Always parse and validate character? 
- sink(utf8 + i - 1, size, writer); + len += sink(utf8 + i - 1, size, writer); i += size - 1; continue; } @@ -228,13 +231,19 @@ write_text(SerdWriter* writer, TextContext ctx, if (c < 0xFFFF) { snprintf(escape, sizeof(escape), "\\u%04X", c); - sink(escape, 6, writer); + len += sink(escape, 6, writer); } else { snprintf(escape, sizeof(escape), "\\U%08X", c); - sink(escape, 10, writer); + len += sink(escape, 10, writer); } } - return true; + return len; +} + +static size_t +uri_sink(const void* buf, size_t len, void* stream) +{ + return write_text((SerdWriter*)stream, WRITE_URI, buf, len); } static void @@ -299,6 +308,7 @@ write_node(SerdWriter* writer, { SerdChunk uri_prefix; SerdChunk uri_suffix; + bool has_scheme; switch (node->type) { case SERD_BLANK: if (writer->syntax != SERD_NTRIPLES @@ -343,8 +353,8 @@ write_node(SerdWriter* writer, return false; } sink("<", 1, writer); - write_text(writer, WRITE_URI, uri_prefix.buf, uri_prefix.len, '>'); - write_text(writer, WRITE_URI, uri_suffix.buf, uri_suffix.len, '>'); + write_text(writer, WRITE_URI, uri_prefix.buf, uri_prefix.len); + write_text(writer, WRITE_URI, uri_suffix.buf, uri_suffix.len); sink(">", 1, writer); break; case SERD_TURTLE: @@ -365,12 +375,11 @@ write_node(SerdWriter* writer, if (writer->syntax != SERD_NTRIPLES && (node->flags & (SERD_HAS_NEWLINE|SERD_HAS_QUOTE))) { sink("\"\"\"", 3, writer); - write_text(writer, WRITE_LONG_STRING, - node->buf, node->n_bytes, '\0'); + write_text(writer, WRITE_LONG_STRING, node->buf, node->n_bytes); sink("\"\"\"", 3, writer); } else { sink("\"", 1, writer); - write_text(writer, WRITE_STRING, node->buf, node->n_bytes, '"'); + write_text(writer, WRITE_STRING, node->buf, node->n_bytes); sink("\"", 1, writer); } if (lang && lang->buf) { @@ -382,6 +391,7 @@ write_node(SerdWriter* writer, } break; case SERD_URI: + has_scheme = serd_uri_string_has_scheme(node->buf); if ((writer->syntax == SERD_TURTLE) && !strcmp((const char*)node->buf, NS_RDF "type")) { sink("a", 1, writer); 
@@ -390,29 +400,38 @@ write_node(SerdWriter* writer, && !strcmp((const char*)node->buf, NS_RDF "nil")) { sink("()", 2, writer); break; - } else if ((writer->style & SERD_STYLE_CURIED) - && serd_uri_string_has_scheme(node->buf)) { + } else if (has_scheme && (writer->style & SERD_STYLE_CURIED)) { SerdNode prefix; SerdChunk suffix; if (serd_env_qualify(writer->env, node, &prefix, &suffix)) { - write_text(writer, WRITE_URI, prefix.buf, prefix.n_bytes, '>'); + write_text(writer, WRITE_URI, prefix.buf, prefix.n_bytes); sink(":", 1, writer); - write_text(writer, WRITE_URI, suffix.buf, suffix.len, '>'); + write_text(writer, WRITE_URI, suffix.buf, suffix.len); break; } - } else if ((writer->style & SERD_STYLE_RESOLVED) - && !serd_uri_string_has_scheme(node->buf)) { + } + if (!has_scheme && (writer->style & SERD_STYLE_RESOLVED)) { SerdURI uri; serd_uri_parse(node->buf, &uri); SerdURI abs_uri; serd_uri_resolve(&uri, &writer->base_uri, &abs_uri); sink("<", 1, writer); - serd_uri_serialise(&abs_uri, (SerdSink)sink, writer); + serd_uri_serialise(&abs_uri, uri_sink, writer); + sink(">", 1, writer); + break; + } else if (has_scheme && (writer->syntax == SERD_TURTLE) + && (writer->style & SERD_STYLE_RESOLVED)) { + SerdURI uri; + serd_uri_parse(node->buf, &uri); + sink("<", 1, writer); + serd_uri_serialise_relative( + &uri, &writer->base_uri, uri_sink, writer); sink(">", 1, writer); break; } + sink("<", 1, writer); - write_text(writer, WRITE_URI, node->buf, node->n_bytes, '>'); + write_text(writer, WRITE_URI, node->buf, node->n_bytes); sink(">", 1, writer); default: break; @@ -687,7 +706,7 @@ serd_writer_set_prefix(SerdWriter* writer, sink("@prefix ", 8, writer); sink(name->buf, name->n_bytes, writer); sink(": <", 3, writer); - write_text(writer, WRITE_URI, uri->buf, uri->n_bytes, '>'); + write_text(writer, WRITE_URI, uri->buf, uri->n_bytes); sink("> .\n", 4, writer); } return reset_context(writer, false); @@ -709,6 +728,13 @@ serd_writer_free(SerdWriter* writer) } SERD_API 
+SerdEnv* +serd_writer_get_env(SerdWriter* writer) +{ + return writer->env; +} + +SERD_API size_t serd_file_sink(const void* buf, size_t len, void* stream) { |