aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2012-03-08 15:57:20 +0000
committerDavid Robillard <d@drobilla.net>2012-03-08 15:57:20 +0000
commitf4365012b555699b916dbeec4d81425bf663579c (patch)
tree27807b4430269cdb279d32e9f734f119b8bb7191 /src
parent7b022006c47586dc00ed8bca85fcb0bdf5f9465d (diff)
downloadserd-f4365012b555699b916dbeec4d81425bf663579c.tar.gz
serd-f4365012b555699b916dbeec4d81425bf663579c.tar.bz2
serd-f4365012b555699b916dbeec4d81425bf663579c.zip
Add serd_writer_get_env().
Add serd_node_new_uri_from_path() and serd_file_uri_parse() and implement proper URI to/from path hex escaping, etc. Add serd_uri_serialise_relative() for making URIs relative to a base where possible (by chopping a common prefix and adding dot segments). Make URIs serialised by the writer properly escape characters. git-svn-id: http://svn.drobilla.net/serd/trunk@330 490d8e77-9747-427b-9fa3-0b8f29cee8a0
Diffstat (limited to 'src')
-rw-r--r--src/node.c62
-rw-r--r--src/serd_internal.h7
-rw-r--r--src/serdi.c3
-rw-r--r--src/uri.c235
-rw-r--r--src/writer.c102
5 files changed, 313 insertions, 96 deletions
diff --git a/src/node.c b/src/node.c
index 4d8c620f..dc352389 100644
--- a/src/node.c
+++ b/src/node.c
@@ -112,6 +112,68 @@ serd_node_new_uri_from_string(const uint8_t* str,
return serd_node_new_uri(&uri, base, out); // Resolve/Serialise
}
+static inline bool
+is_uri_path_char(const uint8_t c)
+{
+ if (is_alpha(c) || is_digit(c)) {
+ return true;
+ }
+ switch (c) {
+ case '-': case '.': case '_': case '~': // unreserved
+ case ':': case '@': // pchar
+ case '/': // separator
+ // sub-delims
+ case '!': case '$': case '&': case '\'': case '(': case ')':
+ case '*': case '+': case ',': case ';': case '=':
+ return true;
+ default:
+ return false;
+ }
+}
+
+SERD_API
+SerdNode
+serd_node_new_uri_from_path(const uint8_t* path,
+ const uint8_t* hostname,
+ SerdURI* out)
+{
+ const size_t path_len = strlen((const char*)path);
+ const size_t hostname_len = hostname ? strlen((const char*)hostname) : 0;
+ const bool evil = is_windows_path(path);
+ size_t uri_len = 0;
+ uint8_t* uri = NULL;
+
+ if (path[0] == '/' || is_windows_path(path)) {
+ uri_len = strlen("file://") + hostname_len + evil;
+ uri = (uint8_t*)malloc(uri_len + 1);
+ snprintf((char*)uri, uri_len + 1, "file://%s%s",
+ hostname ? (const char*)hostname : "",
+ evil ? "/" : "");
+ }
+
+ SerdChunk chunk = { uri, uri_len };
+ for (size_t i = 0; i < path_len; ++i) {
+ if (evil && path[i] == '\\') {
+ serd_chunk_sink("/", 1, &chunk);
+ } else if (path[i] == '%') {
+ serd_chunk_sink("%%", 2, &chunk);
+ } else if (is_uri_path_char(path[i])) {
+ serd_chunk_sink(path + i, 1, &chunk);
+ } else {
+ char escape[4] = { '%', 0, 0, 0 };
+ snprintf(escape + 1, sizeof(escape) - 1, "%X", path[i]);
+ serd_chunk_sink(escape, 3, &chunk);
+ }
+ }
+ serd_chunk_sink_finish(&chunk);
+
+ if (out) {
+ serd_uri_parse(chunk.buf, out);
+ }
+
+ return serd_node_from_string(SERD_URI, chunk.buf);
+}
+
SERD_API
SerdNode
serd_node_new_uri(const SerdURI* uri, const SerdURI* base, SerdURI* out)
diff --git a/src/serd_internal.h b/src/serd_internal.h
index 6e535402..f0137f28 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -233,4 +233,11 @@ is_base64(const uint8_t c)
return is_alpha(c) || is_digit(c) || c == '+' || c == '/' || c == '=';
}
+static inline bool
+is_windows_path(const uint8_t* path)
+{
+ return is_alpha(path[0]) && (path[1] == ':' || path[1] == '|')
+ && (path[2] == '/' || path[2] == '\\');
+}
+
#endif // SERD_INTERNAL_H
diff --git a/src/serdi.c b/src/serdi.c
index 661b60e0..ff1f8d51 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -178,7 +178,8 @@ main(int argc, char** argv)
}
}
- if (input_syntax != SERD_NTRIPLES) { // Base URI may change (@base)
+ if (input_syntax != SERD_NTRIPLES // Base URI may change (@base)
+ || (output_syntax == SERD_TURTLE)) {
output_style |= SERD_STYLE_RESOLVED;
}
diff --git a/src/uri.c b/src/uri.c
index df36564f..b67116f9 100644
--- a/src/uri.c
+++ b/src/uri.c
@@ -21,21 +21,12 @@
// #define URI_DEBUG 1
-static inline bool
-is_windows_path(const uint8_t* path)
-{
- return is_alpha(path[0]) && (path[1] == ':' || path[1] == '|')
- && (path[2] == '/' || path[2] == '\\');
-}
-
SERD_API
const uint8_t*
serd_uri_to_path(const uint8_t* uri)
{
const uint8_t* path = uri;
- if (uri[0] == '/' || is_windows_path(uri)) {
- return uri;
- } else if (serd_uri_string_has_scheme(uri)) {
+ if (!is_windows_path(uri) && serd_uri_string_has_scheme(uri)) {
if (strncmp((const char*)uri, "file:", 5)) {
fprintf(stderr, "Non-file URI `%s'\n", uri);
return NULL;
@@ -55,6 +46,56 @@ serd_uri_to_path(const uint8_t* uri)
}
SERD_API
+uint8_t*
+serd_file_uri_parse(const uint8_t* uri, uint8_t** hostname)
+{
+ const uint8_t* path = uri;
+ if (hostname) {
+ *hostname = NULL;
+ }
+ if (!strncmp((const char*)uri, "file://", 7)) {
+ const uint8_t* auth = uri + 7;
+ if (*auth == '/') { // No hostname
+ path = auth;
+ } else { // Has hostname
+ if (!(path = (const uint8_t*)strchr((const char*)auth, '/'))) {
+ return NULL;
+ }
+ if (hostname) {
+ *hostname = (uint8_t*)calloc(1, path - auth + 1);
+ memcpy(*hostname, auth, path - auth);
+ }
+ }
+ }
+
+ if (is_windows_path(path + 1)) {
+ ++path;
+ }
+
+ SerdChunk chunk = { NULL, 0 };
+ for (const uint8_t* s = path; *s; ++s) {
+ if (*s == '%') {
+ if (*(s + 1) == '%') {
+ serd_chunk_sink("%", 1, &chunk);
+ ++s;
+ } else if (is_digit(*(s + 1)) && is_digit(*(s + 2))) {
+ const uint8_t code[3] = { *(s + 1), *(s + 2), 0 };
+ uint32_t num;
+ sscanf((const char*)code, "%X", &num);
+ const uint8_t c = num;
+ serd_chunk_sink(&c, 1, &chunk);
+ s += 2;
+ } else {
+ s += 2; // Junk escape, ignore
+ }
+ } else {
+ serd_chunk_sink(s, 1, &chunk);
+ }
+ }
+ return serd_chunk_sink_finish(&chunk);
+}
+
+SERD_API
bool
serd_uri_string_has_scheme(const uint8_t* utf8)
{
@@ -276,12 +317,11 @@ remove_dot_segments(const uint8_t* path, size_t len, size_t* up)
return begin;
}
+/// See http://tools.ietf.org/html/rfc3986#section-5.2.2
SERD_API
void
serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
{
- // See http://tools.ietf.org/html/rfc3986#section-5.2.2
-
t->path_base.buf = NULL;
t->path_base.len = 0;
if (r->scheme.len) {
@@ -323,66 +363,147 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
#endif
}
-SERD_API
-size_t
-serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
+/** Write a relative path relative to a base path. */
+static size_t
+write_rel_path(SerdSink sink,
+ void* stream,
+ const SerdChunk* base,
+ const SerdChunk* path)
{
- // See http://tools.ietf.org/html/rfc3986#section-5.3
-
- size_t write_size = 0;
-#define WRITE(buf, len) \
- write_size += len; \
- sink((const uint8_t*)buf, len, stream);
+ size_t up;
+ size_t len = 0;
+ const uint8_t* begin = remove_dot_segments(path->buf, path->len, &up);
+ const uint8_t* end = path->buf + path->len;
+
+ if (base && base->buf) {
+ // Find the up'th last slash
+ const uint8_t* base_last = (base->buf + base->len - 1);
+ ++up;
+ do {
+ if (*base_last == '/') {
+ --up;
+ }
+ } while (up > 0 && (--base_last > base->buf));
- if (uri->scheme.buf) {
- WRITE(uri->scheme.buf, uri->scheme.len);
- WRITE(":", 1);
+ // Write base URI prefix
+ if (*base_last == '/') {
+ const size_t base_len = base_last - base->buf + 1;
+ len += sink(base->buf, base_len, stream);
+ }
}
- if (uri->authority.buf) {
- WRITE("//", 2);
- WRITE(uri->authority.buf, uri->authority.len);
+
+ // Write URI suffix
+ len += sink(begin, end - begin, stream);
+
+ return len;
+}
+
+/** Write an absolute path relative to a base path. */
+static size_t
+write_abs_path(SerdSink sink,
+ void* stream,
+ const SerdChunk* base,
+ const SerdChunk* path)
+{
+ size_t len = 0;
+ const size_t min_len = (path->len < base->len) ? path->len : base->len;
+
+ // Find the last separator common to both paths
+ size_t last_shared_sep = 0;
+ size_t i = 0;
+ for (; i < min_len && path->buf[i] == base->buf[i]; ++i) {
+ if (path->buf[i] == '/') {
+ last_shared_sep = i;
+ }
}
- if (!uri->path.buf) {
- WRITE(uri->path_base.buf, uri->path_base.len);
- } else {
- const uint8_t* begin = uri->path.buf;
- const uint8_t* const end = uri->path.buf + uri->path.len;
- size_t up;
- begin = remove_dot_segments(uri->path.buf, uri->path.len, &up);
+ if (i == path->len && i == base->len) { // Paths are identical
+ return 0;
+ } else if (last_shared_sep == 0) { // No common components
+ return sink(path->buf, path->len, stream);
+ }
- if (uri->path_base.buf) {
- // Find the up'th last slash
- const uint8_t* base_last = (uri->path_base.buf
- + uri->path_base.len - 1);
+ // Find the number of up references ("..") required
+ size_t up = 0;
+ for (size_t i = last_shared_sep + 1; i < base->len; ++i) {
+ if (base->buf[i] == '/') {
++up;
- do {
- if (*base_last == '/') {
- --up;
- }
- } while (up > 0 && (--base_last > uri->path_base.buf));
+ }
+ }
- // Write base URI prefix
- if (*base_last == '/') {
- const size_t base_len = base_last - uri->path_base.buf + 1;
- WRITE(uri->path_base.buf, base_len);
- }
+ // Write up references
+ for (size_t i = 0; i < up; ++i) {
+ len += sink("../", 3, stream);
+ }
+
+ // Write suffix
+ const size_t suffix_len = path->len - last_shared_sep - 1;
+ len += sink(path->buf + last_shared_sep + 1, suffix_len, stream);
+
+ return len;
+}
+
+static inline bool
+chunk_equals(const SerdChunk* a, const SerdChunk* b)
+{
+ return a->len == b->len
+ && !strncmp((const char*)a->buf, (const char*)b->buf, a->len);
+}
+/** Return true iff both are absolute URIs on the same host. */
+static inline bool
+same_host(const SerdURI* base, const SerdURI* uri)
+{
+ return base && uri && base->scheme.len
+ && chunk_equals(&base->scheme, &uri->scheme)
+ && chunk_equals(&base->authority, &uri->authority);
+}
+
+/// See http://tools.ietf.org/html/rfc3986#section-5.3
+SERD_API
+size_t
+serd_uri_serialise_relative(const SerdURI* uri,
+ const SerdURI* base,
+ SerdSink sink,
+ void* stream)
+{
+ size_t len = 0;
+ const bool relative = same_host(base, uri);
+ if (relative) {
+ len = write_abs_path(sink, stream, base ? &base->path : 0, &uri->path);
+ }
+ if (!relative || (!len && base->query.buf)) {
+ if (uri->scheme.buf) {
+ len += sink(uri->scheme.buf, uri->scheme.len, stream);
+ len += sink(":", 1, stream);
+ }
+ if (uri->authority.buf) {
+ len += sink("//", 2, stream);
+ len += sink(uri->authority.buf, uri->authority.len, stream);
+ }
+ if (uri->path.buf && uri->path_base.buf) {
+ len += write_rel_path(sink, stream, &uri->path_base, &uri->path);
+ } else if (uri->path.buf) {
+ len += write_rel_path(sink, stream, NULL, &uri->path);
} else {
- // Relative path is just query or fragment, append to base URI
- WRITE(uri->path_base.buf, uri->path_base.len);
+ len += sink(uri->path_base.buf, uri->path_base.len, stream);
}
-
- // Write URI suffix
- WRITE(begin, end - begin);
}
if (uri->query.buf) {
- WRITE("?", 1);
- WRITE(uri->query.buf, uri->query.len);
+ len += sink("?", 1, stream);
+ len += sink(uri->query.buf, uri->query.len, stream);
}
if (uri->fragment.buf) {
// Note uri->fragment.buf includes the leading `#'
- WRITE(uri->fragment.buf, uri->fragment.len);
+ len += sink(uri->fragment.buf, uri->fragment.len, stream);
}
- return write_size;
+ return len;
+}
+
+/// See http://tools.ietf.org/html/rfc3986#section-5.3
+SERD_API
+size_t
+serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
+{
+ return serd_uri_serialise_relative(uri, NULL, sink, stream);
}
diff --git a/src/writer.c b/src/writer.c
index eb9c5b2d..849d4e7c 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -131,23 +131,24 @@ sink(const void* buf, size_t len, SerdWriter* writer)
}
}
-static bool
+static size_t
write_text(SerdWriter* writer, TextContext ctx,
- const uint8_t* utf8, size_t n_bytes, uint8_t terminator)
+ const uint8_t* utf8, size_t n_bytes)
{
- char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ size_t len = 0;
+ char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
for (size_t i = 0; i < n_bytes;) {
// Fast bulk write for long strings of printable ASCII
size_t j = i;
for (; j < n_bytes; ++j) {
- if (utf8[j] == terminator || utf8[j] == '\\' || utf8[j] == '"'
+ if (utf8[j] == '>' || utf8[j] == '\\' || utf8[j] == '"'
|| (!in_range(utf8[j], 0x20, 0x7E))) {
break;
}
}
if (j > i) {
- sink(&utf8[i], j - i, writer);
+ len += sink(&utf8[i], j - i, writer);
i = j;
continue;
}
@@ -155,27 +156,29 @@ write_text(SerdWriter* writer, TextContext ctx,
uint8_t in = utf8[i++];
if (ctx == WRITE_LONG_STRING) {
if (in == '\\') {
- sink("\\\\", 2, writer); continue;
+ len += sink("\\\\", 2, writer); continue;
} else if (in == '\"' && i == n_bytes) {
- sink("\\\"", 2, writer); continue; // '"' at end of string
+ len += sink("\\\"", 2, writer); continue; // '"' at string end
}
} else {
switch (in) {
- case '\\': sink("\\\\", 2, writer); continue;
- case '\n': sink("\\n", 2, writer); continue;
- case '\r': sink("\\r", 2, writer); continue;
- case '\t': sink("\\t", 2, writer); continue;
+ case '\\': len += sink("\\\\", 2, writer); continue;
+ case '\n': len += sink("\\n", 2, writer); continue;
+ case '\r': len += sink("\\r", 2, writer); continue;
+ case '\t': len += sink("\\t", 2, writer); continue;
case '"':
- if (terminator == '"') {
- sink("\\\"", 2, writer);
+ if (ctx == WRITE_STRING) {
+ len += sink("\\\"", 2, writer);
continue;
} // else fall-through
default: break;
}
- if (in == terminator) {
- snprintf(escape, sizeof(escape), "\\u%04X", terminator);
- sink(escape, 6, writer);
+ if ((ctx == WRITE_STRING && in == '"') ||
+ (ctx == WRITE_URI && in == '>')) {
+ snprintf(escape, sizeof(escape), "\\u%04X",
+ ctx == WRITE_STRING ? '"' : '>');
+ len += sink(escape, 6, writer);
continue;
}
}
@@ -186,10 +189,10 @@ write_text(SerdWriter* writer, TextContext ctx,
c = in & 0x7F;
if (in_range(c, 0x20, 0x7E)
|| (is_space(c) && ctx == WRITE_LONG_STRING)) {
- sink(&in, 1, writer); // Print ASCII character
+ len += sink(&in, 1, writer); // Print ASCII character
} else {
snprintf(escape, sizeof(escape), "\\u%04X", c);
- sink(escape, 6, writer); // Escape ASCII control character
+ len += sink(escape, 6, writer); // ASCII control character
}
continue;
} else if ((in & 0xE0) == 0xC0) { // Starts with `110'
@@ -204,14 +207,14 @@ write_text(SerdWriter* writer, TextContext ctx,
} else {
fprintf(stderr, "Invalid UTF-8: %X\n", in);
const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD };
- sink(replacement_char, sizeof(replacement_char), writer);
- return false;
+ len += sink(replacement_char, sizeof(replacement_char), writer);
+ return 0;
}
if (ctx != WRITE_URI && !(writer->style & SERD_STYLE_ASCII)) {
// Write UTF-8 character directly to UTF-8 output
// TODO: Always parse and validate character?
- sink(utf8 + i - 1, size, writer);
+ len += sink(utf8 + i - 1, size, writer);
i += size - 1;
continue;
}
@@ -228,13 +231,19 @@ write_text(SerdWriter* writer, TextContext ctx,
if (c < 0xFFFF) {
snprintf(escape, sizeof(escape), "\\u%04X", c);
- sink(escape, 6, writer);
+ len += sink(escape, 6, writer);
} else {
snprintf(escape, sizeof(escape), "\\U%08X", c);
- sink(escape, 10, writer);
+ len += sink(escape, 10, writer);
}
}
- return true;
+ return len;
+}
+
+static size_t
+uri_sink(const void* buf, size_t len, void* stream)
+{
+ return write_text((SerdWriter*)stream, WRITE_URI, buf, len);
}
static void
@@ -299,6 +308,7 @@ write_node(SerdWriter* writer,
{
SerdChunk uri_prefix;
SerdChunk uri_suffix;
+ bool has_scheme;
switch (node->type) {
case SERD_BLANK:
if (writer->syntax != SERD_NTRIPLES
@@ -343,8 +353,8 @@ write_node(SerdWriter* writer,
return false;
}
sink("<", 1, writer);
- write_text(writer, WRITE_URI, uri_prefix.buf, uri_prefix.len, '>');
- write_text(writer, WRITE_URI, uri_suffix.buf, uri_suffix.len, '>');
+ write_text(writer, WRITE_URI, uri_prefix.buf, uri_prefix.len);
+ write_text(writer, WRITE_URI, uri_suffix.buf, uri_suffix.len);
sink(">", 1, writer);
break;
case SERD_TURTLE:
@@ -365,12 +375,11 @@ write_node(SerdWriter* writer,
if (writer->syntax != SERD_NTRIPLES
&& (node->flags & (SERD_HAS_NEWLINE|SERD_HAS_QUOTE))) {
sink("\"\"\"", 3, writer);
- write_text(writer, WRITE_LONG_STRING,
- node->buf, node->n_bytes, '\0');
+ write_text(writer, WRITE_LONG_STRING, node->buf, node->n_bytes);
sink("\"\"\"", 3, writer);
} else {
sink("\"", 1, writer);
- write_text(writer, WRITE_STRING, node->buf, node->n_bytes, '"');
+ write_text(writer, WRITE_STRING, node->buf, node->n_bytes);
sink("\"", 1, writer);
}
if (lang && lang->buf) {
@@ -382,6 +391,7 @@ write_node(SerdWriter* writer,
}
break;
case SERD_URI:
+ has_scheme = serd_uri_string_has_scheme(node->buf);
if ((writer->syntax == SERD_TURTLE)
&& !strcmp((const char*)node->buf, NS_RDF "type")) {
sink("a", 1, writer);
@@ -390,29 +400,38 @@ write_node(SerdWriter* writer,
&& !strcmp((const char*)node->buf, NS_RDF "nil")) {
sink("()", 2, writer);
break;
- } else if ((writer->style & SERD_STYLE_CURIED)
- && serd_uri_string_has_scheme(node->buf)) {
+ } else if (has_scheme && (writer->style & SERD_STYLE_CURIED)) {
SerdNode prefix;
SerdChunk suffix;
if (serd_env_qualify(writer->env, node, &prefix, &suffix)) {
- write_text(writer, WRITE_URI, prefix.buf, prefix.n_bytes, '>');
+ write_text(writer, WRITE_URI, prefix.buf, prefix.n_bytes);
sink(":", 1, writer);
- write_text(writer, WRITE_URI, suffix.buf, suffix.len, '>');
+ write_text(writer, WRITE_URI, suffix.buf, suffix.len);
break;
}
- } else if ((writer->style & SERD_STYLE_RESOLVED)
- && !serd_uri_string_has_scheme(node->buf)) {
+ }
+ if (!has_scheme && (writer->style & SERD_STYLE_RESOLVED)) {
SerdURI uri;
serd_uri_parse(node->buf, &uri);
SerdURI abs_uri;
serd_uri_resolve(&uri, &writer->base_uri, &abs_uri);
sink("<", 1, writer);
- serd_uri_serialise(&abs_uri, (SerdSink)sink, writer);
+ serd_uri_serialise(&abs_uri, uri_sink, writer);
+ sink(">", 1, writer);
+ break;
+ } else if (has_scheme && (writer->syntax == SERD_TURTLE)
+ && (writer->style & SERD_STYLE_RESOLVED)) {
+ SerdURI uri;
+ serd_uri_parse(node->buf, &uri);
+ sink("<", 1, writer);
+ serd_uri_serialise_relative(
+ &uri, &writer->base_uri, uri_sink, writer);
sink(">", 1, writer);
break;
}
+
sink("<", 1, writer);
- write_text(writer, WRITE_URI, node->buf, node->n_bytes, '>');
+ write_text(writer, WRITE_URI, node->buf, node->n_bytes);
sink(">", 1, writer);
default:
break;
@@ -687,7 +706,7 @@ serd_writer_set_prefix(SerdWriter* writer,
sink("@prefix ", 8, writer);
sink(name->buf, name->n_bytes, writer);
sink(": <", 3, writer);
- write_text(writer, WRITE_URI, uri->buf, uri->n_bytes, '>');
+ write_text(writer, WRITE_URI, uri->buf, uri->n_bytes);
sink("> .\n", 4, writer);
}
return reset_context(writer, false);
@@ -709,6 +728,13 @@ serd_writer_free(SerdWriter* writer)
}
SERD_API
+SerdEnv*
+serd_writer_get_env(SerdWriter* writer)
+{
+ return writer->env;
+}
+
+SERD_API
size_t
serd_file_sink(const void* buf, size_t len, void* stream)
{