1 files changed, 512 insertions, 0 deletions
diff --git a/src/uri.c b/src/uri.c
new file mode 100644
index 00000000..6ec6f1c7
--- /dev/null
+++ b/src/uri.c
@@ -0,0 +1,512 @@
+/*
+  Copyright 2011-2014 David Robillard <http://drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "serd_internal.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+// #define URI_DEBUG 1
+
+const uint8_t*
+serd_uri_to_path(const uint8_t* uri)
+{
+	const uint8_t* path = uri;
+	if (!is_windows_path(uri) && serd_uri_string_has_scheme(uri)) {
+		if (strncmp((const char*)uri, "file:", 5)) {
+			fprintf(stderr, "Non-file URI `%s'\n", uri);
+			return NULL;
+		} else if (!strncmp((const char*)uri, "file://localhost/", 17)) {
+			path = uri + 16;
+		} else if (!strncmp((const char*)uri, "file://", 7)) {
+			path = uri + 7;
+		} else {
+			fprintf(stderr, "Invalid file URI `%s'\n", uri);
+			return NULL;
+		}
+		if (is_windows_path(path + 1)) {
+			++path;  // Special case for terrible Windows file URIs
+		}
+	}
+	return path;
+}
+
+uint8_t*
+serd_file_uri_parse(const uint8_t* uri, uint8_t** hostname)
+{
+	const uint8_t* path = uri;
+	if (hostname) {
+		*hostname = NULL;
+	}
+	if (!strncmp((const char*)uri, "file://", 7)) {
+		const uint8_t* auth = uri + 7;
+		if (*auth == '/') {  // No hostname
+			path = auth;
+		} else {  // Has hostname
+			if (!(path = (const uint8_t*)strchr((const char*)auth, '/'))) {
+				return NULL;
+			}
+			if (hostname) {
+				*hostname = (uint8_t*)calloc(1, path - auth + 1);
+				memcpy(*hostname, auth, path - auth);
+			}
+		}
+	}
+
+	if (is_windows_path(path + 1)) {
+		++path;
+	}
+
+	SerdChunk chunk = { NULL, 0 };
+	for (const uint8_t* s = path; *s; ++s) {
+		if (*s == '%') {
+			if (*(s + 1) == '%') {
+				serd_chunk_sink("%", 1, &chunk);
+				++s;
+			} else if (is_hexdig(*(s + 1)) && is_hexdig(*(s + 2))) {
+				const uint8_t code[3] = { *(s + 1), *(s + 2), 0 };
+				uint32_t num;
+				sscanf((const char*)code, "%X", &num);
+				const uint8_t c = num;
+				serd_chunk_sink(&c, 1, &chunk);
+				s += 2;
+			} else {
+				s += 2;  // Junk escape, ignore
+			}
+		} else {
+			serd_chunk_sink(s, 1, &chunk);
+		}
+	}
+	return serd_chunk_sink_finish(&chunk);
+}
+
+bool
+serd_uri_string_has_scheme(const uint8_t* utf8)
+{
+	// RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+	if (!utf8 || !is_alpha(utf8[0])) {
+		return false;  // Invalid scheme initial character, URI is relative
+	}
+
+	for (uint8_t c; (c = *++utf8) != '\0';) {
+		if (!is_uri_scheme_char(c)) {
+			return false;
+		} else if (c == ':') {
+			return true;  // End of scheme
+		}
+	}
+
+	return false;
+}
+
+#ifdef URI_DEBUG
+static void
+serd_uri_dump(const SerdURI* uri, FILE* file)
+{
+#define PRINT_PART(range, name) \
+	if (range.buf) { \
+		fprintf(stderr, "  " name " = "); \
+		fwrite((range).buf, 1, (range).len, stderr); \
+		fprintf(stderr, "\n"); \
+	}
+
+	PRINT_PART(uri->scheme,    "scheme   ");
+	PRINT_PART(uri->authority, "authority");
+	PRINT_PART(uri->path_base, "path_base");
+	PRINT_PART(uri->path,      "path     ");
+	PRINT_PART(uri->query,     "query    ");
+	PRINT_PART(uri->fragment,  "fragment ");
+}
+#endif
+
+SerdStatus
+serd_uri_parse(const uint8_t* utf8, SerdURI* out)
+{
+	*out = SERD_URI_NULL;
+
+	const uint8_t* ptr = utf8;
+
+	/* See http://tools.ietf.org/html/rfc3986#section-3
+	   URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+	*/
+
+	/* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
+	if (is_alpha(*ptr)) {
+		for (uint8_t c = *++ptr; true; c = *++ptr) {
+			switch (c) {
+			case '\0': case '/': case '?': case '#':
+				ptr = utf8;
+				goto path;  // Relative URI (starts with path by definition)
+			case ':':
+				out->scheme.buf = utf8;
+				out->scheme.len = (ptr++) - utf8;
+				goto maybe_authority;  // URI with scheme
+			case '+': case '-': case '.':
+				continue;
+			default:
+				if (is_alpha(c) || is_digit(c)) {
+					continue;
+				}
+			}
+		}
+	}
+
+	/* S3.2: The authority component is preceded by a double slash ("//")
+	   and is terminated by the next slash ("/"), question mark ("?"),
+	   or number sign ("#") character, or by the end of the URI.
+	*/
+maybe_authority:
+	if (*ptr == '/' && *(ptr + 1) == '/') {
+		ptr += 2;
+		out->authority.buf = ptr;
+		for (uint8_t c; (c = *ptr) != '\0'; ++ptr) {
+			switch (c) {
+			case '/': goto path;
+			case '?': goto query;
+			case '#': goto fragment;
+			default:
+				++out->authority.len;
+			}
+		}
+	}
+
+	/* RFC3986 S3.3: The path is terminated by the first question mark ("?")
+	   or number sign ("#") character, or by the end of the URI.
+	*/
+path:
+	switch (*ptr) {
+	case '?':  goto query;
+	case '#':  goto fragment;
+	case '\0': goto end;
+	default:  break;
+	}
+	out->path.buf = ptr;
+	out->path.len = 0;
+	for (uint8_t c; (c = *ptr) != '\0'; ++ptr) {
+		switch (c) {
+		case '?': goto query;
+		case '#': goto fragment;
+		default:
+			++out->path.len;
+		}
+	}
+
+	/* RFC3986 S3.4: The query component is indicated by the first question
+	   mark ("?") character and terminated by a number sign ("#") character
+	   or by the end of the URI.
+	*/
+query:
+	if (*ptr == '?') {
+		out->query.buf = ++ptr;
+		for (uint8_t c; (c = *ptr) != '\0'; ++ptr) {
+			switch (c) {
+			case '#':
+				goto fragment;
+			default:
+				++out->query.len;
+			}
+		}
+	}
+
+	/* RFC3986 S3.5: A fragment identifier component is indicated by the
+	   presence of a number sign ("#") character and terminated by the end
+	   of the URI.
+	*/
+fragment:
+	if (*ptr == '#') {
+		out->fragment.buf = ptr;
+		while (*ptr++ != '\0') {
+			++out->fragment.len;
+		}
+	}
+
+end:
+	#ifdef URI_DEBUG
+	fprintf(stderr, "PARSE URI <%s>\n", utf8);
+	serd_uri_dump(out, stderr);
+	fprintf(stderr, "\n");
+	#endif
+
+	return SERD_SUCCESS;
+}
+
+/**
+   Remove leading dot components from `path`.
+   See http://tools.ietf.org/html/rfc3986#section-5.2.3
+   @param up Set to the number of up-references (e.g. "../") trimmed
+   @return A pointer to the new start of `path`
+*/
+static const uint8_t*
+remove_dot_segments(const uint8_t* path, size_t len, size_t* up)
+{
+	const uint8_t*       begin = path;
+	const uint8_t* const end   = path + len;
+
+	*up = 0;
+	while (begin < end) {
+		switch (begin[0]) {
+		case '.':
+			switch (begin[1]) {
+			case '/':
+				begin += 2;  // Chop leading "./"
+				break;
+			case '.':
+				switch (begin[2]) {
+				case '\0':
+					++*up;
+					begin += 2;  // Chop input ".."
+					break;
+				case '/':
+					++*up;
+					begin += 3;  // Chop leading "../"
+					break;
+				default:
+					return begin;
+				}
+				break;
+			case '\0':
+				++begin;  // Chop input "."
+                // fallthru
+			default:
+				return begin;
+			}
+			break;
+		case '/':
+			switch (begin[1]) {
+			case '.':
+				switch (begin[2]) {
+				case '/':
+					begin += 2;  // Leading "/./" => "/"
+					break;
+				case '.':
+					switch (begin[3]) {
+					case '/':
+						++*up;
+						begin += 3;  // Leading "/../" => "/"
+					}
+					break;
+				default:
+					return begin;
+				}
+			}  // else fall through
+		default:
+			return begin;  // Finished chopping dot components
+		}
+	}
+
+	return begin;
+}
+
+/// Merge `base` and `path` in-place
+static void
+merge(SerdChunk* base, SerdChunk* path)
+{
+	size_t         up;
+	const uint8_t* begin = remove_dot_segments(path->buf, path->len, &up);
+	const uint8_t* end   = path->buf + path->len;
+
+	if (base->len) {
+		// Find the up'th last slash
+		const uint8_t* base_last = (base->buf + base->len - 1);
+		++up;
+		do {
+			if (*base_last == '/') {
+				--up;
+			}
+		} while (up > 0 && (--base_last > base->buf));
+
+		// Set path prefix
+		base->len = base_last - base->buf + 1;
+	}
+
+	// Set path suffix
+	path->buf = begin;
+	path->len = end - begin;
+}
+
+/// See http://tools.ietf.org/html/rfc3986#section-5.2.2
+void
+serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
+{
+	if (!base->scheme.len) {
+		*t = *r;  // Don't resolve against non-absolute URIs
+		return;
+	}
+
+	t->path_base.buf = NULL;
+	t->path_base.len = 0;
+	if (r->scheme.len) {
+		*t = *r;
+	} else {
+		if (r->authority.len) {
+			t->authority = r->authority;
+			t->path      = r->path;
+			t->query     = r->query;
+		} else {
+			t->path = r->path;
+			if (!r->path.len) {
+				t->path_base = base->path;
+				if (r->query.len) {
+					t->query = r->query;
+				} else {
+					t->query = base->query;
+				}
+			} else {
+				if (r->path.buf[0] != '/') {
+					t->path_base = base->path;
+				}
+				merge(&t->path_base, &t->path);
+				t->query = r->query;
+			}
+			t->authority = base->authority;
+		}
+		t->scheme   = base->scheme;
+		t->fragment = r->fragment;
+	}
+
+	#ifdef URI_DEBUG
+	fprintf(stderr, "## RESOLVE URI\n# BASE\n");
+	serd_uri_dump(base, stderr);
+	fprintf(stderr, "# URI\n");
+	serd_uri_dump(r, stderr);
+	fprintf(stderr, "# RESULT\n");
+	serd_uri_dump(t, stderr);
+	fprintf(stderr, "\n");
+	#endif
+}
+
+/** Write the path of `uri` starting at index `i` */
+static size_t
+write_path_tail(SerdSink sink, void* stream, const SerdURI* uri, size_t i)
+{
+	size_t len = 0;
+	if (i < uri->path_base.len) {
+		len += sink(uri->path_base.buf + i, uri->path_base.len - i, stream);
+	}
+	if (uri->path.buf) {
+		if (i < uri->path_base.len) {
+			len += sink(uri->path.buf, uri->path.len, stream);
+		} else {
+			const size_t j = (i - uri->path_base.len);
+			len += sink(uri->path.buf + j, uri->path.len - j, stream);
+		}
+	}
+	return len;
+}
+
+/** Write the path of `uri` relative to the path of `base`. */
+static size_t
+write_rel_path(SerdSink       sink,
+               void*          stream,
+               const SerdURI* uri,
+               const SerdURI* base)
+{
+	const size_t path_len = uri_path_len(uri);
+	const size_t base_len = uri_path_len(base);
+	const size_t min_len  = (path_len < base_len) ? path_len : base_len;
+
+	// Find the last separator common to both paths
+	size_t last_shared_sep = 0;
+	size_t i               = 0;
+	for (; i < min_len && uri_path_at(uri, i) == uri_path_at(base, i); ++i) {
+		if (uri_path_at(uri, i) == '/') {
+			last_shared_sep = i;
+		}
+	}
+
+	if (i == path_len && i == base_len) {  // Paths are identical
+		return 0;
+	}
+
+	// Find the number of up references ("..") required
+	size_t up = 0;
+	for (size_t s = last_shared_sep + 1; s < base_len; ++s) {
+		if (uri_path_at(base, s) == '/') {
+			++up;
+		}
+	}
+
+	// Write up references
+	size_t len = 0;
+	for (size_t u = 0; u < up; ++u) {
+		len += sink("../", 3, stream);
+	}
+
+	if (last_shared_sep == 0 && up == 0) {
+		len += sink("/", 1, stream);
+	}
+
+	// Write suffix
+	return len += write_path_tail(sink, stream, uri, last_shared_sep + 1);
+}
+
+static uint8_t
+serd_uri_path_starts_without_slash(const SerdURI* uri)
+{
+	return ((uri->path_base.len || uri->path.len) &&
+	        ((!uri->path_base.len || uri->path_base.buf[0] != '/') &&
+	         (!uri->path.len || uri->path.buf[0] != '/')));
+}
+
+/// See http://tools.ietf.org/html/rfc3986#section-5.3
+size_t
+serd_uri_serialise_relative(const SerdURI* uri,
+                            const SerdURI* base,
+                            const SerdURI* root,
+                            SerdSink       sink,
+                            void*          stream)
+{
+	size_t     len = 0;
+	const bool relative =
+		root ? uri_is_under(uri, root) : uri_is_related(uri, base);
+
+	if (relative) {
+		len = write_rel_path(sink, stream, uri, base);
+	}
+	if (!relative || (!len && base->query.buf)) {
+		if (uri->scheme.buf) {
+			len += sink(uri->scheme.buf, uri->scheme.len, stream);
+			len += sink(":", 1, stream);
+		}
+		if (uri->authority.buf) {
+			len += sink("//", 2, stream);
+			len += sink(uri->authority.buf, uri->authority.len, stream);
+			if (uri->authority.buf[uri->authority.len - 1] != '/' &&
+			    serd_uri_path_starts_without_slash(uri)) {
+				// Special case: ensure path begins with a slash
+				// https://tools.ietf.org/html/rfc3986#section-3.2
+				len += sink("/", 1, stream);
+			}
+		}
+		len += write_path_tail(sink, stream, uri, 0);
+	}
+	if (uri->query.buf) {
+		len += sink("?", 1, stream);
+		len += sink(uri->query.buf, uri->query.len, stream);
+	}
+	if (uri->fragment.buf) {
+		// Note uri->fragment.buf includes the leading `#'
+		len += sink(uri->fragment.buf, uri->fragment.len, stream);
+	}
+	return len;
+}
+
+/// See http://tools.ietf.org/html/rfc3986#section-5.3
+size_t
+serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
+{
+	return serd_uri_serialise_relative(uri, NULL, NULL, sink, stream);
+}