/* Serd, an RDF serialisation library.
 * Copyright 2011 David Robillard <d@drobilla.net>
 *
 * Serd is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Serd is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 * License for details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "serd/serd.h"

//#define URI_DEBUG 1

/** Return true if @a c lies within [min...max] (inclusive) */
static inline bool
in_range(const char c, const char min, const char max)
{
	return (c >= min && c <= max);
}

/** RFC2234: ALPHA := %x41-5A / %x61-7A  ; A-Z / a-z */
static inline bool
is_alpha(const uint8_t c)
{
	return in_range(c, 'A', 'Z') || in_range(c, 'a', 'z');
}

/** RFC2234: DIGIT ::= %x30-39  ; 0-9 */
static inline bool
is_digit(const uint8_t c)
{
	return in_range(c, '0', '9');
}

SERD_API
bool
serd_uri_string_has_scheme(const uint8_t* utf8)
{
	// RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
	if (!is_alpha(utf8[0])) {
		return false;  // Invalid scheme initial character, URI is relative
	}
	for (uint8_t c = *++utf8; (c = *utf8) != '\0'; ++utf8) {
		switch (c) {
		case ':':
			return true;  // End of scheme
		case '+': case '-': case '.':
			break;  // Valid scheme character, continue
		default:
			if (!is_alpha(c) && !is_digit(c)) {
				return false;  // Invalid scheme character
			}
		}
	}

	return false;
}

#ifdef URI_DEBUG
static void
serd_uri_dump(const SerdURI* uri, FILE* file)
{
#define PRINT_PART(range, name) \
	if (range.buf) { \
		fprintf(stderr, "  " name " = "); \
		fwrite((range).buf, 1, (range).len, stderr); \
		fprintf(stderr, "\n"); \
	}

	PRINT_PART(uri->scheme,    "scheme");
	PRINT_PART(uri->authority, "authority");
	PRINT_PART(uri->path_base, "path_base");
	PRINT_PART(uri->path,      "path");
	PRINT_PART(uri->query,     "query");
	PRINT_PART(uri->fragment,  "fragment");
}
#endif

SERD_API
bool
serd_uri_parse(const uint8_t* utf8, SerdURI* uri)
{
	*uri = SERD_URI_NULL;
	assert(uri->path_base.buf == NULL);
	assert(uri->path_base.len == 0);
	assert(uri->authority.len == 0);

	const uint8_t* ptr = utf8;

	/* See http://tools.ietf.org/html/rfc3986#section-3
	   URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
	*/

	/* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
	if (is_alpha(*ptr)) {
		for (uint8_t c = *++ptr; true; c = *++ptr) {
			switch (c) {
			case '\0': case '/': case '?': case '#':
				ptr = utf8;
				goto path;  // Relative URI (starts with path by definition)
			case ':':
				uri->scheme.buf = utf8;
				uri->scheme.len = (ptr++) - utf8;
				goto maybe_authority;  // URI with scheme
			case '+': case '-': case '.':
				continue;
			default:
				if (is_alpha(c) || is_digit(c)) {
					continue;
				}
			}
		}
	}

	/* S3.2: The authority component is preceded by a double slash ("//")
	   and is terminated by the next slash ("/"), question mark ("?"),
	   or number sign ("#") character, or by the end of the URI.
	*/
maybe_authority:
	if (*ptr == '/' && *(ptr + 1) == '/') {
		ptr += 2;
		uri->authority.buf = ptr;
		assert(uri->authority.len == 0);
		for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) {
			switch (c) {
			case '/': goto path;
			case '?': goto query;
			case '#': goto fragment;
			default:
				++uri->authority.len;
			}
		}
	}

	/* RFC3986 S3.3: The path is terminated by the first question mark ("?")
	   or number sign ("#") character, or by the end of the URI.
	*/
path:
	switch (*ptr) {
	case '?':  goto query;
	case '#':  goto fragment;
	case '\0': goto end;
	default:  break;
	}
	uri->path.buf = ptr;
	uri->path.len = 0;
	for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) {
		switch (c) {
		case '?': goto query;
		case '#': goto fragment;
		default:
			++uri->path.len;
		}
	}

	/* RFC3986 S3.4: The query component is indicated by the first question
	   mark ("?") character and terminated by a number sign ("#") character
	   or by the end of the URI.
	*/
query:
	if (*ptr == '?') {
		uri->query.buf = ++ptr;
		for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) {
			switch (c) {
			case '#':
				goto fragment;
			default:
				++uri->query.len;
			}
		}
	}

	/* RFC3986 S3.5: A fragment identifier component is indicated by the
	   presence of a number sign ("#") character and terminated by the end
	   of the URI.
	*/
fragment:
	if (*ptr == '#') {
		uri->fragment.buf = ptr;
		while (*ptr++ != '\0') {
			++uri->fragment.len;
		}
	}

end:
	#ifdef URI_DEBUG
	fprintf(stderr, "PARSE URI <%s>\n", utf8);
	serd_uri_dump(uri, stderr);
	fprintf(stderr, "\n");
	#endif

	return true;
}

SERD_API
bool
serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
{
	assert(!r->scheme.len);  // r is relative

	// See http://tools.ietf.org/html/rfc3986#section-5.2.2

	t->path_base.buf = NULL;
	t->path_base.len = 0;
	if (r->scheme.len) {
		t->scheme    = r->scheme;
		t->authority = r->authority;
		t->path      = r->path;
		t->query     = r->query;
	} else {
		if (r->authority.len) {
			t->authority = r->authority;
			t->path      = r->path;
			t->query     = r->query;
		} else {
			t->path = r->path;
			if (!r->path.len) {
				t->path_base = base->path;
				if (r->query.len) {
					t->query = r->query;
				} else {
					t->query = base->query;
				}
			} else {
				if (r->path.buf[0] != '/') {
					t->path_base = base->path;
				}
				t->query = r->query;
			}
			t->authority = base->authority;
		}
		t->scheme = base->scheme;
	}
	t->fragment = r->fragment;

	#ifdef URI_DEBUG
	fprintf(stderr, "RESOLVE URI\nBASE:\n");
	serd_uri_dump(base, stderr);
	fprintf(stderr, "URI:\n");
	serd_uri_dump(r, stderr);
	fprintf(stderr, "RESULT:\n");
	serd_uri_dump(t, stderr);
	fprintf(stderr, "\n");
	#endif
	return true;
}

SERD_API
size_t
serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
{
	// See http://tools.ietf.org/html/rfc3986#section-5.3

	size_t write_size = 0;
#define WRITE(buf, len) \
	write_size += len; \
	if (len) { \
		sink((const uint8_t*)buf, len, stream); \
	}
#define WRITE_CHAR(c) WRITE(&(c), 1)
#define WRITE_COMPONENT(prefix, field, suffix) \
	if ((field).len) { \
		for (const uint8_t* c = (const uint8_t*)prefix; *c != '\0'; ++c) { \
			WRITE(c, 1); \
		} \
		WRITE((field).buf, (field).len); \
		for (const uint8_t* c = (const uint8_t*)suffix; *c != '\0'; ++c) { \
			WRITE(c, 1); \
		} \
	}

	WRITE_COMPONENT("",   uri->scheme,    ":");
	WRITE_COMPONENT("//", uri->authority, "");
	if (uri->path_base.len) {
		if (!uri->path.buf && (uri->fragment.buf || uri->query.buf)) {
			WRITE_COMPONENT("", uri->path_base, "");
		} else {
			/* Merge paths, removing dot components.
			   See http://tools.ietf.org/html/rfc3986#section-5.2.3
			*/
			const uint8_t* uri_first = uri->path.buf;
			const uint8_t* uri_end   = uri_first;
			size_t         up        = 1;
			if (uri_first) {
				// Count and skip leading dot components
				uri_end = uri->path.buf + uri->path.len;
				while (uri_first < uri_end) {
					if (!memcmp((const char*)uri_first, "./", 2)) {
						uri_first += 2;
					} else if (!memcmp((const char*)uri_first, "../", 3)) {
						++up;
						uri_first += 3;
					} else if (!memcmp((const char*)uri_first, "..", 2)) {
						++up;
						uri_first += 2;
					} else if (!memcmp((const char*)uri_first, ".", 1)) {
						++uri_first;
					} else if (!memcmp((const char*)uri_first, "//", 1)) {
						++uri_first;
					} else {
						break;
					}
				}

				if (uri->path.buf && uri->path_base.buf) {
					// Find the up'th last slash
					const uint8_t* base_last = uri->path_base.buf + uri->path_base.len - 1;
					do {
						if (*base_last == '/') {
							--up;
						}
					} while (up > 0 && (--base_last > uri->path_base.buf));

					// Write base URI prefix
					const size_t base_len = base_last - uri->path_base.buf + 1;
					WRITE(uri->path_base.buf, base_len);

				} else {
					// Relative path is just query or fragment, append it to full base URI
					WRITE_COMPONENT("", uri->path_base, "");
				}

				// Write URI suffix
				WRITE(uri_first, uri_end - uri_first);
			}
		}
	} else {
		WRITE_COMPONENT("", uri->path, "");
	}
	WRITE_COMPONENT("?", uri->query, "");
	if (uri->fragment.buf) {
		// Note uri->fragment.buf includes the leading `#'
		WRITE_COMPONENT("", uri->fragment, "");
	}
	return write_size;
}


static size_t
serd_uri_string_length(const SerdURI* uri)
{
	size_t len = uri->path_base.len;

#define ADD_LEN(field, n_delims) \
	if ((field).len) { len += (field).len + (n_delims); }

	ADD_LEN(uri->path,      1);  // + possible leading `/'
	ADD_LEN(uri->scheme,    1);  // + trailing `:'
	ADD_LEN(uri->authority, 2);  // + leading `//'
	ADD_LEN(uri->query,     1);  // + leading `?'
	ADD_LEN(uri->fragment,  1);  // + leading `#'

	return len;
}

static size_t
string_sink(const void* buf, size_t len, void* stream)
{
	uint8_t** ptr = (uint8_t**)stream;
	memcpy(*ptr, buf, len);
	*ptr += len;
	return len;
}

SERD_API
SerdString*
serd_string_new_from_uri(const SerdURI* uri, SerdURI* out)
{
	const size_t len = serd_uri_string_length(uri);
	SerdString*  str = malloc(sizeof(SerdString) + len + 1);
	str->n_bytes = len + 1;
	str->n_chars = len;  // FIXME: UTF-8

	uint8_t* ptr = str->buf;
	const size_t actual_len = serd_uri_serialise(uri, string_sink, &ptr);

	str->buf[actual_len] = '\0';
	str->n_bytes = actual_len + 1;
	str->n_chars = str->n_bytes - 1;  // FIXME: UTF-8

	#ifdef URI_DEBUG
	fwrite("URI: `'", 1, 6, stderr);
	fwrite(str->buf, 1, str->n_bytes - 1, stderr);
	fwrite("'\n", 1, 2, stderr);
	#endif

	return str;
}