From 0a62fc5f6aafd3e3f67d861634014d7e894c7bfd Mon Sep 17 00:00:00 2001
From: David Robillard <d@drobilla.net>
Date: Thu, 20 Jan 2011 07:31:58 +0000
Subject: Rework character reading functions to support reading multi-byte
 characters (take a string dest parameter instead of returning uchar). Escape
 ntriples output. Pass all good read tests with output verification.

git-svn-id: http://svn.drobilla.net/serd/trunk@8 490d8e77-9747-427b-9fa3-0b8f29cee8a0
---
 doc/reference.doxygen.in |   8 +-
 serd/serd.h              | 105 +++++++++--------
 src/namespaces.c         |  46 +-------
 src/reader.c             | 287 +++++++++++++++++++++++------------------------
 src/serdi.c              |  84 ++------------
 src/string.c             |  65 +++++++++++
 src/uri.c                |  49 +++-----
 src/write.c              | 170 ++++++++++++++++++++++++++++
 wscript                  |   8 +-
 9 files changed, 472 insertions(+), 350 deletions(-)
 create mode 100644 src/string.c
 create mode 100644 src/write.c

diff --git a/doc/reference.doxygen.in b/doc/reference.doxygen.in
index 66fda410..db972e34 100644
--- a/doc/reference.doxygen.in
+++ b/doc/reference.doxygen.in
@@ -270,7 +270,7 @@ SUBGROUPING            = YES
 # be useful for C code in case the coding convention dictates that all compound
 # types are typedef'ed and only the typedef is referenced, never the tag name.
 
-TYPEDEF_HIDES_STRUCT   = NO
+TYPEDEF_HIDES_STRUCT   = YES
 
 # The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
 # determine which symbols to keep in memory and which to flush to disk.
@@ -297,7 +297,7 @@ SYMBOL_CACHE_SIZE      = 0
 # Private class members and static file members will be hidden unless
 # the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
 
-EXTRACT_ALL            = NO
+EXTRACT_ALL            = YES
 
 # If the EXTRACT_PRIVATE tag is set to YES all private members of a class
 # will be included in the documentation.
@@ -480,14 +480,14 @@ SHOW_DIRECTORIES       = NO
 # This will remove the Files entry from the Quick Index and from the
 # Folder Tree View (if specified). The default is YES.
 
-SHOW_FILES             = YES
+SHOW_FILES             = NO
 
 # Set the SHOW_NAMESPACES tag to NO to disable the generation of the
 # Namespaces page.
 # This will remove the Namespaces entry from the Quick Index
 # and from the Folder Tree View (if specified). The default is YES.
 
-SHOW_NAMESPACES        = YES
+SHOW_NAMESPACES        = NO
 
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
 # doxygen should invoke to get the current version for each file (typically from
diff --git a/serd/serd.h b/serd/serd.h
index d3d2b03d..dd4b25b7 100644
--- a/serd/serd.h
+++ b/serd/serd.h
@@ -49,6 +49,10 @@
  * @{
  */
 
+typedef struct SerdNamespacesImpl* SerdNamespaces;
+typedef struct SerdReaderImpl*     SerdReader;
+
+
 /** RDF syntax */
 typedef enum {
 	SERD_TURTLE   = 1,
@@ -57,56 +61,31 @@ typedef enum {
 
 /** Type of RDF node. */
 typedef enum {
-	BLANK   = 1,
-	URI     = 2,
-	QNAME   = 3,
-	LITERAL = 4
+	BLANK   = 1,  ///< Blank node (resource with no URI)
+	URI     = 2,  ///< URI (universal identifier)
+	QNAME   = 3,  ///< CURIE/QName (URI shortened with a namespace)
+	LITERAL = 4   ///< Literal string (with optional lang or datatype)
 } SerdNodeType;
 
-
-/** @name String
- * @{
- */
-
-/** Measured UTF-8 string. */
-typedef struct {
-	size_t  n_bytes;
-	size_t  n_chars;
-	uint8_t buf[];
-} SerdString;
-
-/** Create a new UTF-8 string from @a utf8. */
-SERD_API
-SerdString*
-serd_string_new(const uint8_t* utf8);
-
-/** Copy @a string. */
-SERD_API
-SerdString*
-serd_string_copy(const SerdString* string);
-
-/** @} */
-
-
 /** @name URIs
  * @{
  */
 
-/** Range of memory. */
+/* Range of memory. */
 typedef struct {
 	const uint8_t* buf;
 	size_t         len;
 } SerdRange;
 
-/** Parsed URI. */
+/* Parsed URI. */
 typedef struct {
-	SerdRange scheme;
-	SerdRange authority;
-	SerdRange path_base;
-	SerdRange path;
-	SerdRange query;
-	SerdRange fragment;
-	bool      base_uri_has_authority;
+	SerdRange scheme; ///< Scheme
+	SerdRange authority; ///< Authority
+	SerdRange path_base; ///< Path prefix if relative
+	SerdRange path; ///< Path suffix
+	SerdRange query; ///< Query
+	SerdRange fragment; ///< Fragment
+	bool      base_uri_has_authority; ///< True iff base URI has authority
 } SerdURI;
 
 /** Return true iff @a utf8 is a relative URI string. */
@@ -129,11 +108,52 @@ SERD_API
 bool
 serd_uri_write(const SerdURI* uri, FILE* file);
 
+/** Sink function for raw string output. */
+typedef size_t (*SerdSink)(const uint8_t* buf, size_t len, void* stream);
+
+/** Serialise @a uri with a series of calls to @a sink. */
+SERD_API
+size_t
+serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream);
+
+/** @} */
+
+/** @name String
+ * @{
+ */
+
+/** Measured UTF-8 string. */
+typedef struct {
+	size_t  n_bytes;  ///< Size in bytes including trailing null byte
+	size_t  n_chars;  ///< Length in characters
+	uint8_t buf[];    ///< Buffer
+} SerdString;
+
+/** Create a new UTF-8 string from @a utf8. */
+SERD_API
+SerdString*
+serd_string_new(const uint8_t* utf8);
+
+/** Copy @a string. */
+SERD_API
+SerdString*
+serd_string_copy(const SerdString* string);
+
 /** Serialise @a uri to a string. */
 SERD_API
 SerdString*
-serd_uri_serialise(const SerdURI* uri,
-                   SerdURI*       out);
+serd_string_new_from_uri(const SerdURI* uri,
+                         SerdURI*       out);
+
+SERD_API
+bool
+serd_write_node(FILE*             file,
+                const SerdURI*    base_uri,
+                SerdNamespaces    ns,
+                SerdNodeType      type,
+                const SerdString* str,
+                const SerdString* datatype,
+                const SerdString* lang);
 
 /** @} */
 
@@ -142,9 +162,6 @@ serd_uri_serialise(const SerdURI* uri,
  * @{
  */
 
-/** Reader. */
-typedef struct SerdReaderImpl* SerdReader;
-
 /** Handler for base URI changes. */
 typedef bool (*SerdBaseHandler)(void*             handle,
                                 const SerdString* uri);
@@ -194,8 +211,6 @@ serd_reader_free(SerdReader reader);
  * @{
  */
 
-typedef struct SerdNamespacesImpl* SerdNamespaces;
-
 /** Create a new namespaces dictionary. */
 SERD_API
 SerdNamespaces
diff --git a/src/namespaces.c b/src/namespaces.c
index fab53ea3..f18ebcff 100644
--- a/src/namespaces.c
+++ b/src/namespaces.c
@@ -32,48 +32,6 @@ struct SerdNamespacesImpl {
 	size_t         n_namespaces;
 };
 
-static inline size_t
-utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes)
-{
-	size_t n_chars = 0;
-	size_t i       = 0;
-	for (; utf8[i]; ++i) {
-		if ((utf8[i] & 0xC0) != 0x80) {
-			// Does not start with `10', start of a new character
-			++n_chars;
-		}
-	}
-	if (out_n_bytes) {
-		*out_n_bytes = i + 1;
-	}
-	return n_chars;
-}
-
-SERD_API
-SerdString*
-serd_string_new(const uint8_t* utf8)
-{
-	size_t n_bytes;
-	size_t n_chars = utf8_strlen(utf8, &n_bytes);
-	SerdString* const str = malloc(sizeof(SerdString) + n_bytes);
-	str->n_bytes = n_bytes;
-	str->n_chars = n_chars;
-	memcpy(str->buf, utf8, str->n_bytes);
-	return str;
-}
-
-SERD_API
-SerdString*
-serd_string_copy(const SerdString* s)
-{
-	if (s) {
-		SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes);
-		memcpy(copy, s, sizeof(SerdString) + s->n_bytes);
-		return copy;
-	}
-	return NULL;
-}
-
 SERD_API
 SerdNamespaces
 serd_namespaces_new()
@@ -148,9 +106,9 @@ serd_namespaces_expand(SerdNamespaces    ns,
 	SerdNamespace* const record = serd_namespaces_find(ns, qname->buf, colon - qname->buf);
 	if (record) {
 		uri_prefix->buf = record->uri->buf;
-		uri_prefix->len = record->uri->n_bytes;
+		uri_prefix->len = record->uri->n_bytes - 1;
 		uri_suffix->buf = colon + 1;
-		uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 1;
+		uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 2;
 		return true;
 	}
 	return false;
diff --git a/src/reader.c b/src/reader.c
index bf0eea11..52999180 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -140,22 +140,6 @@ readahead(SerdReader parser, uint8_t* pre, int n)
 	return true;
 }
 
-static inline unsigned
-utf8_char_len(const uint8_t b0)
-{
-	if ((b0 & 0x80) == 0) {  // Starts with `0'
-		return 1;
-	} else if ((b0 & 0xE0) == 0xC0) {  // Starts with `110'
-		return 2;
-	} else if ((b0 & 0xF0) == 0xE0) {  // Starts with `1110'
-		return 3;
-	} else if ((b0 & 0xF8) == 0xF0) {  // Starts with `11110'
-		return 4;
-	} else {
-		return 0;
-	}
-}
-
 static inline uchar
 peek_utf8_char(SerdReader parser, unsigned* n_bytes)
 {
@@ -334,52 +318,82 @@ read_hex(SerdReader parser)
 	}
 }
 
-static inline uchar
-read_hex_escape(SerdReader parser, unsigned length)
+static inline bool
+read_hex_escape(SerdReader parser, unsigned length, Ref dest)
 {
-	uchar   ret      = 0;
-	uint8_t chars[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-	uint8_t code[4]  = { 0, 0, 0, 0 };
+	uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 	for (unsigned i = 0; i < length; ++i) {
-		chars[i] = read_hex(parser);
-	}
-
-	sscanf((const char*)chars, "%X", (uint32_t*)code);
-	const uint32_t code_num = *(uint32_t*)code;
-	if (code_num < 0x80) {
-		fprintf(stderr, "1 byte UTF-8 escape\n");
-		return code[0];
-	} else if (code_num < 0x800) {
-		fprintf(stderr, "2 byte UTF-8 escape\n");
-		fprintf(stderr, "B0 %X\n", code[0]);
-		fprintf(stderr, "B1 %X\n", code[1]);
-		fprintf(stderr, "B2 %X\n", code[2]);
-		fprintf(stderr, "B3 %X\n", code[3]);
-		ret = ((0xC0 + ((code[3] & 0x1F) << 2) + ((code[4] & 0xC0) >> 6)) << 8)
-			+ (code[4] & 0x3F);
-		fprintf(stderr, "RET %X\n", ret);
-	} else if (code_num < 0x10000) {
-		fprintf(stderr, "3 byte UTF-8 escape\n");
+		buf[i] = read_hex(parser);
+	}
+
+	uint32_t c = 0;  // code point; never left uninitialised
+	if (sscanf((const char*)buf, "%X", &c) != 1) { return false; }  // no hex digits parsed
+
+	unsigned size = 0;
+	if (c < 0x00000080) {
+		size = 1;
+	} else if (c < 0x00000800) {
+		size = 2;
+	} else if (c < 0x00010000) {
+		size = 3;
+	} else if (c < 0x00200000) {
+		size = 4;
+	} else if (c < 0x04000000) {
+		size = 5;
+	} else if (c < 0x80000000) {
+		size = 6;
 	} else {
-		fprintf(stderr, "4 byte UTF-8 escape\n");
+		return false;
 	}
-	return ret;
+
+	// Build output in buf, last byte first; every case falls through on purpose
+	// (Note # of bytes = # of leading 1 bits in first byte)
+	switch (size) {
+	case 6:
+		buf[5] = 0x80 | (uint8_t)(c & 0x3F);
+		c >>= 6;
+		c |= (4 << 24);  // set bit 2, fall through
+	case 5:
+		buf[4] = 0x80 | (uint8_t)(c & 0x3F);
+		c >>= 6;
+		c |= (8 << 18);  // set bit 3, fall through
+	case 4:
+		buf[3] = 0x80 | (uint8_t)(c & 0x3F);
+		c >>= 6;
+		c |= (16 << 12);  // set bit 4, fall through
+	case 3:
+		buf[2] = 0x80 | (uint8_t)(c & 0x3F);
+		c >>= 6;
+		c |= (32 << 6);  // set bit 5, fall through
+	case 2:
+		buf[1] = 0x80 | (uint8_t)(c & 0x3F);
+		c >>= 6;
+		c |= 0xC0;  // set bits 6 and 7, fall through
+	case 1:
+		buf[0] = (uint8_t)c;
+	}
+
+	for (unsigned i = 0; i < size; ++i) {
+		push_char(parser, dest, buf[i]);
+	}
+	return true;
 }
 
-static inline uchar
-character_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_character_escape(SerdReader parser, Ref dest)
 {
-	switch (esc) {
+	switch (peek_char(parser)) {
 	case '\\':
-		return eat_char(parser, '\\');
+		push_char(parser, dest, eat_char(parser, '\\'));
+		return true;
 	case 'u':
-		eat_char(parser, esc);
-		return read_hex_escape(parser, 4);
+		eat_char(parser, 'u');
+		return read_hex_escape(parser, 4, dest);
 	case 'U':
-		eat_char(parser, esc);
-		return read_hex_escape(parser, 8);
+		eat_char(parser, 'U');
+		return read_hex_escape(parser, 8, dest);
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -387,117 +401,124 @@ character_escape(SerdReader parser, const uchar esc)
 //                  | '\U' hex hex hex hex hex hex hex hex
 //                  | '\\'
 //                  | [#x20-#x5B] | [#x5D-#x10FFFF]
-static inline uchar
-read_character(SerdReader parser)
+static inline bool
+read_character(SerdReader parser, Ref dest)
 {
 	const uchar c = peek_char(parser);
-	uchar       esc;
 	switch (c) {
 	case '\\':
 		eat_char(parser, '\\');
-		esc = character_escape(parser, peek_char(parser));
-		if (esc) {
-			return esc;
+		if (read_character_escape(parser, dest)) {
+			return true;
 		} else {
-			return error(parser, "illegal escape `\\%c'\n", esc);
+			return error(parser, "invalid escape `\\%c'\n", peek_char(parser));
 		}
 	default:
-		if (in_range(c, 0x20, 0x5B) || in_range(c, 0x5D, 0x10FFF)) {
+		if (in_range(c, 0x20, 0x5B) || in_range(c, 0x5D, 0x10FFFF)) {
-			return eat_char(parser, c);
+			push_char(parser, dest, eat_char(parser, c));
+			return true;
 		} else {
-			return error(parser, "illegal character `%c'\n", c);
+			return error(parser, "invalid character `%c'\n", c);
 		}
 	}
 }
 
-static inline uchar
-echaracter_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_echaracter_escape(SerdReader parser, Ref dest)
 {
-	const uchar ret = character_escape(parser, esc);
-	if (ret) {
-		return ret;
+	if (read_character_escape(parser, dest)) {
+		return true;
 	}
-	switch (esc) {
+	switch (peek_char(parser)) {
 	case 't':
 		eat_char(parser, 't');
-		return '\t';
+		push_char(parser, dest, '\t');
+		return true;
 	case 'n':
 		eat_char(parser, 'n');
-		return '\n';
+		push_char(parser, dest, '\n');
+		return true;
 	case 'r':
 		eat_char(parser, 'r');
-		return '\r';
+		push_char(parser, dest, '\r');
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
 // [39] echaracter ::= character | '\t' | '\n' | '\r'
-static inline uchar
-read_echaracter(SerdReader parser)
+static inline bool
+read_echaracter(SerdReader parser, Ref dest)
 {
 	uchar c = peek_char(parser);
-	uchar esc;
 	switch (c) {
 	case '\\':
 		eat_char(parser, '\\');
-		esc = echaracter_escape(parser, peek_char(parser));
-		if (esc) {
-			return esc;
+		if (read_echaracter_escape(parser, dest)) {
+			return true;
 		} else {
-			return error(parser, "illegal escape `\\%c'\n", esc);
+			return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
 		}
 	default:
-		return read_character(parser);
+		return read_character(parser, dest);
 	}
 }
 
-static inline uchar
-scharacter_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_scharacter_escape(SerdReader parser, Ref dest)
 {
-	const uchar ret = echaracter_escape(parser, esc);
-	if (ret) {
-		return ret;
-	} else if (esc == '"') {
-		return eat_char(parser, '"');
+	if (read_echaracter_escape(parser, dest)) {
+		return true;
+	} else if (peek_char(parser) == '"') {
+		push_char(parser, dest, eat_char(parser, '"'));
+		return true;
 	}
-	return 0;
+	return false;
 }
 
-static inline uchar
-ucharacter_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_ucharacter_escape(SerdReader parser, Ref dest)
 {
-	const uchar ret = echaracter_escape(parser, esc);
-	if (ret) {
-		return ret;
-	} else if (esc == '>') {
-		return eat_char(parser, '>');
+	if (read_echaracter_escape(parser, dest)) {
+		return true;
+	} else if (peek_char(parser) == '>') {
+		push_char(parser, dest, eat_char(parser, '>'));
+		return true;
 	}
-	return 0;
+	return false;
 }
 
 // [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
-static inline uchar
-read_lcharacter(SerdReader parser, bool* is_escape)
+static inline bool
+read_lcharacter(SerdReader parser, Ref dest)
 {
-	*is_escape = false;
-	const uchar c = peek_char(parser);
-	uchar       esc;
+	uchar   c = peek_char(parser);
+	uint8_t pre[3] = { 0, 0, 0 };  // zeroed so a failed readahead never matches `"""'
 	switch (c) {
+	case '"':
+		readahead(parser, pre, 3);
+		if (pre[1] == '\"' && pre[2] == '\"') {
+			eat_char(parser, '\"');
+			eat_char(parser, '\"');
+			eat_char(parser, '\"');
+			return false;
+		} else {
+			push_char(parser, dest, eat_char(parser, '"'));
+			return true;
+		}
 	case '\\':
 		eat_char(parser, '\\');
-		esc = scharacter_escape(parser, peek_char(parser));
-		if (esc) {
-			*is_escape = true;
-			return esc;
+		if (read_scharacter_escape(parser, dest)) {
+			return true;
 		} else {
-			return error(parser, "illegal escape `\\%c'\n", esc);
+			return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
 		}
 	case 0x9: case 0xA: case 0xD:
-		eat_char(parser, c);
+		push_char(parser, dest, eat_char(parser, c));
-		return c;
+		return true;
 	default:
-		return read_echaracter(parser);
+		return read_echaracter(parser, dest);
 	}
 }
 
@@ -506,48 +527,39 @@ static inline bool
 read_scharacter(SerdReader parser, Ref dest)
 {
 	uchar c = peek_char(parser);
-	uchar esc;
 	switch (c) {
 	case '\\':
 		eat_char(parser, '\\');
-		esc = scharacter_escape(parser, peek_char(parser));
-		if (esc) {
-			push_char(parser, dest, esc);
+		if (read_scharacter_escape(parser, dest)) {
 			return true;
 		} else {
-			return error(parser, "illegal escape `\\%c'\n", esc);
+			return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
 		}
 	case '\"':
 		return false;
 	default:
-		c = read_character(parser);
-		if (c) {
-			push_char(parser, dest, c);
-		}
-		return c;
+		return read_character(parser, dest);
 	}
 }
 
 // Spec:   [41] ucharacter ::= ( character - #x3E ) | '\>'
 // Actual: [41] ucharacter ::= ( echaracter - #x3E ) | '\>'
-static inline uchar
-read_ucharacter(SerdReader parser)
+static inline bool
+read_ucharacter(SerdReader parser, Ref dest)
 {
-	const uchar c = peek_char(parser);
-	uchar       esc;
+	uchar c = peek_char(parser);
 	switch (c) {
 	case '\\':
 		eat_char(parser, '\\');
-		esc = ucharacter_escape(parser, peek_char(parser));
-		if (esc) {
-			return esc;
+		if (read_ucharacter_escape(parser, dest)) {
+			return true;
 		} else {
-			return error(parser, "illegal escape `\\%c'\n", esc);
+			return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
 		}
 	case '>':
-		return 0;
+		return false;
 	default:
-		return read_character(parser);
+		return read_character(parser, dest);
 	}
 }
 
@@ -607,21 +619,7 @@ read_longString(SerdReader parser)
 {
 	eat_string(parser, "\"\"\"", 3);
 	Ref   str = push_string(parser, "", 1);
-	uchar c;
-	bool  is_escape = false;
-	while ((c = read_lcharacter(parser, &is_escape)) != 0) {
-		if (c == '\"' && !is_escape) {
-			uint8_t pre[2];
-			readahead(parser, pre, 2);
-			if (pre[0] == '\"' && pre[1] == '\"') {
-				eat_char(parser, '\"');
-				eat_char(parser, '\"');
-				return str;
-			}
-		}
-		push_char(parser, str, c);
-	}
-	eat_string(parser, "\"\"\"", 3);
+	while (read_lcharacter(parser, str)) {}
 	return str;
 }
 
@@ -658,11 +656,8 @@ read_quotedString(SerdReader parser)
 static inline Ref
 read_relativeURI(SerdReader parser)
 {
-	uchar c;
-	Ref   str = push_string(parser, "", 1);
-	while ((c = read_ucharacter(parser)) != 0) {
-		push_char(parser, str, c);
-	}
+	Ref str = push_string(parser, "", 1);
+	while (read_ucharacter(parser, str)) {}
 	return str;
 }
 
diff --git a/src/serdi.c b/src/serdi.c
index f5be93c3..3f91e040 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -49,7 +49,7 @@ event_base(void*             handle,
 			assert(false);
 			return false;
 		}
-		base_uri_str = serd_uri_serialise(&abs_base_uri, &base_uri);
+		base_uri_str = serd_string_new_from_uri(&abs_base_uri, &base_uri);
 		// FIXME: double parse
 		serd_uri_parse(base_uri_str->buf, &base_uri);
 	} else {
@@ -83,7 +83,7 @@ event_prefix(void*             handle,
 			return false;
 		}
 		SerdURI     new_abs_uri;
-		SerdString* abs_uri_string = serd_uri_serialise(&abs_uri, &new_abs_uri);
+		SerdString* abs_uri_string = serd_string_new_from_uri(&abs_uri, &new_abs_uri);
 		serd_namespaces_add(state->ns, name, abs_uri_string);
 	} else {
 		serd_namespaces_add(state->ns, name, uri_string);
@@ -91,77 +91,6 @@ event_prefix(void*             handle,
 	return true;
 }
 
-static inline bool
-write_node(State*            state,
-           const SerdString* str,
-           SerdNodeType      type,
-           const SerdString* datatype,
-           const SerdString* lang)
-{
-	SerdRange uri_prefix;
-	SerdRange uri_suffix;
-	switch (type) {
-	case BLANK:
-		fwrite("_:", 1, 2, state->out_fd);
-		fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd);
-		break;
-	case QNAME:
-		if (!serd_namespaces_expand(state->ns, str, &uri_prefix, &uri_suffix)) {
-			fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf);
-			return false;
-		}
-		fwrite("<", 1, 1, state->out_fd);
-		fwrite(uri_prefix.buf, 1, uri_prefix.len - 1, state->out_fd);
-		fwrite(uri_suffix.buf, 1, uri_suffix.len - 1, state->out_fd);
-		fwrite(">", 1, 1, state->out_fd);
-		break;
-	case URI:
-		if (serd_uri_string_is_relative(str->buf)) {
-			SerdURI uri;
-			if (serd_uri_parse(str->buf, &uri)) {
-				SerdURI abs_uri;
-				if (serd_uri_resolve(&uri, &state->base_uri, &abs_uri)) {
-					fwrite("<", 1, 1, state->out_fd);
-					serd_uri_write(&abs_uri, state->out_fd);
-					fwrite(">", 1, 1, state->out_fd);
-					return true;
-				}
-			}
-		} else {
-			fwrite("<", 1, 1, state->out_fd);
-			fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd);
-			fwrite(">", 1, 1, state->out_fd);
-			return true;
-		}
-		return false;
-	case LITERAL:
-		fwrite("\"", 1, 1, state->out_fd);
-		for (size_t i = 0; i < str->n_bytes - 1; ++i) {
-			const char c = str->buf[i];
-			switch (c) {
-			case '\\': fwrite("\\\\", 1, 2, state->out_fd); break;
-			case '\n': fwrite("\\n",  1, 2, state->out_fd); break;
-			case '\r': fwrite("\\r",  1, 2, state->out_fd); break;
-			case '\t': fwrite("\\t",  1, 2, state->out_fd); break;
-			case '"':  fwrite("\\\"", 1, 2, state->out_fd); break;
-			default:
-				fwrite(&c, 1, 1, state->out_fd);
-			}
-		}
-		fwrite("\"", 1, 1, state->out_fd);
-		if (lang) {
-			fwrite("@\"", 1, 2, state->out_fd);
-			fwrite(lang->buf, 1, lang->n_bytes - 1, state->out_fd);
-			fwrite("\"", 1, 1, state->out_fd);
-		} else if (datatype) {
-			fwrite("^^", 1, 2, state->out_fd);
-			write_node(state, datatype, URI, NULL, NULL);
-		}
-		break;
-	}
-	return true;
-}
-
 static bool
 event_statement(void*             handle,
                 const SerdString* graph,
@@ -176,11 +105,14 @@ event_statement(void*             handle,
 {
 	State* const state = (State*)handle;
 	FILE* const  fd    = state->out_fd;
-	write_node(state, subject, subject_type, NULL, NULL);
+	serd_write_node(fd, &state->base_uri, state->ns,
+	                subject_type, subject, NULL, NULL);
 	fwrite(" ", 1, 1, fd);
-	write_node(state, predicate, predicate_type, NULL, NULL);
+	serd_write_node(fd, &state->base_uri, state->ns,
+	                predicate_type, predicate, NULL, NULL);
 	fwrite(" ", 1, 1, fd);
-	write_node(state, object, object_type, object_datatype, object_lang);
+	serd_write_node(fd, &state->base_uri, state->ns,
+	                object_type, object, object_datatype, object_lang);
 	fwrite(" .\n", 1, 3, fd);
 	return true;
 }
diff --git a/src/string.c b/src/string.c
new file mode 100644
index 00000000..301a98cc
--- /dev/null
+++ b/src/string.c
@@ -0,0 +1,65 @@
+/* Serd, an RDF serialisation library.
+ * Copyright 2011 David Robillard <d@drobilla.net>
+ * 
+ * Serd is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Serd is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "serd/serd.h"
+
+static inline size_t
+utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes)
+{
+	size_t n_chars = 0;
+	size_t i       = 0;
+	for (; utf8[i]; ++i) {
+		if ((utf8[i] & 0xC0) != 0x80) {
+			// Does not start with `10', start of a new character
+			++n_chars;
+		}
+	}
+	if (out_n_bytes) {
+		*out_n_bytes = i + 1;
+	}
+	return n_chars;
+}
+
+SERD_API
+SerdString*
+serd_string_new(const uint8_t* utf8)
+{
+	size_t n_bytes;
+	size_t n_chars = utf8_strlen(utf8, &n_bytes);
+	SerdString* const str = malloc(sizeof(SerdString) + n_bytes);
+	str->n_bytes = n_bytes;
+	str->n_chars = n_chars;
+	memcpy(str->buf, utf8, str->n_bytes);
+	return str;
+}
+
+SERD_API
+SerdString*
+serd_string_copy(const SerdString* s)
+{
+	if (s) {
+		SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes);
+		memcpy(copy, s, sizeof(SerdString) + s->n_bytes);
+		return copy;
+	}
+	return NULL;
+}
diff --git a/src/uri.c b/src/uri.c
index d98f07ff..1ff7a6d9 100644
--- a/src/uri.c
+++ b/src/uri.c
@@ -260,10 +260,9 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
 	return true;
 }
 
-typedef size_t (*Sink)(const void* data, size_t size, size_t nmemb, void* stream);
-
-static size_t
-serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream)
+SERD_API
+size_t
+serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
 {
 	/* See http://tools.ietf.org/html/rfc3986#section-5.3 */
 
@@ -271,16 +270,16 @@ serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream)
 #define WRITE(buf, len) \
 	write_size += len; \
 	if (len) { \
-		sink(buf, 1, len, stream); \
+		sink((const uint8_t*)buf, len, stream); \
 	}
 #define WRITE_CHAR(c) WRITE(&(c), 1)
 #define WRITE_COMPONENT(prefix, field, suffix) \
 	if ((field).len) { \
-		for (const char* c = prefix; *c != '\0'; ++c) { \
+		for (const uint8_t* c = (const uint8_t*)prefix; *c != '\0'; ++c) { \
 			WRITE(c, 1); \
 		} \
 		WRITE((field).buf, (field).len); \
-		for (const char* c = suffix; *c != '\0'; ++c) { \
+		for (const uint8_t* c = (const uint8_t*)suffix; *c != '\0'; ++c) { \
 			WRITE(c, 1); \
 		} \
 	}
@@ -354,26 +353,9 @@ serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream)
 		// Note uri->fragment.buf includes the leading `#'
 		WRITE_COMPONENT("", uri->fragment, "");
 	}
-	WRITE("\0", 1);
 	return write_size;
 }
 
-SERD_API
-bool
-serd_uri_write(const SerdURI* uri, FILE* file)
-{
-	//#if 0
-	SerdURI           flat_uri;
-	SerdString* const flat_uri_str = serd_uri_serialise(uri, &flat_uri);
-	if (flat_uri_str) {
-		fwrite(flat_uri_str->buf, 1, flat_uri_str->n_bytes - 1, file);
-		free(flat_uri_str);
-		return true;
-	}
-	return false;
-	//#endif
-	//return (serd_uri_serialise_internal(uri, (Sink)fwrite, file) > 0);
-}
 
 static size_t
 serd_uri_string_length(const SerdURI* uri)
@@ -393,18 +375,17 @@ serd_uri_string_length(const SerdURI* uri)
 }
 
 static size_t
-string_write(const void* data, size_t size, size_t nmemb, void* stream)
+string_sink(const uint8_t* buf, size_t len, void* stream)
 {
-	uint8_t**    ptr        = (uint8_t**)stream;
-	const size_t write_size = (size * nmemb);
-	memcpy(*ptr, data, write_size);
-	*ptr += write_size;
-	return nmemb;
+	uint8_t** ptr = (uint8_t**)stream;
+	memcpy(*ptr, buf, len);
+	*ptr += len;
+	return len;
 }
 
 SERD_API
 SerdString*
-serd_uri_serialise(const SerdURI* uri, SerdURI* out)
+serd_string_new_from_uri(const SerdURI* uri, SerdURI* out)
 {
 	const size_t len = serd_uri_string_length(uri);
 	SerdString*  str = malloc(sizeof(SerdString) + len + 1);
@@ -412,10 +393,10 @@ serd_uri_serialise(const SerdURI* uri, SerdURI* out)
 	str->n_chars = len;  // FIXME: UTF-8
 
 	uint8_t* ptr = str->buf;
-	const size_t actual_len = serd_uri_serialise_internal(uri, string_write, &ptr);
+	const size_t actual_len = serd_uri_serialise(uri, string_sink, &ptr);
 	
 	str->buf[actual_len] = '\0';
-	str->n_bytes = actual_len;
+	str->n_bytes = actual_len + 1;
 	str->n_chars = str->n_bytes - 1;
 
 	#ifdef URI_DEBUG
diff --git a/src/write.c b/src/write.c
new file mode 100644
index 00000000..c2b92d78
--- /dev/null
+++ b/src/write.c
@@ -0,0 +1,170 @@
+/* Serd, an RDF serialisation library.
+ * Copyright 2011 David Robillard <d@drobilla.net>
+ * 
+ * Serd is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Serd is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "serd/serd.h"
+
+static size_t
+file_sink(const uint8_t* buf, size_t len, void* stream)
+{
+	FILE* file = (FILE*)stream;
+	return fwrite(buf, 1, len, file);
+}
+
+static inline bool
+serd_write_uri(FILE* file, const SerdURI* uri)
+{
+	return serd_uri_serialise(uri, file_sink, file);
+}
+
+static bool
+serd_write_ascii(const uint8_t* utf8, size_t n_bytes, FILE* out_fd, const uint8_t esc)
+{
+	for (size_t i = 0; i < n_bytes;) {
+		uint8_t in = utf8[i++];
+		switch (in) {
+		case '\\': fwrite("\\\\", 1, 2, out_fd); continue;
+		case '\n': fwrite("\\n",  1, 2, out_fd); continue;
+		case '\r': fwrite("\\r",  1, 2, out_fd); continue;
+		case '\t': fwrite("\\t",  1, 2, out_fd); continue;
+		case '"':  if (esc == '"') { fwrite("\\\"", 1, 2, out_fd); continue; }
+		default: break;  // also reached by fallthrough when '"' is not the delimiter
+		}
+
+		if (in == esc) {
+			fprintf(out_fd, "\\u%04X", esc);
+			continue;
+		}
+
+		uint32_t c    = 0;
+		size_t   size = 0;
+		if ((in & 0x80) == 0) {  // Starts with `0'
+			size = 1;
+			c = in & 0x7F;
+			if ((in >= 0x20) && (in <= 0x7E)) {  // Printable ASCII
+				fwrite(&in, 1, 1, out_fd);
+				continue;
+			}
+		} else if ((in & 0xE0) == 0xC0) {  // Starts with `110'
+			size = 2;
+			c = in & 0x1F;
+		} else if ((in & 0xF0) == 0xE0) {  // Starts with `1110'
+			size = 3;
+			c = in & 0x0F;
+		} else if ((in & 0xF8) == 0xF0) {  // Starts with `11110'
+			size = 4;
+			c = in & 0x07;
+		} else if ((in & 0xFC) == 0xF8) {  // Starts with `111110'
+			size = 5;
+			c = in & 0x03;
+		} else if ((in & 0xFE) == 0xFC) {  // Starts with `1111110'
+			size = 6;
+			c = in & 0x01;
+		} else {
+			fprintf(stderr, "invalid UTF-8 at offset %zu: %X\n", i, in);
+			return false;
+		}
+
+#define READ_BYTE() do { \
+			if (i >= n_bytes) { fprintf(stderr, "truncated UTF-8\n"); return false; } \
+			in = utf8[i++] & 0x3f; \
+			c <<= 6; \
+			c |= in; \
+		} while (0)
+
+		switch (size) {
+		case 6: READ_BYTE();
+		case 5: READ_BYTE();
+		case 4: READ_BYTE();
+		case 3: READ_BYTE();
+		case 2: READ_BYTE();
+		}
+
+		if (c <= 0xFFFF) {  // U+FFFF itself still fits the 4-digit form
+			fprintf(out_fd, "\\u%04X", c);
+		} else {
+			fprintf(out_fd, "\\U%08X", c);
+		}
+	}
+	return true;
+}
+
+SERD_API
+bool
+serd_write_node(FILE*             fd,
+                const SerdURI*    base_uri,
+                SerdNamespaces    ns,
+                SerdNodeType      type,
+                const SerdString* str,
+                const SerdString* datatype,
+                const SerdString* lang)
+{
+	SerdRange uri_prefix;
+	SerdRange uri_suffix;
+	switch (type) {
+	case BLANK:
+		fwrite("_:", 1, 2, fd);
+		fwrite(str->buf, 1, str->n_bytes - 1, fd);
+		break;
+	case QNAME:
+		if (!serd_namespaces_expand(ns, str, &uri_prefix, &uri_suffix)) {
+			fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf);
+			return false;
+		}
+		fwrite("<", 1, 1, fd);
+		serd_write_ascii(uri_prefix.buf, uri_prefix.len, fd, '>');
+		serd_write_ascii(uri_suffix.buf, uri_suffix.len, fd, '>');
+		fwrite(">", 1, 1, fd);
+		break;
+	case URI:
+		if (serd_uri_string_is_relative(str->buf)) {
+			SerdURI uri;
+			if (serd_uri_parse(str->buf, &uri)) {
+				SerdURI abs_uri;
+				if (serd_uri_resolve(&uri, base_uri, &abs_uri)) {
+					fwrite("<", 1, 1, fd);
+					serd_write_uri(fd, &abs_uri);
+					fwrite(">", 1, 1, fd);
+					return true;
+				}
+			}
+		} else {
+			fwrite("<", 1, 1, fd);
+			serd_write_ascii(str->buf, str->n_bytes - 1, fd, '>');
+			fwrite(">", 1, 1, fd);
+			return true;
+		}
+		return false;
+	case LITERAL:
+		fwrite("\"", 1, 1, fd);
+		serd_write_ascii(str->buf, str->n_bytes - 1, fd, '"');
+		fwrite("\"", 1, 1, fd);
+		if (lang) {
+			// N-Triples language tags are bare (e.g. @en), not quoted
+			fwrite("@", 1, 1, fd);
+			fwrite(lang->buf, 1, lang->n_bytes - 1, fd);
+		} else if (datatype) {
+			fwrite("^^", 1, 2, fd);
+			serd_write_node(fd, base_uri, ns, URI, datatype, NULL, NULL);
+		}
+		break;
+	}
+	return true;
+}
+
diff --git a/wscript b/wscript
index 3367977e..b701c98c 100644
--- a/wscript
+++ b/wscript
@@ -54,7 +54,13 @@ def build(bld):
 	# Pkgconfig file
 	autowaf.build_pc(bld, 'SERD', SERD_VERSION, ['REDLAND'])
 
-	lib_source = 'src/reader.c src/namespaces.c src/uri.c'
+	lib_source = '''
+		src/namespaces.c
+		src/reader.c
+		src/string.c
+		src/uri.c
+		src/write.c
+	'''
 
 	# Library
 	obj = bld(features = 'c cshlib')
-- 
cgit v1.2.1