about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2013-12-24 22:43:25 +0000
committer David Robillard <d@drobilla.net> 2013-12-24 22:43:25 +0000
commit 2ecbc85f122fcf850f7254e54653255e999cd119 (patch)
tree 032f8eeb99411f3686f0e3a41d00788c0300ab39 /src
parent a38f60f807f7f00a9f88b2d59e3c3e776f41de8a (diff)
download serd-2ecbc85f122fcf850f7254e54653255e999cd119.tar.gz
serd-2ecbc85f122fcf850f7254e54653255e999cd119.tar.bz2
serd-2ecbc85f122fcf850f7254e54653255e999cd119.zip
Update to latest Turtle test suite.
Support UTF-8 in blank node names. Support idiotic SPARQL escaping in local names.

git-svn-id: http://svn.drobilla.net/serd/trunk@452 490d8e77-9747-427b-9fa3-0b8f29cee8a0
Diffstat (limited to 'src')
-rw-r--r-- src/reader.c 11
-rw-r--r-- src/writer.c 70
2 files changed, 65 insertions, 16 deletions
diff --git a/src/reader.c b/src/reader.c
index f86bb630..664cb361 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -615,7 +615,10 @@ static bool
read_PN_CHARS_BASE(SerdReader* reader, Ref dest)
{
const uint8_t c = peek_byte(reader);
- if (is_alpha(c)) { // TODO: UTF-8
+ if ((c & 0x80)) { // Multi-byte character
+ return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
+ }
+ if (is_alpha(c)) {
push_byte(reader, dest, eat_byte_safe(reader, c));
return true;
}
@@ -626,7 +629,11 @@ static bool
read_PN_CHARS(SerdReader* reader, Ref dest)
{
const uint8_t c = peek_byte(reader);
- if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { // TODO: UTF-8
+ if ((c & 0x80)) { // Multi-byte character
+ return !read_utf8_character(reader, dest, eat_byte_safe(reader, c));
+ }
+
+ if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') {
push_byte(reader, dest, eat_byte_safe(reader, c));
return true;
}
diff --git a/src/writer.c b/src/writer.c
index b0e61cb1..69d51b53 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -137,7 +137,9 @@ copy_node(SerdNode* dst, const SerdNode* src)
static inline size_t
sink(const void* buf, size_t len, SerdWriter* writer)
{
- if (writer->style & SERD_STYLE_BULK) {
+ if (len == 0) {
+ return 0;
+ } else if (writer->style & SERD_STYLE_BULK) {
return serd_bulk_sink_write(buf, len, &writer->bulk_sink);
} else {
return writer->sink(buf, len, writer->stream);
@@ -171,7 +173,7 @@ parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
uint8_t in = utf8[i++];
#define READ_BYTE() \
- in = utf8[i++] & 0x3f; \
+ in = utf8[i++] & 0x3F; \
c = (c << 6) | in;
switch (*size) {
@@ -242,21 +244,62 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
}
}
- if (j > i) {
- // Bulk write all characters up to this special one
- len += sink(&utf8[i], j - i, writer);
- i = j;
- continue;
+ // Bulk write all characters up to this special one
+ len += sink(&utf8[i], j - i, writer);
+ if ((i = j) == n_bytes) {
+ break; // Reached end
}
// Write UTF-8 character
size_t size = 0;
len += write_character(writer, utf8 + i, &size);
i += size;
+ }
+ return len;
+}
- if (size == 0) {
- return len;
+static bool
+lname_must_escape(const uint8_t c)
+{
+ /* This arbitrary list of characters, most of which have nothing to do with
+ Turtle, must be handled as special cases here because the RDF and SPARQL
+ WGs are apparently intent on making the once elegant Turtle a baroque
+ and inconsistent mess, throwing elegance and extensibility completely
+ out the window for no good reason.
+
+ Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped
+ in local names, so they are not escaped here. */
+
+ switch (c) {
+ case '\'': case '!': case '#': case '$': case '%': case '&':
+ case '(': case ')': case '*': case '+': case ',': case '/':
+ case ';': case '=': case '?': case '@': case '~':
+ return true;
+ }
+ return false;
+}
+
+static size_t
+write_lname(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
+{
+ size_t len = 0;
+ for (size_t i = 0; i < n_bytes; ++i) {
+ size_t j = i; // Index of next character that must be escaped
+ for (; j < n_bytes; ++j) {
+ if (lname_must_escape(utf8[j])) {
+ break;
+ }
}
+
+ // Bulk write all characters up to this special one
+ len += sink(&utf8[i], j - i, writer);
+ if ((i = j) == n_bytes) {
+ break; // Reached end
+ }
+
+ // Write escape
+ len += sink("\\", 1, writer);
+ len += sink(&utf8[i], 1, writer);
}
return len;
}
@@ -276,10 +319,9 @@ write_text(SerdWriter* writer, TextContext ctx,
}
}
- if (j > i) {
- len += sink(&utf8[i], j - i, writer);
- i = j;
- continue;
+ len += sink(&utf8[i], j - i, writer);
+ if ((i = j) == n_bytes) {
+ break; // Reached end
}
uint8_t in = utf8[i++];
@@ -447,7 +489,7 @@ write_node(SerdWriter* writer,
sink(">", 1, writer);
break;
case SERD_TURTLE:
- sink(node->buf, node->n_bytes, writer);
+ write_lname(writer, node->buf, node->n_bytes);
}
break;
case SERD_LITERAL: