From 09c4bb6a3031d2951ff3d285936a91a4f87dca0f Mon Sep 17 00:00:00 2001
From: David Robillard <d@drobilla.net>
Date: Sat, 30 Mar 2013 16:37:21 +0000
Subject: Add more tests from the new W3C Turtle test suite. Support crazy
 escaped NULL characters in literals. Fix incorrect round-trip serialization
 test command.

git-svn-id: http://svn.drobilla.net/serd/trunk@446 490d8e77-9747-427b-9fa3-0b8f29cee8a0
---
 src/reader.c                                   |  28 +++++++++++++++++--------
 src/serdi.c                                    |   6 +++---
 tests/tests-ttl/LITERAL1_all_controls.nt       |   1 +
 tests/tests-ttl/LITERAL1_all_controls.ttl      | Bin 0 -> 77 bytes
 tests/tests-ttl/LITERAL1_all_controls.ttl.thru |   0
 tests/tests-ttl/LITERAL1_all_punctuation.nt    |   1 +
 tests/tests-ttl/LITERAL1_all_punctuation.ttl   |   1 +
 tests/tests-ttl/manifest.ttl                   |  16 ++++++++++++++
 wscript                                        |   2 +-
 9 files changed, 42 insertions(+), 13 deletions(-)
 create mode 100644 tests/tests-ttl/LITERAL1_all_controls.nt
 create mode 100644 tests/tests-ttl/LITERAL1_all_controls.ttl
 create mode 100644 tests/tests-ttl/LITERAL1_all_controls.ttl.thru
 create mode 100644 tests/tests-ttl/LITERAL1_all_punctuation.nt
 create mode 100644 tests/tests-ttl/LITERAL1_all_punctuation.ttl

diff --git a/src/reader.c b/src/reader.c
index 6233cf30..f86bb630 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -280,8 +280,8 @@ read_HEX(SerdReader* reader)
 }
 
 // Read UCHAR escape, initial \ is already eaten by caller
-static inline uint32_t
-read_UCHAR(SerdReader* reader, Ref dest)
+static inline bool
+read_UCHAR(SerdReader* reader, Ref dest, uint32_t* char_code)
 {
 	const uint8_t b      = peek_byte(reader);
 	unsigned      length = 0;
@@ -293,14 +293,14 @@ read_UCHAR(SerdReader* reader, Ref dest)
 		length = 4;
 		break;
 	default:
-		return 0;
+		return false;
 	}
 	eat_byte_safe(reader, b);
 
 	uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 	for (unsigned i = 0; i < length; ++i) {
 		if (!(buf[i] = read_HEX(reader))) {
-			return 0;
+			return false;
 		}
 	}
 
@@ -320,7 +320,8 @@ read_UCHAR(SerdReader* reader, Ref dest)
 		r_err(reader, SERD_ERR_BAD_SYNTAX,
 		      "unicode character 0x%X out of range\n", code);
 		push_replacement(reader, dest);
-		return 0xFFFD;
+		*char_code = 0xFFFD;
+		return true;
 	}
 
 	// Build output in buf
@@ -346,7 +347,8 @@ read_UCHAR(SerdReader* reader, Ref dest)
 	for (unsigned i = 0; i < size; ++i) {
 		push_byte(reader, dest, buf[i]);
 	}
-	return code;
+	*char_code = code;
+	return true;
 }
 
 // Read ECHAR escape, initial \ is already eaten by caller
@@ -521,10 +523,12 @@ read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
 	Ref ref = push_node(reader, SERD_LITERAL, "", 0);
 	while (true) {
 		const uint8_t c = peek_byte(reader);
+		uint32_t      code;
 		switch (c) {
 		case '\\':
 			eat_byte_safe(reader, c);
-			if (!read_ECHAR(reader, ref, flags) && !read_UCHAR(reader, ref)) {
+			if (!read_ECHAR(reader, ref, flags) &&
+			    !read_UCHAR(reader, ref, &code)) {
 				r_err(reader, SERD_ERR_BAD_SYNTAX,
 				      "invalid escape `\\%c'\n", peek_byte(reader));
 				return pop_node(reader, ref);
@@ -559,13 +563,15 @@ read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
 	Ref ref = push_node(reader, SERD_LITERAL, "", 0);
 	while (true) {
 		const uint8_t c = peek_byte(reader);
+		uint32_t      code;
 		switch (c) {
 		case '\n': case '\r':
 			r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string\n");
 			return pop_node(reader, ref);
 		case '\\':
 			eat_byte_safe(reader, c);
-			if (!read_ECHAR(reader, ref, flags) && !read_UCHAR(reader, ref)) {
+			if (!read_ECHAR(reader, ref, flags) &&
+			    !read_UCHAR(reader, ref, &code)) {
 				r_err(reader, SERD_ERR_BAD_SYNTAX,
 				      "invalid escape `\\%c'\n", peek_byte(reader));
 				return pop_node(reader, ref);
@@ -775,7 +781,11 @@ read_IRIREF(SerdReader* reader)
 			return ref;
 		case '\\':
 			eat_byte_safe(reader, c);
-			switch (code = read_UCHAR(reader, ref)) {
+			if (!read_UCHAR(reader, ref, &code)) {
+				r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n");
+				return pop_node(reader, ref);
+			}
+			switch (code) {
 			case 0: case ' ': case '<': case '>':
 				r_err(reader, SERD_ERR_BAD_SYNTAX,
 				      "invalid escaped IRI character %X %c\n", code, code);
diff --git a/src/serdi.c b/src/serdi.c
index 290ee1d9..e1de6e32 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -195,9 +195,9 @@ main(int argc, char** argv)
 		}
 	}
 
-	if (input_syntax != SERD_NTRIPLES  // Base URI may change (@base)
-	    || (output_syntax == SERD_TURTLE)) {
-		output_style |= SERD_STYLE_RESOLVED;
+	if (input_syntax != SERD_NTRIPLES || (output_style & SERD_STYLE_CURIED)) {
+		// Base URI may change and/or we're abbreviating URIs, so must resolve
+		output_style |= SERD_STYLE_RESOLVED;
 	}
 
 	if (bulk_write) {
diff --git a/tests/tests-ttl/LITERAL1_all_controls.nt b/tests/tests-ttl/LITERAL1_all_controls.nt
new file mode 100644
index 00000000..91c8af14
--- /dev/null
+++ b/tests/tests-ttl/LITERAL1_all_controls.nt
@@ -0,0 +1 @@
+<http://a.example/s> <http://a.example/p> "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\t\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" .
diff --git a/tests/tests-ttl/LITERAL1_all_controls.ttl b/tests/tests-ttl/LITERAL1_all_controls.ttl
new file mode 100644
index 00000000..dbf3721c
Binary files /dev/null and b/tests/tests-ttl/LITERAL1_all_controls.ttl differ
diff --git a/tests/tests-ttl/LITERAL1_all_controls.ttl.thru b/tests/tests-ttl/LITERAL1_all_controls.ttl.thru
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/tests-ttl/LITERAL1_all_punctuation.nt b/tests/tests-ttl/LITERAL1_all_punctuation.nt
new file mode 100644
index 00000000..c25d818f
--- /dev/null
+++ b/tests/tests-ttl/LITERAL1_all_punctuation.nt
@@ -0,0 +1 @@
+<http://a.example/s> <http://a.example/p> " !\"#$%&():;<=>?@[]^_`{|}~" .
diff --git a/tests/tests-ttl/LITERAL1_all_punctuation.ttl b/tests/tests-ttl/LITERAL1_all_punctuation.ttl
new file mode 100644
index 00000000..7b1d9e54
--- /dev/null
+++ b/tests/tests-ttl/LITERAL1_all_punctuation.ttl
@@ -0,0 +1 @@
+<http://a.example/s> <http://a.example/p> ' !"#$%&():;<=>?@[]^_`{|}~' .
diff --git a/tests/tests-ttl/manifest.ttl b/tests/tests-ttl/manifest.ttl
index 7b9a5f4b..d38ed41d 100644
--- a/tests/tests-ttl/manifest.ttl
+++ b/tests/tests-ttl/manifest.ttl
@@ -66,6 +66,8 @@
     <#first>
     <#last>
     <#LITERAL1>
+    <#LITERAL1_all_controls>
+    <#LITERAL1_all_punctuation>
     <#LITERAL_LONG1>
     <#LITERAL_LONG1_with_1_squote>
     <#LITERAL_LONG1_with_2_squotes>
@@ -565,6 +567,20 @@
    mf:result    <LITERAL1.nt> ;
    .
 
+<#LITERAL1_all_controls> rdf:type rdft:TestTurtleEval ;
+   mf:name      "LITERAL1_all_controls" ;
+   rdfs:comment "LITERAL1_all_controls '\\x00\\x01\\x02\\x03\\x04...'" ;
+   mf:action    <LITERAL1_all_controls.ttl> ;
+   mf:result    <LITERAL1_all_controls.nt> ;
+   .
+
+<#LITERAL1_all_punctuation> rdf:type rdft:TestTurtleEval ;
+   mf:name      "LITERAL1_all_punctuation" ;
+   rdfs:comment "LITERAL1_all_punctuation '!\"#$%&()...'" ;
+   mf:action    <LITERAL1_all_punctuation.ttl> ;
+   mf:result    <LITERAL1_all_punctuation.nt> ;
+   .
+
 <#LITERAL_LONG1> rdf:type rdft:TestTurtleEval ;
    mf:name      "LITERAL_LONG1" ;
    rdfs:comment "LITERAL_LONG1 '''x'''" ;
diff --git a/wscript b/wscript
index 3b3ed4d9..7277e336 100644
--- a/wscript
+++ b/wscript
@@ -286,7 +286,7 @@ def test_thru(ctx, base, path, check_filename, flags):
     in_filename = os.path.join(ctx.path.abspath(), path);
     out_filename = path + '.thru'
 
-    command = ('%s %s -i ntriples -o turtle -p foo "%s" "%s" | '
+    command = ('%s %s -i turtle -o turtle -p foo "%s" "%s" | '
                '%s -i turtle -o ntriples -c foo - "%s" > %s') % (
         'serdi_static', flags.ljust(5),
         in_filename, base,
-- 
cgit v1.2.1