aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Robillard <d@drobilla.net> 2023-02-05 12:42:52 -0500
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:08 -0500
commit: ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77 (patch)
tree: 9b050faf6286c055d2fc78729eb4b56a12e3746c
parent: d35082a57adac79703f2c9bb72da468172a209c5 (diff)
downloadserd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.tar.gz
serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.tar.bz2
serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.zip
Strengthen handling of corrupt UTF-8 input
-rw-r--r-- NEWS 3
-rw-r--r-- src/read_utf8.c 15
-rw-r--r-- src/string_utils.h 20
-rw-r--r-- src/writer.c 4
-rw-r--r-- test/extra/lax/test-bad-uri-nq-out.nq 2
-rw-r--r-- test/extra/lax/test-bad-uri-out.nt 2
-rw-r--r-- test/extra/lax/test-bad-utf8-nq-out.nq 32
-rw-r--r-- test/extra/lax/test-bad-utf8-nt-out.nt 32
-rw-r--r-- test/extra/lax/test-bad-utf8-ttl-out.nt 36
-rw-r--r-- test/extra/lax/test-bad-utf8.nq 32
-rw-r--r-- test/extra/lax/test-bad-utf8.nt 32
-rw-r--r-- test/extra/lax/test-bad-utf8.ttl 66
12 files changed, 234 insertions, 42 deletions
diff --git a/NEWS b/NEWS
index 7df3b70a..de5fe0fd 100644
--- a/NEWS
+++ b/NEWS
@@ -13,11 +13,12 @@ serd (1.1.1) unstable; urgency=medium
* Rename SerdChunk to SerdStringView
* Simplify statement flags
* Simplify writer style options and write UTF-8 by default
+ * Strengthen handling of corrupt UTF-8 input
* Support writing all escapes in Turtle and TriG prefixed names
* Use a fixed-size reader stack
* Use char* for strings in public API
- -- David Robillard <d@drobilla.net> Wed, 13 Jul 2022 21:43:56 +0000
+ -- David Robillard <d@drobilla.net> Mon, 19 Dec 2022 20:54:56 +0000
serd (0.32.0) stable; urgency=medium
diff --git a/src/read_utf8.c b/src/read_utf8.c
index fb8ed0e2..f86bbeba 100644
--- a/src/read_utf8.c
+++ b/src/read_utf8.c
@@ -11,21 +11,10 @@
#define MAX_UTF8_BYTES 4U
static SerdStatus
-skip_invalid_utf8(SerdReader* const reader)
-{
- for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
- skip_byte(reader, b);
- b = peek_byte(reader);
- }
-
- return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
-}
-
-static SerdStatus
bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
{
r_err(reader, SERD_BAD_SYNTAX, fmt, c);
- return skip_invalid_utf8(reader);
+ return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
}
static SerdStatus
@@ -48,7 +37,7 @@ read_utf8_continuation_bytes(SerdReader* const reader,
}
const uint8_t byte = (uint8_t)b;
- if (!(byte & 0x80U)) {
+ if (!is_utf8_continuation(byte)) {
return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
}
diff --git a/src/string_utils.h b/src/string_utils.h
index 8f7ea083..9de03fa0 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -48,6 +48,26 @@ is_xdigit(const int c)
return is_hexdig(c) || in_range(c, 'a', 'f');
}
+/**
+ * UTF-8: Leading bytes start with 0, or two to four 1s followed by a 0.
+ *
+ * Returns true if `c` has the bit pattern 0xxxxxxx (a single-byte
+ * character), 110xxxxx, 1110xxxx, or 11110xxx (the first byte of a 2-, 3-,
+ * or 4-byte sequence).  Continuation bytes (10xxxxxx) and bytes with five or
+ * more leading 1s (0xF8..0xFF) yield false.
+ *
+ * This is a bit-pattern test only: it does not check whether the byte can
+ * actually begin a well-formed sequence, so values such as 0xC0, 0xC1, and
+ * 0xF5..0xF7 are accepted as leading bytes.
+ */
+static inline bool
+is_utf8_leading(const uint8_t c)
+{
+ static const uint8_t m1 = 0x80U; // 10000000: top bit (clear for 0xxxxxxx)
+ static const uint8_t m2 = 0xC0U; // 11000000: the 110xxxxx prefix
+ static const uint8_t m3 = 0xE0U; // 11100000: mask for 110xxxxx, and the 1110xxxx prefix
+ static const uint8_t m4 = 0xF0U; // 11110000: mask for 1110xxxx, and the 11110xxx prefix
+ static const uint8_t m5 = 0xF8U; // 11111000: mask for 11110xxx
+
+ return (c & m1) == 0U || (c & m3) == m2 || (c & m4) == m3 || (c & m5) == m4;
+}
+
+/**
+ * UTF-8: Continuation bytes start with 10.
+ *
+ * Returns true if the top two bits of `c` are 10, that is, if `c` is in the
+ * range 0x80..0xBF.  Bytes starting with 0 (0x00..0x7F) or with 11
+ * (0xC0..0xFF) yield false.
+ */
+static inline bool
+is_utf8_continuation(const uint8_t c)
+{
+ return (c & 0xC0U) == 0x80U; // keep the top two bits, compare with 10000000
+}
+
static inline bool
is_space(const char c)
{
diff --git a/src/writer.c b/src/writer.c
index be199af4..482721f9 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -369,7 +369,7 @@ write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
if (size == 0) {
// Corrupt input, write percent-encoded bytes and scan to next start
char escape[4] = {0, 0, 0, 0};
- for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
+ for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]);
len += sink(escape, 3, writer);
}
@@ -590,7 +590,7 @@ write_text(SerdWriter* writer,
if (size == 0) {
// Corrupt input, write replacement character and scan to the next start
st = esink(replacement_char, sizeof(replacement_char), writer);
- for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
+ for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
}
} else {
i += size - 1U;
diff --git a/test/extra/lax/test-bad-uri-nq-out.nq b/test/extra/lax/test-bad-uri-nq-out.nq
index 8cb00ba7..66da9b0d 100644
--- a/test/extra/lax/test-bad-uri-nq-out.nq
+++ b/test/extra/lax/test-bad-uri-nq-out.nq
@@ -1,4 +1,4 @@
<http://example.org/s> <http://example.org/p> <http://example.org/\u0009bado1> .
<http://example.org/s> <http://example.org/p> <http://example.org/goodo1> .
-<http://example.org/s> <http://example.org/p> <http://example.org/\uFFFDbado2> .
+<http://example.org/s> <http://example.org/p> <http://example.org/\uFFFD\uFFFDbado2> .
<http://example.org/s> <http://example.org/p> <http://example.org/goodo2> .
diff --git a/test/extra/lax/test-bad-uri-out.nt b/test/extra/lax/test-bad-uri-out.nt
index 8cb00ba7..66da9b0d 100644
--- a/test/extra/lax/test-bad-uri-out.nt
+++ b/test/extra/lax/test-bad-uri-out.nt
@@ -1,4 +1,4 @@
<http://example.org/s> <http://example.org/p> <http://example.org/\u0009bado1> .
<http://example.org/s> <http://example.org/p> <http://example.org/goodo1> .
-<http://example.org/s> <http://example.org/p> <http://example.org/\uFFFDbado2> .
+<http://example.org/s> <http://example.org/p> <http://example.org/\uFFFD\uFFFDbado2> .
<http://example.org/s> <http://example.org/p> <http://example.org/goodo2> .
diff --git a/test/extra/lax/test-bad-utf8-nq-out.nq b/test/extra/lax/test-bad-utf8-nq-out.nq
index 8cefa258..9970c1dd 100644
--- a/test/extra/lax/test-bad-utf8-nq-out.nq
+++ b/test/extra/lax/test-bad-utf8-nq-out.nq
@@ -1,3 +1,29 @@
-<http://example.org/s> <http://example.org/p> "Impossible bytes: \uFFFD \uFFFD" .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD" .
-<http://example.org/s> <http://example.org/p> "Missing continuation: \uFFFD" .
+<http://example.org/s> <http://example.org/p> "The other values of this property should align nicely" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 1: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 2: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Four impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "First continuation byte: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Last continuation byte: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3 continuation bytes: \uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U-000007FF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
diff --git a/test/extra/lax/test-bad-utf8-nt-out.nt b/test/extra/lax/test-bad-utf8-nt-out.nt
index 8cefa258..9970c1dd 100644
--- a/test/extra/lax/test-bad-utf8-nt-out.nt
+++ b/test/extra/lax/test-bad-utf8-nt-out.nt
@@ -1,3 +1,29 @@
-<http://example.org/s> <http://example.org/p> "Impossible bytes: \uFFFD \uFFFD" .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD" .
-<http://example.org/s> <http://example.org/p> "Missing continuation: \uFFFD" .
+<http://example.org/s> <http://example.org/p> "The other values of this property should align nicely" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 1: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 2: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Four impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "First continuation byte: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Last continuation byte: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3 continuation bytes: \uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U-000007FF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
diff --git a/test/extra/lax/test-bad-utf8-ttl-out.nt b/test/extra/lax/test-bad-utf8-ttl-out.nt
index 58f2c52b..c8364786 100644
--- a/test/extra/lax/test-bad-utf8-ttl-out.nt
+++ b/test/extra/lax/test-bad-utf8-ttl-out.nt
@@ -1,6 +1,30 @@
-<http://example.org/s> <http://example.org/p> "Impossible bytes: \uFFFD \uFFFD" .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD" .
-<http://example.org/s> <http://example.org/p> "Missing continuation: \uFFFD" .
-<http://example.org/s> <http://example.org/p> "Impossible bytes: \uFFFD \uFFFD" .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD" .
-<http://example.org/s> <http://example.org/p> "Missing continuation: \uFFFD" .
+<http://example.org/s> <http://example.org/p> "The other values of this property should align nicely" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 1: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 2: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Four impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "First continuation byte: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Last continuation byte: \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2 continuation bytes: \uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3 continuation bytes: \uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U+0000): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U-000007FF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |" .
+<http://example.org/s> <http://example.org/p> "6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" .
+<http://example.org/s> <http://example.org/p> "\nImpossible byte 1: \uFFFD |\nImpossible byte 2: \uFFFD |\nFour impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |\nFirst continuation byte: \uFFFD |\nLast continuation byte: \uFFFD |\n2 continuation bytes: \uFFFD\uFFFD |\n3 continuation bytes: \uFFFD\uFFFD\uFFFD |\n4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |\n5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\n6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\n7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nLonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |\n2-byte sequence with last byte missing (U+0000): \uFFFD |\n3-byte sequence with last byte missing (U+0000): \uFFFD |\n4-byte sequence with last byte missing (U+0000): \uFFFD |\n2-byte sequence with last byte missing (U-000007FF): \uFFFD |\n3-byte sequence 
with last byte missing (U-0000FFFF): \uFFFD |\n4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |\n6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\n" .
diff --git a/test/extra/lax/test-bad-utf8.nq b/test/extra/lax/test-bad-utf8.nq
index b8c04637..de67830b 100644
--- a/test/extra/lax/test-bad-utf8.nq
+++ b/test/extra/lax/test-bad-utf8.nq
@@ -1,3 +1,29 @@
-<http://example.org/s> <http://example.org/p> "Impossible bytes: " .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: " .
-<http://example.org/s> <http://example.org/p> "Missing continuation: " .
+<http://example.org/s> <http://example.org/p> "The other values of this property should align nicely" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 1: |" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 2: |" .
+<http://example.org/s> <http://example.org/p> "Four impossible bytes: |" .
+<http://example.org/s> <http://example.org/p> "First continuation byte: |" .
+<http://example.org/s> <http://example.org/p> "Last continuation byte: |" .
+<http://example.org/s> <http://example.org/p> "2 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "3 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "4 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "5 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "6 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "7 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 1: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 2: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 3: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 4: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 1: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 2: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 3-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 4-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 5-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 6-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U-000007FF): |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U-0000FFFF): |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U-001FFFFF): |" .
+<http://example.org/s> <http://example.org/p> "6 sequences with last byte missing: |" .
diff --git a/test/extra/lax/test-bad-utf8.nt b/test/extra/lax/test-bad-utf8.nt
index b8c04637..de67830b 100644
--- a/test/extra/lax/test-bad-utf8.nt
+++ b/test/extra/lax/test-bad-utf8.nt
@@ -1,3 +1,29 @@
-<http://example.org/s> <http://example.org/p> "Impossible bytes: " .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: " .
-<http://example.org/s> <http://example.org/p> "Missing continuation: " .
+<http://example.org/s> <http://example.org/p> "The other values of this property should align nicely" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 1: |" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 2: |" .
+<http://example.org/s> <http://example.org/p> "Four impossible bytes: |" .
+<http://example.org/s> <http://example.org/p> "First continuation byte: |" .
+<http://example.org/s> <http://example.org/p> "Last continuation byte: |" .
+<http://example.org/s> <http://example.org/p> "2 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "3 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "4 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "5 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "6 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "7 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 1: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 2: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 3: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 4: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 1: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 2: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 3-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 4-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 5-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 6-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U-000007FF): |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U-0000FFFF): |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U-001FFFFF): |" .
+<http://example.org/s> <http://example.org/p> "6 sequences with last byte missing: |" .
diff --git a/test/extra/lax/test-bad-utf8.ttl b/test/extra/lax/test-bad-utf8.ttl
index 0e177366..e5640078 100644
--- a/test/extra/lax/test-bad-utf8.ttl
+++ b/test/extra/lax/test-bad-utf8.ttl
@@ -1,6 +1,60 @@
-<http://example.org/s> <http://example.org/p> "Impossible bytes: " .
-<http://example.org/s> <http://example.org/p> "2 continuation bytes: " .
-<http://example.org/s> <http://example.org/p> "Missing continuation: " .
-<http://example.org/s> <http://example.org/p> """Impossible bytes: """ .
-<http://example.org/s> <http://example.org/p> """2 continuation bytes: """ .
-<http://example.org/s> <http://example.org/p> """Missing continuation: """ .
+<http://example.org/s> <http://example.org/p> "The other values of this property should align nicely" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 1: |" .
+<http://example.org/s> <http://example.org/p> "Impossible byte 2: |" .
+<http://example.org/s> <http://example.org/p> "Four impossible bytes: |" .
+<http://example.org/s> <http://example.org/p> "First continuation byte: |" .
+<http://example.org/s> <http://example.org/p> "Last continuation byte: |" .
+<http://example.org/s> <http://example.org/p> "2 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "3 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "4 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "5 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "6 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "7 continuation bytes: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 1: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 2: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 3: |" .
+<http://example.org/s> <http://example.org/p> "Continuation bytes 4: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 1: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 2-byte sequences 2: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 3-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 4-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 5-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "Lonely leading bytes of 6-byte sequences: |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U+0000): |" .
+<http://example.org/s> <http://example.org/p> "2-byte sequence with last byte missing (U-000007FF): |" .
+<http://example.org/s> <http://example.org/p> "3-byte sequence with last byte missing (U-0000FFFF): |" .
+<http://example.org/s> <http://example.org/p> "4-byte sequence with last byte missing (U-001FFFFF): |" .
+<http://example.org/s> <http://example.org/p> "6 sequences with last byte missing: |" .
+
+<http://example.org/s> <http://example.org/p> """
+Impossible byte 1: |
+Impossible byte 2: |
+Four impossible bytes: |
+First continuation byte: |
+Last continuation byte: |
+2 continuation bytes: |
+3 continuation bytes: |
+4 continuation bytes: |
+5 continuation bytes: |
+6 continuation bytes: |
+7 continuation bytes: |
+Continuation bytes 1: |
+Continuation bytes 2: |
+Continuation bytes 3: |
+Continuation bytes 4: |
+Lonely leading bytes of 2-byte sequences 1: |
+Lonely leading bytes of 2-byte sequences 2: |
+Lonely leading bytes of 3-byte sequences: |
+Lonely leading bytes of 4-byte sequences: |
+Lonely leading bytes of 5-byte sequences: |
+Lonely leading bytes of 6-byte sequences: |
+2-byte sequence with last byte missing (U+0000): |
+3-byte sequence with last byte missing (U+0000): |
+4-byte sequence with last byte missing (U+0000): |
+2-byte sequence with last byte missing (U-000007FF): |
+3-byte sequence with last byte missing (U-0000FFFF): |
+4-byte sequence with last byte missing (U-001FFFFF): |
+6 sequences with last byte missing: |
+""" .