From ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 5 Feb 2023 12:42:52 -0500 Subject: Strengthen handling of corrupt UTF-8 input --- NEWS | 3 +- src/read_utf8.c | 15 +------- src/string_utils.h | 20 ++++++++++ src/writer.c | 4 +- test/extra/lax/test-bad-uri-nq-out.nq | 2 +- test/extra/lax/test-bad-uri-out.nt | 2 +- test/extra/lax/test-bad-utf8-nq-out.nq | 32 ++++++++++++++-- test/extra/lax/test-bad-utf8-nt-out.nt | 32 ++++++++++++++-- test/extra/lax/test-bad-utf8-ttl-out.nt | 36 +++++++++++++++--- test/extra/lax/test-bad-utf8.nq | 32 ++++++++++++++-- test/extra/lax/test-bad-utf8.nt | 32 ++++++++++++++-- test/extra/lax/test-bad-utf8.ttl | 66 ++++++++++++++++++++++++++++++--- 12 files changed, 234 insertions(+), 42 deletions(-) diff --git a/NEWS b/NEWS index 7df3b70a..de5fe0fd 100644 --- a/NEWS +++ b/NEWS @@ -13,11 +13,12 @@ serd (1.1.1) unstable; urgency=medium * Rename SerdChunk to SerdStringView * Simplify statement flags * Simplify writer style options and write UTF-8 by default + * Strengthen handling of corrupt UTF-8 input * Support writing all escapes in Turtle and TriG prefixed names * Use a fixed-size reader stack * Use char* for strings in public API - -- David Robillard Wed, 13 Jul 2022 21:43:56 +0000 + -- David Robillard Mon, 19 Dec 2022 20:54:56 +0000 serd (0.32.0) stable; urgency=medium diff --git a/src/read_utf8.c b/src/read_utf8.c index fb8ed0e2..f86bbeba 100644 --- a/src/read_utf8.c +++ b/src/read_utf8.c @@ -10,22 +10,11 @@ #define MAX_UTF8_BYTES 4U -static SerdStatus -skip_invalid_utf8(SerdReader* const reader) -{ - for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) { - skip_byte(reader, b); - b = peek_byte(reader); - } - - return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE; -} - static SerdStatus bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c) { r_err(reader, SERD_BAD_SYNTAX, fmt, c); - return skip_invalid_utf8(reader); + return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE; } static SerdStatus @@ -48,7 +37,7 @@ read_utf8_continuation_bytes(SerdReader* const reader, } const uint8_t byte = (uint8_t)b; - if (!(byte & 0x80U)) { + if (!is_utf8_continuation(byte)) { return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte); } diff --git a/src/string_utils.h b/src/string_utils.h index 8f7ea083..9de03fa0 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -48,6 +48,26 @@ is_xdigit(const int c) return is_hexdig(c) || in_range(c, 'a', 'f'); } +/** UTF-8: Leading bytes start with 0, or two to four 1s followed by a 0 */ +static inline bool +is_utf8_leading(const uint8_t c) +{ + static const uint8_t m1 = 0x80U; // 10000000 + static const uint8_t m2 = 0xC0U; // 11000000 + static const uint8_t m3 = 0xE0U; // 11100000 + static const uint8_t m4 = 0xF0U; // 11110000 + static const uint8_t m5 = 0xF8U; // 11111000 + + return (c & m1) == 0U || (c & m3) == m2 || (c & m4) == m3 || (c & m5) == m4; +} + +/** UTF-8: Continuation bytes start with 10 */ +static inline bool +is_utf8_continuation(const uint8_t c) +{ + return (c & 0xC0U) == 0x80U; +} + static inline bool is_space(const char c) { diff --git a/src/writer.c b/src/writer.c index be199af4..482721f9 100644 --- a/src/writer.c +++ b/src/writer.c @@ -369,7 +369,7 @@ write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st) if (size == 0) { // Corrupt input, write percent-encoded bytes and scan to next start char escape[4] = {0, 0, 0, 0}; - for (; i < n_bytes && (utf8[i] & 0x80); ++i) { + for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) { snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]); len += sink(escape, 3, writer); } @@ -590,7 +590,7 @@ write_text(SerdWriter* writer, if (size == 0) { // Corrupt input, write replacement character and scan to the next start st = esink(replacement_char, sizeof(replacement_char), writer); - for (; i < n_bytes && (utf8[i] & 0x80); ++i) { + for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) { } } else { i += size - 1U; diff --git a/test/extra/lax/test-bad-uri-nq-out.nq b/test/extra/lax/test-bad-uri-nq-out.nq index 8cb00ba7..66da9b0d 100644 --- a/test/extra/lax/test-bad-uri-nq-out.nq +++ b/test/extra/lax/test-bad-uri-nq-out.nq @@ -1,4 +1,4 @@ . . - . + . . diff --git a/test/extra/lax/test-bad-uri-out.nt b/test/extra/lax/test-bad-uri-out.nt index 8cb00ba7..66da9b0d 100644 --- a/test/extra/lax/test-bad-uri-out.nt +++ b/test/extra/lax/test-bad-uri-out.nt @@ -1,4 +1,4 @@ . . - . + . . diff --git a/test/extra/lax/test-bad-utf8-nq-out.nq b/test/extra/lax/test-bad-utf8-nq-out.nq index 8cefa258..9970c1dd 100644 --- a/test/extra/lax/test-bad-utf8-nq-out.nq +++ b/test/extra/lax/test-bad-utf8-nq-out.nq @@ -1,3 +1,29 @@ - "Impossible bytes: \uFFFD \uFFFD" . - "2 continuation bytes: \uFFFD" . - "Missing continuation: \uFFFD" . + "The other values of this property should align nicely" . + "Impossible byte 1: \uFFFD |" . + "Impossible byte 2: \uFFFD |" . + "Four impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" . + "First continuation byte: \uFFFD |" . + "Last continuation byte: \uFFFD |" . + "2 continuation bytes: \uFFFD\uFFFD |" . + "3 continuation bytes: \uFFFD\uFFFD\uFFFD |" . + "4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" . + "5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Lonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |" . + "2-byte sequence with last byte missing (U+0000): \uFFFD |" . + "3-byte sequence with last byte missing (U+0000): \uFFFD |" . + "4-byte sequence with last byte missing (U+0000): \uFFFD |" . + "2-byte sequence with last byte missing (U-000007FF): \uFFFD |" . + "3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |" . + "4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |" . + "6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . diff --git a/test/extra/lax/test-bad-utf8-nt-out.nt b/test/extra/lax/test-bad-utf8-nt-out.nt index 8cefa258..9970c1dd 100644 --- a/test/extra/lax/test-bad-utf8-nt-out.nt +++ b/test/extra/lax/test-bad-utf8-nt-out.nt @@ -1,3 +1,29 @@ - "Impossible bytes: \uFFFD \uFFFD" . - "2 continuation bytes: \uFFFD" . - "Missing continuation: \uFFFD" . + "The other values of this property should align nicely" . + "Impossible byte 1: \uFFFD |" . + "Impossible byte 2: \uFFFD |" . + "Four impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" . + "First continuation byte: \uFFFD |" . + "Last continuation byte: \uFFFD |" . + "2 continuation bytes: \uFFFD\uFFFD |" . + "3 continuation bytes: \uFFFD\uFFFD\uFFFD |" . + "4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" . + "5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Lonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |" . + "2-byte sequence with last byte missing (U+0000): \uFFFD |" . + "3-byte sequence with last byte missing (U+0000): \uFFFD |" . + "4-byte sequence with last byte missing (U+0000): \uFFFD |" . + "2-byte sequence with last byte missing (U-000007FF): \uFFFD |" . + "3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |" . + "4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |" . + "6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . diff --git a/test/extra/lax/test-bad-utf8-ttl-out.nt b/test/extra/lax/test-bad-utf8-ttl-out.nt index 58f2c52b..c8364786 100644 --- a/test/extra/lax/test-bad-utf8-ttl-out.nt +++ b/test/extra/lax/test-bad-utf8-ttl-out.nt @@ -1,6 +1,30 @@ - "Impossible bytes: \uFFFD \uFFFD" . - "2 continuation bytes: \uFFFD" . - "Missing continuation: \uFFFD" . - "Impossible bytes: \uFFFD \uFFFD" . - "2 continuation bytes: \uFFFD" . - "Missing continuation: \uFFFD" . + "The other values of this property should align nicely" . + "Impossible byte 1: \uFFFD |" . + "Impossible byte 2: \uFFFD |" . + "Four impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" . + "First continuation byte: \uFFFD |" . + "Last continuation byte: \uFFFD |" . + "2 continuation bytes: \uFFFD\uFFFD |" . + "3 continuation bytes: \uFFFD\uFFFD\uFFFD |" . + "4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |" . + "5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Continuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "Lonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |" . + "Lonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |" . + "2-byte sequence with last byte missing (U+0000): \uFFFD |" . + "3-byte sequence with last byte missing (U+0000): \uFFFD |" . + "4-byte sequence with last byte missing (U+0000): \uFFFD |" . + "2-byte sequence with last byte missing (U-000007FF): \uFFFD |" . + "3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |" . + "4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |" . + "6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |" . + "\nImpossible byte 1: \uFFFD |\nImpossible byte 2: \uFFFD |\nFour impossible bytes: \uFFFD\uFFFD\uFFFD\uFFFD |\nFirst continuation byte: \uFFFD |\nLast continuation byte: \uFFFD |\n2 continuation bytes: \uFFFD\uFFFD |\n3 continuation bytes: \uFFFD\uFFFD\uFFFD |\n4 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD |\n5 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\n6 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\n7 continuation bytes: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 1: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 2: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 3: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nContinuation bytes 4: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\nLonely leading bytes of 2-byte sequences 1: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 2-byte sequences 2: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 3-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 4-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 5-byte sequences: \uFFFD \uFFFD \uFFFD \uFFFD |\nLonely leading bytes of 6-byte sequences: \uFFFD \uFFFD |\n2-byte sequence with last byte missing (U+0000): \uFFFD |\n3-byte sequence with last byte missing (U+0000): \uFFFD |\n4-byte sequence with last byte missing (U+0000): \uFFFD |\n2-byte sequence with last byte missing (U-000007FF): \uFFFD |\n3-byte sequence with last byte missing (U-0000FFFF): \uFFFD |\n4-byte sequence with last byte missing (U-001FFFFF): \uFFFD |\n6 sequences with last byte missing: \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD |\n" . diff --git a/test/extra/lax/test-bad-utf8.nq b/test/extra/lax/test-bad-utf8.nq index b8c04637..de67830b 100644 --- a/test/extra/lax/test-bad-utf8.nq +++ b/test/extra/lax/test-bad-utf8.nq @@ -1,3 +1,29 @@ - "Impossible bytes: " . - "2 continuation bytes: " . - "Missing continuation: " . + "The other values of this property should align nicely" . + "Impossible byte 1: |" . + "Impossible byte 2: |" . + "Four impossible bytes: |" . + "First continuation byte: |" . + "Last continuation byte: |" . + "2 continuation bytes: |" . + "3 continuation bytes: |" . + "4 continuation bytes: |" . + "5 continuation bytes: |" . + "6 continuation bytes: |" . + "7 continuation bytes: |" . + "Continuation bytes 1: |" . + "Continuation bytes 2: |" . + "Continuation bytes 3: |" . + "Continuation bytes 4: |" . + "Lonely leading bytes of 2-byte sequences 1: |" . + "Lonely leading bytes of 2-byte sequences 2: |" . + "Lonely leading bytes of 3-byte sequences: |" . + "Lonely leading bytes of 4-byte sequences: |" . + "Lonely leading bytes of 5-byte sequences: |" . + "Lonely leading bytes of 6-byte sequences: |" . + "2-byte sequence with last byte missing (U+0000): |" . + "3-byte sequence with last byte missing (U+0000): |" . + "4-byte sequence with last byte missing (U+0000): |" . + "2-byte sequence with last byte missing (U-000007FF): |" . + "3-byte sequence with last byte missing (U-0000FFFF): |" . + "4-byte sequence with last byte missing (U-001FFFFF): |" . + "6 sequences with last byte missing: |" . diff --git a/test/extra/lax/test-bad-utf8.nt b/test/extra/lax/test-bad-utf8.nt index b8c04637..de67830b 100644 --- a/test/extra/lax/test-bad-utf8.nt +++ b/test/extra/lax/test-bad-utf8.nt @@ -1,3 +1,29 @@ - "Impossible bytes: " . - "2 continuation bytes: " . - "Missing continuation: " . + "The other values of this property should align nicely" . + "Impossible byte 1: |" . + "Impossible byte 2: |" . + "Four impossible bytes: |" . + "First continuation byte: |" . + "Last continuation byte: |" . + "2 continuation bytes: |" . + "3 continuation bytes: |" . + "4 continuation bytes: |" . + "5 continuation bytes: |" . + "6 continuation bytes: |" . + "7 continuation bytes: |" . + "Continuation bytes 1: |" . + "Continuation bytes 2: |" . + "Continuation bytes 3: |" . + "Continuation bytes 4: |" . + "Lonely leading bytes of 2-byte sequences 1: |" . + "Lonely leading bytes of 2-byte sequences 2: |" . + "Lonely leading bytes of 3-byte sequences: |" . + "Lonely leading bytes of 4-byte sequences: |" . + "Lonely leading bytes of 5-byte sequences: |" . + "Lonely leading bytes of 6-byte sequences: |" . + "2-byte sequence with last byte missing (U+0000): |" . + "3-byte sequence with last byte missing (U+0000): |" . + "4-byte sequence with last byte missing (U+0000): |" . + "2-byte sequence with last byte missing (U-000007FF): |" . + "3-byte sequence with last byte missing (U-0000FFFF): |" . + "4-byte sequence with last byte missing (U-001FFFFF): |" . + "6 sequences with last byte missing: |" . diff --git a/test/extra/lax/test-bad-utf8.ttl b/test/extra/lax/test-bad-utf8.ttl index 0e177366..e5640078 100644 --- a/test/extra/lax/test-bad-utf8.ttl +++ b/test/extra/lax/test-bad-utf8.ttl @@ -1,6 +1,60 @@ - "Impossible bytes: " . - "2 continuation bytes: " . - "Missing continuation: " . - """Impossible bytes: """ . - """2 continuation bytes: """ . - """Missing continuation: """ . + "The other values of this property should align nicely" . + "Impossible byte 1: |" . + "Impossible byte 2: |" . + "Four impossible bytes: |" . + "First continuation byte: |" . + "Last continuation byte: |" . + "2 continuation bytes: |" . + "3 continuation bytes: |" . + "4 continuation bytes: |" . + "5 continuation bytes: |" . + "6 continuation bytes: |" . + "7 continuation bytes: |" . + "Continuation bytes 1: |" . + "Continuation bytes 2: |" . + "Continuation bytes 3: |" . + "Continuation bytes 4: |" . + "Lonely leading bytes of 2-byte sequences 1: |" . + "Lonely leading bytes of 2-byte sequences 2: |" . + "Lonely leading bytes of 3-byte sequences: |" . + "Lonely leading bytes of 4-byte sequences: |" . + "Lonely leading bytes of 5-byte sequences: |" . + "Lonely leading bytes of 6-byte sequences: |" . + "2-byte sequence with last byte missing (U+0000): |" . + "3-byte sequence with last byte missing (U+0000): |" . + "4-byte sequence with last byte missing (U+0000): |" . + "2-byte sequence with last byte missing (U-000007FF): |" . + "3-byte sequence with last byte missing (U-0000FFFF): |" . + "4-byte sequence with last byte missing (U-001FFFFF): |" . + "6 sequences with last byte missing: |" . + + """ +Impossible byte 1: | +Impossible byte 2: | +Four impossible bytes: | +First continuation byte: | +Last continuation byte: | +2 continuation bytes: | +3 continuation bytes: | +4 continuation bytes: | +5 continuation bytes: | +6 continuation bytes: | +7 continuation bytes: | +Continuation bytes 1: | +Continuation bytes 2: | +Continuation bytes 3: | +Continuation bytes 4: | +Lonely leading bytes of 2-byte sequences 1: | +Lonely leading bytes of 2-byte sequences 2: | +Lonely leading bytes of 3-byte sequences: | +Lonely leading bytes of 4-byte sequences: | +Lonely leading bytes of 5-byte sequences: | +Lonely leading bytes of 6-byte sequences: | +2-byte sequence with last byte missing (U+0000): | +3-byte sequence with last byte missing (U+0000): | +4-byte sequence with last byte missing (U+0000): | +2-byte sequence with last byte missing (U-000007FF): | +3-byte sequence with last byte missing (U-0000FFFF): | +4-byte sequence with last byte missing (U-001FFFFF): | +6 sequences with last byte missing: | +""" . -- cgit v1.2.1