From d292cd3ac24f954069bc6ecb3d9356b20d8f6100 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 9 Jul 2017 20:43:26 +0200 Subject: Gracefully handle applications that write corrupt UTF-8 --- NEWS | 3 ++- src/writer.c | 20 +++++++++++++------- tests/serd_test.c | 15 +++++++++++---- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/NEWS b/NEWS index 0798b1a1..c5825c77 100644 --- a/NEWS +++ b/NEWS @@ -4,8 +4,9 @@ serd (0.27.2) unstable; * Add serd_node_from_substring() * Fix strict parsing of abolute URI schemes * Fix parsing of hex escapes in file URIs (thanks Johannes Mueller) + * Gracefully handle applications that write corrupt UTF-8 - -- David Robillard Sun, 09 Jul 2017 14:58:47 +0200 + -- David Robillard Sun, 09 Jul 2017 20:43:13 +0200 serd (0.26.0) stable; diff --git a/src/writer.c b/src/writer.c index c293b4f8..10637aaf 100644 --- a/src/writer.c +++ b/src/writer.c @@ -62,6 +62,8 @@ typedef struct { uint8_t space_after_sep; ///< Newline after sep if after sep } SepRule; +static const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD }; + static const SepRule rules[] = { { NULL, 0, 0, 0, 0 }, { " .\n\n", 4, 0, 0, 0 }, @@ -166,7 +168,7 @@ parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size) case 1: case 2: case 3: case 4: break; default: - return 0; + return *size = 0; } uint32_t c = utf8[0] & ((1 << (8 - *size)) - 1); @@ -182,10 +184,8 @@ parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size) static size_t write_character(SerdWriter* writer, const uint8_t* utf8, size_t* size) { - const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD }; - char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - - const uint32_t c = parse_utf8_char(writer, utf8, size); + char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + const uint32_t c = parse_utf8_char(writer, utf8, size); switch (*size) { case 0: w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[0]); @@ -245,6 +245,11 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) size_t size = 0; len += write_character(writer, utf8 + i, &size); i += size; + if (size == 0) { + // Corrupt input, write replacement char and scan to next start + sink(replacement_char, sizeof(replacement_char), writer); + for (; i < n_bytes && (utf8[i] & 0x80); ++i) {} + } } return len; } @@ -351,9 +356,10 @@ write_text(SerdWriter* writer, TextContext ctx, size_t size = 0; len += write_character(writer, utf8 + i - 1, &size); - if (size == 0) { - return len; + // Corrupt input, write replacement char and scan to next start + sink(replacement_char, sizeof(replacement_char), writer); + for (; i < n_bytes && (utf8[i] & 0x80); ++i) {} } i += size - 1; diff --git a/tests/serd_test.c b/tests/serd_test.c index 6c854241..9399f822 100644 --- a/tests/serd_test.c +++ b/tests/serd_test.c @@ -531,6 +531,7 @@ main(void) { &SERD_NODE_NULL, &p, &o, NULL, NULL }, { &s, &o, &o, NULL, NULL }, { &o, &p, &o, NULL, NULL }, + { &s, &p, &SERD_NODE_NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL } }; for (unsigned i = 0; i < sizeof(junk) / (sizeof(SerdNode*) * 5); ++i) { if (!serd_writer_write_statement( @@ -560,10 +561,16 @@ main(void) } } - // Write 1 statement with bad UTF-8 (should be replaced) + // Write statements with bad UTF-8 (should be replaced) + const uint8_t bad_str[] = { 0xFF, 0x90, 'h', 'i', 0 }; + SerdNode bad_lit = serd_node_from_string(SERD_LITERAL, bad_str); + SerdNode bad_uri = serd_node_from_string(SERD_URI, bad_str); if (serd_writer_write_statement(writer, 0, NULL, - &s, &p, &o, NULL, NULL)) { - return failure("Failed to write junk UTF-8\n"); + &s, &p, &bad_lit, NULL, NULL)) { + return failure("Failed to write junk UTF-8 literal\n"); + } else if (serd_writer_write_statement(writer, 0, NULL, + &s, &p, &bad_uri, NULL, NULL)) { + return failure("Failed to write junk UTF-8 URI\n"); } // Write 1 valid statement @@ -624,7 +631,7 @@ main(void) const SerdStatus st = serd_reader_read_file(reader, USTR(path)); if (st) { return failure("Error reading file (%s)\n", serd_strerror(st)); - } else if (rt->n_statements != 12) { + } else if (rt->n_statements != 13) { return failure("Bad statement count %d\n", rt->n_statements); } else if (!rt->graph || !rt->graph->buf || strcmp((const char*)rt->graph->buf, "http://example.org/")) { -- cgit v1.2.1