about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2017-07-09 20:43:26 +0200
committer David Robillard <d@drobilla.net> 2017-07-10 12:06:56 +0200
commit d292cd3ac24f954069bc6ecb3d9356b20d8f6100 (patch)
tree 619ff3638f4cf6de82a7279c78086fcb7dfd4c6a
parent 58f153dcdf1c5424ed3cefb3ce59e63b30f68f27 (diff)
download serd-d292cd3ac24f954069bc6ecb3d9356b20d8f6100.tar.gz
serd-d292cd3ac24f954069bc6ecb3d9356b20d8f6100.tar.bz2
serd-d292cd3ac24f954069bc6ecb3d9356b20d8f6100.zip
Gracefully handle applications that write corrupt UTF-8
-rw-r--r-- NEWS 3
-rw-r--r-- src/writer.c 20
-rw-r--r-- tests/serd_test.c 15
3 files changed, 26 insertions, 12 deletions
diff --git a/NEWS b/NEWS
index 0798b1a1..c5825c77 100644
--- a/NEWS
+++ b/NEWS
@@ -4,8 +4,9 @@ serd (0.27.2) unstable;
* Add serd_node_from_substring()
* Fix strict parsing of abolute URI schemes
* Fix parsing of hex escapes in file URIs (thanks Johannes Mueller)
+ * Gracefully handle applications that write corrupt UTF-8
- -- David Robillard <d@drobilla.net> Sun, 09 Jul 2017 14:58:47 +0200
+ -- David Robillard <d@drobilla.net> Sun, 09 Jul 2017 20:43:13 +0200
serd (0.26.0) stable;
diff --git a/src/writer.c b/src/writer.c
index c293b4f8..10637aaf 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -62,6 +62,8 @@ typedef struct {
uint8_t space_after_sep; ///< Newline after sep if after sep
} SepRule;
+static const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD };
+
static const SepRule rules[] = {
{ NULL, 0, 0, 0, 0 },
{ " .\n\n", 4, 0, 0, 0 },
@@ -166,7 +168,7 @@ parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
case 1: case 2: case 3: case 4:
break;
default:
- return 0;
+ return *size = 0;
}
uint32_t c = utf8[0] & ((1 << (8 - *size)) - 1);
@@ -182,10 +184,8 @@ parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
static size_t
write_character(SerdWriter* writer, const uint8_t* utf8, size_t* size)
{
- const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD };
- char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-
- const uint32_t c = parse_utf8_char(writer, utf8, size);
+ char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ const uint32_t c = parse_utf8_char(writer, utf8, size);
switch (*size) {
case 0:
w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[0]);
@@ -245,6 +245,11 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
size_t size = 0;
len += write_character(writer, utf8 + i, &size);
i += size;
+ if (size == 0) {
+ // Corrupt input, write replacement char and scan to next start
+ sink(replacement_char, sizeof(replacement_char), writer);
+ for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+ }
}
return len;
}
@@ -351,9 +356,10 @@ write_text(SerdWriter* writer, TextContext ctx,
size_t size = 0;
len += write_character(writer, utf8 + i - 1, &size);
-
if (size == 0) {
- return len;
+ // Corrupt input, write replacement char and scan to next start
+ sink(replacement_char, sizeof(replacement_char), writer);
+ for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
}
i += size - 1;
diff --git a/tests/serd_test.c b/tests/serd_test.c
index 6c854241..9399f822 100644
--- a/tests/serd_test.c
+++ b/tests/serd_test.c
@@ -531,6 +531,7 @@ main(void)
{ &SERD_NODE_NULL, &p, &o, NULL, NULL },
{ &s, &o, &o, NULL, NULL },
{ &o, &p, &o, NULL, NULL },
+ { &s, &p, &SERD_NODE_NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL, NULL } };
for (unsigned i = 0; i < sizeof(junk) / (sizeof(SerdNode*) * 5); ++i) {
if (!serd_writer_write_statement(
@@ -560,10 +561,16 @@ main(void)
}
}
- // Write 1 statement with bad UTF-8 (should be replaced)
+ // Write statements with bad UTF-8 (should be replaced)
+ const uint8_t bad_str[] = { 0xFF, 0x90, 'h', 'i', 0 };
+ SerdNode bad_lit = serd_node_from_string(SERD_LITERAL, bad_str);
+ SerdNode bad_uri = serd_node_from_string(SERD_URI, bad_str);
if (serd_writer_write_statement(writer, 0, NULL,
- &s, &p, &o, NULL, NULL)) {
- return failure("Failed to write junk UTF-8\n");
+ &s, &p, &bad_lit, NULL, NULL)) {
+ return failure("Failed to write junk UTF-8 literal\n");
+ } else if (serd_writer_write_statement(writer, 0, NULL,
+ &s, &p, &bad_uri, NULL, NULL)) {
+ return failure("Failed to write junk UTF-8 URI\n");
}
// Write 1 valid statement
@@ -624,7 +631,7 @@ main(void)
const SerdStatus st = serd_reader_read_file(reader, USTR(path));
if (st) {
return failure("Error reading file (%s)\n", serd_strerror(st));
- } else if (rt->n_statements != 12) {
+ } else if (rt->n_statements != 13) {
return failure("Bad statement count %d\n", rt->n_statements);
} else if (!rt->graph || !rt->graph->buf ||
strcmp((const char*)rt->graph->buf, "http://example.org/")) {