From 1423442a9a34c93874ca6896a7b037bf08569aa5 Mon Sep 17 00:00:00 2001
From: David Robillard
Date: Tue, 29 Aug 2017 11:51:37 +0200
Subject: Fix writing of corrupt UTF-8

---
 src/serd_internal.h |  8 +-------
 src/writer.c        | 15 +++++++--------
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/serd_internal.h b/src/serd_internal.h
index 267ef6f6..acd66803 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -340,12 +340,7 @@ utf8_num_bytes(const uint8_t c)
 {
 	if ((c & 0x80) == 0) { // Starts with `0'
 		return 1;
-	}
-
-#ifdef HAVE_BUILTIN_CLZ
-	return __builtin_clz(~c << 24);
-#else
-	if ((c & 0xE0) == 0xC0) { // Starts with `110'
+	} else if ((c & 0xE0) == 0xC0) { // Starts with `110'
 		return 2;
 	} else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
 		return 3;
@@ -353,7 +348,6 @@ utf8_num_bytes(const uint8_t c)
 		return 4;
 	}
 	return 0;
-#endif
 }
 
 /// Return the code point of a UTF-8 character with known length
diff --git a/src/writer.c b/src/writer.c
index a359ee6c..d1f1b87f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -241,9 +241,8 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
 		len += write_character(writer, utf8 + i, &size);
 		i += size;
 		if (size == 0) {
-			// Corrupt input, write replacement char and scan to next start
-			sink(replacement_char, sizeof(replacement_char), writer);
-			for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+			// Corrupt input, scan to start of next character
+			for (++i; i < n_bytes && (utf8[i] & 0x80); ++i) {}
 		}
 	}
 	return len;
@@ -315,7 +314,7 @@ write_text(SerdWriter* writer, TextContext ctx,
 			break; // Reached end
 		}
 
-		uint8_t in = utf8[i++];
+		const uint8_t in = utf8[i++];
 		if (ctx == WRITE_LONG_STRING) {
 			switch (in) {
 			case '\\': len += sink("\\\\", 2, writer); continue;
@@ -349,15 +348,15 @@ write_text(SerdWriter* writer, TextContext ctx,
 			}
 		}
 
+		// Write UTF-8 character
 		size_t size = 0;
 		len += write_character(writer, utf8 + i - 1, &size);
 		if (size == 0) {
-			// Corrupt input, write replacement char and scan to next start
-			sink(replacement_char, sizeof(replacement_char), writer);
+			// Corrupt input, scan to start of next character
 			for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+		} else {
+			i += size - 1;
 		}
-
-		i += size - 1;
 	}
 	return len;
 }
-- 
cgit v1.2.1