diff options
author | David Robillard <d@drobilla.net> | 2017-08-29 11:51:37 +0200 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2017-08-29 12:01:09 +0200 |
commit | 1423442a9a34c93874ca6896a7b037bf08569aa5 (patch) | |
tree | e751956e35471e3bdefdaa6a209f2cbd8c715126 | |
parent | 2976016031592d98a1277a2679d32af9024241dd (diff) | |
download | serd-1423442a9a34c93874ca6896a7b037bf08569aa5.tar.gz serd-1423442a9a34c93874ca6896a7b037bf08569aa5.tar.bz2 serd-1423442a9a34c93874ca6896a7b037bf08569aa5.zip |
Fix writing of corrupt UTF-8
-rw-r--r-- | src/serd_internal.h | 8 | ||||
-rw-r--r-- | src/writer.c | 15 | ||||
-rw-r--r-- | wscript | 5 |
3 files changed, 8 insertions, 20 deletions
```diff
diff --git a/src/serd_internal.h b/src/serd_internal.h
index 267ef6f6..acd66803 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -340,12 +340,7 @@ utf8_num_bytes(const uint8_t c)
 {
     if ((c & 0x80) == 0) { // Starts with `0'
         return 1;
-    }
-
-#ifdef HAVE_BUILTIN_CLZ
-    return __builtin_clz(~c << 24);
-#else
-    if ((c & 0xE0) == 0xC0) { // Starts with `110'
+    } else if ((c & 0xE0) == 0xC0) { // Starts with `110'
         return 2;
     } else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
         return 3;
@@ -353,7 +348,6 @@ utf8_num_bytes(const uint8_t c)
         return 4;
     }
     return 0;
-#endif
 }
 
 /// Return the code point of a UTF-8 character with known length
diff --git a/src/writer.c b/src/writer.c
index a359ee6c..d1f1b87f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -241,9 +241,8 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
         len += write_character(writer, utf8 + i, &size);
         i += size;
         if (size == 0) {
-            // Corrupt input, write replacement char and scan to next start
-            sink(replacement_char, sizeof(replacement_char), writer);
-            for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+            // Corrupt input, scan to start of next character
+            for (++i; i < n_bytes && (utf8[i] & 0x80); ++i) {}
         }
     }
     return len;
@@ -315,7 +314,7 @@ write_text(SerdWriter* writer, TextContext ctx,
             break; // Reached end
         }
 
-        uint8_t in = utf8[i++];
+        const uint8_t in = utf8[i++];
         if (ctx == WRITE_LONG_STRING) {
             switch (in) {
             case '\\': len += sink("\\\\", 2, writer); continue;
@@ -349,15 +348,15 @@ write_text(SerdWriter* writer, TextContext ctx,
             }
         }
 
+        // Write UTF-8 character
         size_t size = 0;
         len += write_character(writer, utf8 + i - 1, &size);
         if (size == 0) {
-            // Corrupt input, write replacement char and scan to next start
-            sink(replacement_char, sizeof(replacement_char), writer);
+            // Corrupt input, scan to start of next character
             for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+        } else {
+            i += size - 1;
         }
-
-        i += size - 1;
     }
     return len;
 }
diff --git a/wscript b/wscript
--- a/wscript
+++ b/wscript
@@ -78,11 +78,6 @@ def configure(conf):
                defines = ['_POSIX_C_SOURCE=200809L'],
                mandatory = False)
 
-    conf.check(fragment = 'int main() { return __builtin_clz(1); }',
-               function_name = '__builtin_clz',
-               define_name = 'HAVE_BUILTIN_CLZ',
-               mandatory = False)
-
     autowaf.define(conf, 'SERD_VERSION', SERD_VERSION)
     autowaf.set_lib_env(conf, 'serd', SERD_VERSION)
     conf.write_config_header('serd_config.h', remove=False)
```