about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2017-08-29 11:51:37 +0200
committer David Robillard <d@drobilla.net> 2017-08-29 12:01:09 +0200
commit 1423442a9a34c93874ca6896a7b037bf08569aa5 (patch)
tree e751956e35471e3bdefdaa6a209f2cbd8c715126
parent 2976016031592d98a1277a2679d32af9024241dd (diff)
download serd-1423442a9a34c93874ca6896a7b037bf08569aa5.tar.gz
serd-1423442a9a34c93874ca6896a7b037bf08569aa5.tar.bz2
serd-1423442a9a34c93874ca6896a7b037bf08569aa5.zip
Fix writing of corrupt UTF-8
-rw-r--r-- src/serd_internal.h 8
-rw-r--r-- src/writer.c 15
-rw-r--r-- wscript 5
3 files changed, 8 insertions, 20 deletions
diff --git a/src/serd_internal.h b/src/serd_internal.h
index 267ef6f6..acd66803 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -340,12 +340,7 @@ utf8_num_bytes(const uint8_t c)
{
if ((c & 0x80) == 0) { // Starts with `0'
return 1;
- }
-
-#ifdef HAVE_BUILTIN_CLZ
- return __builtin_clz(~c << 24);
-#else
- if ((c & 0xE0) == 0xC0) { // Starts with `110'
+ } else if ((c & 0xE0) == 0xC0) { // Starts with `110'
return 2;
} else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
return 3;
@@ -353,7 +348,6 @@ utf8_num_bytes(const uint8_t c)
return 4;
}
return 0;
-#endif
}
/// Return the code point of a UTF-8 character with known length
diff --git a/src/writer.c b/src/writer.c
index a359ee6c..d1f1b87f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -241,9 +241,8 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
len += write_character(writer, utf8 + i, &size);
i += size;
if (size == 0) {
- // Corrupt input, write replacement char and scan to next start
- sink(replacement_char, sizeof(replacement_char), writer);
- for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+ // Corrupt input, scan to start of next character
+ for (++i; i < n_bytes && (utf8[i] & 0x80); ++i) {}
}
}
return len;
@@ -315,7 +314,7 @@ write_text(SerdWriter* writer, TextContext ctx,
break; // Reached end
}
- uint8_t in = utf8[i++];
+ const uint8_t in = utf8[i++];
if (ctx == WRITE_LONG_STRING) {
switch (in) {
case '\\': len += sink("\\\\", 2, writer); continue;
@@ -349,15 +348,15 @@ write_text(SerdWriter* writer, TextContext ctx,
}
}
+ // Write UTF-8 character
size_t size = 0;
len += write_character(writer, utf8 + i - 1, &size);
if (size == 0) {
- // Corrupt input, write replacement char and scan to next start
- sink(replacement_char, sizeof(replacement_char), writer);
+ // Corrupt input, scan to start of next character
for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+ } else {
+ i += size - 1;
}
-
- i += size - 1;
}
return len;
}
diff --git a/wscript b/wscript
index c080b05e..ffd19d10 100644
--- a/wscript
+++ b/wscript
@@ -78,11 +78,6 @@ def configure(conf):
defines = ['_POSIX_C_SOURCE=200809L'],
mandatory = False)
- conf.check(fragment = 'int main() { return __builtin_clz(1); }',
- function_name = '__builtin_clz',
- define_name = 'HAVE_BUILTIN_CLZ',
- mandatory = False)
-
autowaf.define(conf, 'SERD_VERSION', SERD_VERSION)
autowaf.set_lib_env(conf, 'serd', SERD_VERSION)
conf.write_config_header('serd_config.h', remove=False)