about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2017-08-29 11:51:37 +0200
committer David Robillard <d@drobilla.net> 2017-08-29 12:01:09 +0200
commit 1423442a9a34c93874ca6896a7b037bf08569aa5 (patch)
tree e751956e35471e3bdefdaa6a209f2cbd8c715126 /src
parent 2976016031592d98a1277a2679d32af9024241dd (diff)
download serd-1423442a9a34c93874ca6896a7b037bf08569aa5.tar.gz
serd-1423442a9a34c93874ca6896a7b037bf08569aa5.tar.bz2
serd-1423442a9a34c93874ca6896a7b037bf08569aa5.zip
Fix writing of corrupt UTF-8
Diffstat (limited to 'src')
-rw-r--r-- src/serd_internal.h 8
-rw-r--r-- src/writer.c 15
2 files changed, 8 insertions, 15 deletions
diff --git a/src/serd_internal.h b/src/serd_internal.h
index 267ef6f6..acd66803 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -340,12 +340,7 @@ utf8_num_bytes(const uint8_t c)
{
if ((c & 0x80) == 0) { // Starts with `0'
return 1;
- }
-
-#ifdef HAVE_BUILTIN_CLZ
- return __builtin_clz(~c << 24);
-#else
- if ((c & 0xE0) == 0xC0) { // Starts with `110'
+ } else if ((c & 0xE0) == 0xC0) { // Starts with `110'
return 2;
} else if ((c & 0xF0) == 0xE0) { // Starts with `1110'
return 3;
@@ -353,7 +348,6 @@ utf8_num_bytes(const uint8_t c)
return 4;
}
return 0;
-#endif
}
/// Return the code point of a UTF-8 character with known length
diff --git a/src/writer.c b/src/writer.c
index a359ee6c..d1f1b87f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -241,9 +241,8 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
len += write_character(writer, utf8 + i, &size);
i += size;
if (size == 0) {
- // Corrupt input, write replacement char and scan to next start
- sink(replacement_char, sizeof(replacement_char), writer);
- for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+ // Corrupt input, scan to start of next character
+ for (++i; i < n_bytes && (utf8[i] & 0x80); ++i) {}
}
}
return len;
@@ -315,7 +314,7 @@ write_text(SerdWriter* writer, TextContext ctx,
break; // Reached end
}
- uint8_t in = utf8[i++];
+ const uint8_t in = utf8[i++];
if (ctx == WRITE_LONG_STRING) {
switch (in) {
case '\\': len += sink("\\\\", 2, writer); continue;
@@ -349,15 +348,15 @@ write_text(SerdWriter* writer, TextContext ctx,
}
}
+ // Write UTF-8 character
size_t size = 0;
len += write_character(writer, utf8 + i - 1, &size);
if (size == 0) {
- // Corrupt input, write replacement char and scan to next start
- sink(replacement_char, sizeof(replacement_char), writer);
+ // Corrupt input, scan to start of next character
for (; i < n_bytes && (utf8[i] & 0x80); ++i) {}
+ } else {
+ i += size - 1;
}
-
- i += size - 1;
}
return len;
}