aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2017-06-30 11:08:21 -0400
committer David Robillard <d@drobilla.net> 2017-06-30 11:09:01 -0400
commit 76e9a6530fd52070ae5c1784dcbd5e2929c7972d (patch)
tree 4d3d65c25458d78aab5e0b5d9bba35db32bcc043 /src
parent 83e06e802c65bcc810483992838ac05fd173aae7 (diff)
download serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.tar.gz
serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.tar.bz2
serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.zip
Clean up UTF-8 parsing and use CLZ if available
Diffstat (limited to 'src')
-rw-r--r-- src/writer.c 56
1 files changed, 29 insertions, 27 deletions
diff --git a/src/writer.c b/src/writer.c
index 6d9055e8..63b8d5af 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -158,42 +158,44 @@ sink(const void* buf, size_t len, SerdWriter* writer)
return serd_byte_sink_write(buf, len, &writer->byte_sink);
}
-// Parse a UTF-8 character, set *size to the length, and return the code point
+// Return the number of bytes in a UTF-8 character
static inline uint32_t
-parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
+utf8_num_bytes(const uint8_t* utf8)
{
- uint32_t c = 0;
if ((utf8[0] & 0x80) == 0) { // Starts with `0'
- *size = 1;
- c = utf8[0];
- } else if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110'
- *size = 2;
- c = utf8[0] & 0x1F;
+ return 1;
+ }
+
+#ifdef HAVE_BUILTIN_CLZ
+ return __builtin_clz(~utf8[0] << 24);
+#else
+ if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110'
+ return 2;
} else if ((utf8[0] & 0xF0) == 0xE0) { // Starts with `1110'
- *size = 3;
- c = utf8[0] & 0x0F;
+ return 3;
} else if ((utf8[0] & 0xF8) == 0xF0) { // Starts with `11110'
- *size = 4;
- c = utf8[0] & 0x07;
- } else {
- w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[0]);
- *size = 0;
- return 0;
+ return 4;
}
+ return 0;
+#endif
+}
- size_t i = 0;
- uint8_t in = utf8[i++];
-
-#define READ_BYTE() \
- in = utf8[i++] & 0x3F; \
- c = (c << 6) | in;
-
- switch (*size) {
- case 4: READ_BYTE();
- case 3: READ_BYTE();
- case 2: READ_BYTE();
+// Parse a UTF-8 character, set *size to the length, and return the code point
+static inline uint32_t
+parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
+{
+ switch (*size = utf8_num_bytes(utf8)) {
+ case 1: case 2: case 3: case 4:
+ break;
+ default:
+ return 0;
}
+ uint32_t c = utf8[0] & ((1 << (8 - *size)) - 1);
+ for (size_t i = 1; i < *size; ++i) {
+ const uint8_t in = utf8[i] & 0x3F;
+ c = (c << 6) | in;
+ }
return c;
}