about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2017-06-30 11:08:21 -0400
committer David Robillard <d@drobilla.net> 2017-06-30 11:09:01 -0400
commit 76e9a6530fd52070ae5c1784dcbd5e2929c7972d (patch)
tree 4d3d65c25458d78aab5e0b5d9bba35db32bcc043
parent 83e06e802c65bcc810483992838ac05fd173aae7 (diff)
download serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.tar.gz
serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.tar.bz2
serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.zip
Clean up UTF-8 parsing and use CLZ if available
-rw-r--r-- src/writer.c 56
-rw-r--r-- wscript 5
2 files changed, 34 insertions, 27 deletions
diff --git a/src/writer.c b/src/writer.c
index 6d9055e8..63b8d5af 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -158,42 +158,44 @@ sink(const void* buf, size_t len, SerdWriter* writer)
return serd_byte_sink_write(buf, len, &writer->byte_sink);
}
-// Parse a UTF-8 character, set *size to the length, and return the code point
+// Return the number of bytes in a UTF-8 character
static inline uint32_t
-parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
+utf8_num_bytes(const uint8_t* utf8)
{
- uint32_t c = 0;
if ((utf8[0] & 0x80) == 0) { // Starts with `0'
- *size = 1;
- c = utf8[0];
- } else if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110'
- *size = 2;
- c = utf8[0] & 0x1F;
+ return 1;
+ }
+
+#ifdef HAVE_BUILTIN_CLZ
+ return __builtin_clz(~utf8[0] << 24);
+#else
+ if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110'
+ return 2;
} else if ((utf8[0] & 0xF0) == 0xE0) { // Starts with `1110'
- *size = 3;
- c = utf8[0] & 0x0F;
+ return 3;
} else if ((utf8[0] & 0xF8) == 0xF0) { // Starts with `11110'
- *size = 4;
- c = utf8[0] & 0x07;
- } else {
- w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[0]);
- *size = 0;
- return 0;
+ return 4;
}
+ return 0;
+#endif
+}
- size_t i = 0;
- uint8_t in = utf8[i++];
-
-#define READ_BYTE() \
- in = utf8[i++] & 0x3F; \
- c = (c << 6) | in;
-
- switch (*size) {
- case 4: READ_BYTE();
- case 3: READ_BYTE();
- case 2: READ_BYTE();
+// Parse a UTF-8 character, set *size to the length, and return the code point
+static inline uint32_t
+parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
+{
+ switch (*size = utf8_num_bytes(utf8)) {
+ case 1: case 2: case 3: case 4:
+ break;
+ default:
+ return 0;
}
+ uint32_t c = utf8[0] & ((1 << (8 - *size)) - 1);
+ for (size_t i = 1; i < *size; ++i) {
+ const uint8_t in = utf8[i] & 0x3F;
+ c = (c << 6) | in;
+ }
return c;
}
diff --git a/wscript b/wscript
index b39e43a8..3d60859c 100644
--- a/wscript
+++ b/wscript
@@ -78,6 +78,11 @@ def configure(conf):
defines = ['_POSIX_C_SOURCE=200809L'],
mandatory = False)
+ conf.check(fragment = 'int main() { return __builtin_clz(1); }',
+ function_name = '__builtin_clz',
+ define_name = 'HAVE_BUILTIN_CLZ',
+ mandatory = False)
+
autowaf.define(conf, 'SERD_VERSION', SERD_VERSION)
autowaf.set_lib_env(conf, 'serd', SERD_VERSION)
conf.write_config_header('serd_config.h', remove=False)