diff options
author | David Robillard <d@drobilla.net> | 2017-06-30 11:08:21 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2017-06-30 11:09:01 -0400 |
commit | 76e9a6530fd52070ae5c1784dcbd5e2929c7972d (patch) | |
tree | 4d3d65c25458d78aab5e0b5d9bba35db32bcc043 /src | |
parent | 83e06e802c65bcc810483992838ac05fd173aae7 (diff) | |
download | serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.tar.gz serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.tar.bz2 serd-76e9a6530fd52070ae5c1784dcbd5e2929c7972d.zip |
Clean up UTF-8 parsing and use CLZ if available
Diffstat (limited to 'src')
-rw-r--r-- | src/writer.c | 56 |
1 files changed, 29 insertions, 27 deletions
```diff
diff --git a/src/writer.c b/src/writer.c
index 6d9055e8..63b8d5af 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -158,42 +158,44 @@ sink(const void* buf, size_t len, SerdWriter* writer)
 	return serd_byte_sink_write(buf, len, &writer->byte_sink);
 }
 
-// Parse a UTF-8 character, set *size to the length, and return the code point
+// Return the number of bytes in a UTF-8 character
 static inline uint32_t
-parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
+utf8_num_bytes(const uint8_t* utf8)
 {
-	uint32_t c = 0;
 	if ((utf8[0] & 0x80) == 0) { // Starts with `0'
-		*size = 1;
-		c = utf8[0];
-	} else if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110'
-		*size = 2;
-		c = utf8[0] & 0x1F;
+		return 1;
+	}
+
+#ifdef HAVE_BUILTIN_CLZ
+	return __builtin_clz(~utf8[0] << 24);
+#else
+	if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110'
+		return 2;
 	} else if ((utf8[0] & 0xF0) == 0xE0) { // Starts with `1110'
-		*size = 3;
-		c = utf8[0] & 0x0F;
+		return 3;
 	} else if ((utf8[0] & 0xF8) == 0xF0) { // Starts with `11110'
-		*size = 4;
-		c = utf8[0] & 0x07;
-	} else {
-		w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[0]);
-		*size = 0;
-		return 0;
+		return 4;
 	}
+	return 0;
+#endif
+}
 
-	size_t i = 0;
-	uint8_t in = utf8[i++];
-
-#define READ_BYTE() \
-	in = utf8[i++] & 0x3F; \
-	c = (c << 6) | in;
-
-	switch (*size) {
-	case 4: READ_BYTE();
-	case 3: READ_BYTE();
-	case 2: READ_BYTE();
+// Parse a UTF-8 character, set *size to the length, and return the code point
+static inline uint32_t
+parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
+{
+	switch (*size = utf8_num_bytes(utf8)) {
+	case 1: case 2: case 3: case 4:
+		break;
+	default:
+		return 0;
 	}
 
+	uint32_t c = utf8[0] & ((1 << (8 - *size)) - 1);
+	for (size_t i = 1; i < *size; ++i) {
+		const uint8_t in = utf8[i] & 0x3F;
+		c = (c << 6) | in;
+	}
 	return c;
 }
 
```