From 9c29938c172e2423f67925274a18b4f1c1bb42cf Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 9 Jul 2017 20:09:36 +0200 Subject: Factor out UTF-8 character size counting --- src/reader.c | 10 ++-------- src/serd_internal.h | 23 +++++++++++++++++++++++ src/writer.c | 24 +----------------------- 3 files changed, 26 insertions(+), 31 deletions(-) diff --git a/src/reader.c b/src/reader.c index dd7fa8b2..91e0e920 100644 --- a/src/reader.c +++ b/src/reader.c @@ -409,14 +409,8 @@ bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c) static SerdStatus read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) { - unsigned size = 1; - if ((c & 0xE0) == 0xC0) { // Starts with `110' - size = 2; - } else if ((c & 0xF0) == 0xE0) { // Starts with `1110' - size = 3; - } else if ((c & 0xF8) == 0xF0) { // Starts with `11110' - size = 4; - } else { + const uint32_t size = utf8_num_bytes(c); + if (size <= 1 || size > 4) { return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c); } diff --git a/src/serd_internal.h b/src/serd_internal.h index 55f6a6b6..814b9a5b 100644 --- a/src/serd_internal.h +++ b/src/serd_internal.h @@ -325,12 +325,35 @@ is_windows_path(const uint8_t* path) && (path[2] == '/' || path[2] == '\\'); } +/* String utilities */ + size_t serd_substrlen(const uint8_t* str, const size_t len, size_t* n_bytes, SerdNodeFlags* flags); +static inline uint32_t +utf8_num_bytes(const uint8_t c) +{ + if ((c & 0x80) == 0) { // Starts with `0' + return 1; + } + +#ifdef HAVE_BUILTIN_CLZ + return __builtin_clz(~c << 24); +#else + if ((c & 0xE0) == 0xC0) { // Starts with `110' + return 2; + } else if ((c & 0xF0) == 0xE0) { // Starts with `1110' + return 3; + } else if ((c & 0xF8) == 0xF0) { // Starts with `11110' + return 4; + } + return 0; +#endif +} + /* URI utilities */ static inline bool diff --git a/src/writer.c b/src/writer.c index 63b8d5af..c293b4f8 100644 --- a/src/writer.c +++ b/src/writer.c @@ -158,33 +158,11 @@ sink(const void* buf, size_t len, SerdWriter* writer) return serd_byte_sink_write(buf, len, &writer->byte_sink); } -// Return the number of bytes in a UTF-8 character -static inline uint32_t -utf8_num_bytes(const uint8_t* utf8) -{ - if ((utf8[0] & 0x80) == 0) { // Starts with `0' - return 1; - } - -#ifdef HAVE_BUILTIN_CLZ - return __builtin_clz(~utf8[0] << 24); -#else - if ((utf8[0] & 0xE0) == 0xC0) { // Starts with `110' - return 2; - } else if ((utf8[0] & 0xF0) == 0xE0) { // Starts with `1110' - return 3; - } else if ((utf8[0] & 0xF8) == 0xF0) { // Starts with `11110' - return 4; - } - return 0; -#endif -} - // Parse a UTF-8 character, set *size to the length, and return the code point static inline uint32_t parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size) { - switch (*size = utf8_num_bytes(utf8)) { + switch (*size = utf8_num_bytes(utf8[0])) { case 1: case 2: case 3: case 4: break; default: -- cgit v1.2.1