diff options
Diffstat (limited to 'src/string_utils.h')
-rw-r--r-- | src/string_utils.h | 28 |
1 files changed, 26 insertions, 2 deletions
diff --git a/src/string_utils.h b/src/string_utils.h
index 5eeabc6b..a411b90d 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -152,20 +152,44 @@ utf8_num_bytes(const uint8_t leading)
   return lengths[leading >> 3u];
 }
 
+static inline unsigned
+utf8_num_bytes_for_codepoint(const uint32_t code)
+{
+  if (code < 0x00000080) {
+    return 1u;
+  }
+
+  if (code < 0x00000800) {
+    return 2u;
+  }
+
+  if (code < 0x00010000) {
+    return 3u;
+  }
+
+  if (code < 0x00110000) {
+    return 4u;
+  }
+
+  return 0u; // Out of range
+}
+
 /// Return the code point of a UTF-8 character with known length
 static inline uint32_t
-parse_counted_utf8_char(const uint8_t* utf8, size_t size)
+parse_counted_utf8_char(const uint8_t* const utf8, const size_t size)
 {
   uint32_t c = utf8[0] & ((1u << (8u - size)) - 1u);
+
   for (size_t i = 1; i < size; ++i) {
     c = (c << 6) | (utf8[i] & 0x3Fu);
   }
+
   return c;
 }
 
 /// Parse a UTF-8 character, set *size to the length, and return the code point
 static inline uint32_t
-parse_utf8_char(const uint8_t* utf8, size_t* size)
+parse_utf8_char(const uint8_t* const utf8, size_t* const size)
 {
   switch (*size = utf8_num_bytes(utf8[0])) {
   case 1: