From 6033875bcb22169de59126451e227dce1ee2db0c Mon Sep 17 00:00:00 2001
From: David Robillard
Date: Tue, 28 Mar 2023 12:12:15 -0400
Subject: Shrink UTF-8 utility code

I've found that the negative cache impact of the 32-byte lookup table here can
be worse than the simple conditional code in real-world scenarios (even though
it's faster in micro-benchmarks). So, go with the simple (and conveniently more
terse) thing.
---
 src/string_utils.h | 41 +++++------------------------------------
 1 file changed, 5 insertions(+), 36 deletions(-)

(limited to 'src/string_utils.h')

diff --git a/src/string_utils.h b/src/string_utils.h
index ea47c4b8..9ae0abcb 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -110,42 +110,11 @@ serd_strncasecmp(const char* s1, const char* s2, size_t n)
 static inline uint32_t
 utf8_num_bytes(const uint8_t leading)
 {
-  static const uint8_t lengths[32] = {
-    1U, // 00000xxx
-    1U, // 00001xxx
-    1U, // 00010xxx
-    1U, // 00011xxx
-    1U, // 00100xxx
-    1U, // 00101xxx
-    1U, // 00110xxx
-    1U, // 00111xxx
-    1U, // 01000xxx
-    1U, // 01001xxx
-    1U, // 01010xxx
-    1U, // 01011xxx
-    1U, // 01100xxx
-    1U, // 01101xxx
-    1U, // 01110xxx
-    1U, // 01111xxx
-    0U, // 10000xxx
-    0U, // 10001xxx
-    0U, // 10010xxx
-    0U, // 10011xxx
-    0U, // 10100xxx
-    0U, // 10101xxx
-    0U, // 10110xxx
-    0U, // 10111xxx
-    2U, // 11000xxx
-    2U, // 11001xxx
-    2U, // 11010xxx
-    2U, // 11011xxx
-    3U, // 11100xxx
-    3U, // 11101xxx
-    4U, // 11110xxx
-    0U  // 11111xxx
-  };
-
-  return lengths[leading >> 3U];
+  return ((leading & 0x80U) == 0x00U)   ? 1U  // Starts with `0'
+         : ((leading & 0xE0U) == 0xC0U) ? 2U  // Starts with `110'
+         : ((leading & 0xF0U) == 0xE0U) ? 3U  // Starts with `1110'
+         : ((leading & 0xF8U) == 0xF0U) ? 4U  // Starts with `11110'
+                                        : 0U; // Invalid
 }
 
 /// Return the code point of a UTF-8 character with known length
-- 
cgit v1.2.1