aboutsummaryrefslogtreecommitdiffstats
path: root/src/string_utils.h
diff options
context:
space:
mode:
author: David Robillard <d@drobilla.net> 2023-02-05 12:42:52 -0500
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:08 -0500
commit: ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77 (patch)
tree: 9b050faf6286c055d2fc78729eb4b56a12e3746c /src/string_utils.h
parent: d35082a57adac79703f2c9bb72da468172a209c5 (diff)
download: serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.tar.gz
serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.tar.bz2
serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.zip
Strengthen handling of corrupt UTF-8 input
Diffstat (limited to 'src/string_utils.h')
-rw-r--r-- src/string_utils.h 20
1 files changed, 20 insertions, 0 deletions
diff --git a/src/string_utils.h b/src/string_utils.h
index 8f7ea083..9de03fa0 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -48,6 +48,26 @@ is_xdigit(const int c)
return is_hexdig(c) || in_range(c, 'a', 'f');
}
/**
   Return true if `c` has the bit pattern of a UTF-8 leading byte.

   A leading byte is either a single-byte character (top bit clear), or the
   first byte of a multi-byte sequence (two to four 1 bits followed by a 0).

   This only inspects the prefix bits: values such as 0xC0 or 0xF5 match a
   leading pattern and are accepted here, even though they never appear in
   well-formed UTF-8.  Continuation bytes (10xxxxxx) and bytes with five or
   more leading 1 bits are rejected.

   @param c The byte to classify.
   @return True if `c` matches 0xxxxxxx, 110xxxxx, 1110xxxx, or 11110xxx.
*/
static inline bool
is_utf8_leading(const uint8_t c)
{
  // 0xxxxxxx: a complete single-byte character
  if ((c & 0x80U) == 0U) {
    return true;
  }

  // 110xxxxx: start of a two-byte sequence
  if ((c & 0xE0U) == 0xC0U) {
    return true;
  }

  // 1110xxxx: start of a three-byte sequence
  if ((c & 0xF0U) == 0xE0U) {
    return true;
  }

  // 11110xxx: start of a four-byte sequence
  return (c & 0xF8U) == 0xF0U;
}
+
/**
   Return true if `c` has the bit pattern of a UTF-8 continuation byte.

   Continuation bytes are the non-leading bytes of a multi-byte sequence, and
   always have the form 10xxxxxx (that is, 0x80 through 0xBF inclusive).

   @param c The byte to classify.
   @return True if the top two bits of `c` are 1 then 0.
*/
static inline bool
is_utf8_continuation(const uint8_t c)
{
  static const uint8_t prefix_mask = 0xC0U; // 11000000
  static const uint8_t prefix_bits = 0x80U; // 10000000

  return (c & prefix_mask) == prefix_bits;
}
+
static inline bool
is_space(const char c)
{