diff options
author | David Robillard <d@drobilla.net> | 2023-02-05 12:42:52 -0500 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77 (patch) | |
tree | 9b050faf6286c055d2fc78729eb4b56a12e3746c /src | |
parent | d35082a57adac79703f2c9bb72da468172a209c5 (diff) | |
download | serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.tar.gz serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.tar.bz2 serd-ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77.zip |
Strengthen handling of corrupt UTF-8 input
Diffstat (limited to 'src')
-rw-r--r-- | src/read_utf8.c | 15 | ||||
-rw-r--r-- | src/string_utils.h | 20 | ||||
-rw-r--r-- | src/writer.c | 4 |
3 files changed, 24 insertions, 15 deletions
diff --git a/src/read_utf8.c b/src/read_utf8.c
index fb8ed0e2..f86bbeba 100644
--- a/src/read_utf8.c
+++ b/src/read_utf8.c
@@ -11,21 +11,10 @@
 #define MAX_UTF8_BYTES 4U
 
 static SerdStatus
-skip_invalid_utf8(SerdReader* const reader)
-{
-  for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
-    skip_byte(reader, b);
-    b = peek_byte(reader);
-  }
-
-  return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
-}
-
-static SerdStatus
 bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
 {
   r_err(reader, SERD_BAD_SYNTAX, fmt, c);
-  return skip_invalid_utf8(reader);
+  return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
 }
 
 static SerdStatus
@@ -48,7 +37,7 @@ read_utf8_continuation_bytes(SerdReader* const reader,
     }
 
     const uint8_t byte = (uint8_t)b;
-    if (!(byte & 0x80U)) {
+    if (!is_utf8_continuation(byte)) {
       return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
     }
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 8f7ea083..9de03fa0 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -48,6 +48,26 @@ is_xdigit(const int c)
   return is_hexdig(c) || in_range(c, 'a', 'f');
 }
 
+/** UTF-8: Leading bytes start with 0, or two to four 1s followed by a 0 */
+static inline bool
+is_utf8_leading(const uint8_t c)
+{
+  static const uint8_t m1 = 0x80U; // 10000000
+  static const uint8_t m2 = 0xC0U; // 11000000
+  static const uint8_t m3 = 0xE0U; // 11100000
+  static const uint8_t m4 = 0xF0U; // 11110000
+  static const uint8_t m5 = 0xF8U; // 11111000
+
+  return (c & m1) == 0U || (c & m3) == m2 || (c & m4) == m3 || (c & m5) == m4;
+}
+
+/** UTF-8: Continuation bytes start with 10 */
+static inline bool
+is_utf8_continuation(const uint8_t c)
+{
+  return (c & 0xC0U) == 0x80U;
+}
+
 static inline bool
 is_space(const char c)
 {
diff --git a/src/writer.c b/src/writer.c
index be199af4..482721f9 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -369,7 +369,7 @@ write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
     if (size == 0) {
       // Corrupt input, write percent-encoded bytes and scan to next start
       char escape[4] = {0, 0, 0, 0};
-      for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
+      for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
         snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]);
         len += sink(escape, 3, writer);
       }
@@ -590,7 +590,7 @@ write_text(SerdWriter* writer,
     if (size == 0) {
       // Corrupt input, write replacement character and scan to the next start
       st = esink(replacement_char, sizeof(replacement_char), writer);
-      for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
+      for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
       }
     } else {
       i += size - 1U;
|