From ea4b6e7d109ae3abc7f8ecdf99e3eb33e3484b77 Mon Sep 17 00:00:00 2001
From: David Robillard <d@drobilla.net>
Date: Sun, 5 Feb 2023 12:42:52 -0500
Subject: Strengthen handling of corrupt UTF-8 input

---
 src/read_utf8.c    | 15 ++-------------
 src/string_utils.h | 20 ++++++++++++++++++++
 src/writer.c       |  4 ++--
 3 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/read_utf8.c b/src/read_utf8.c
index fb8ed0e2..f86bbeba 100644
--- a/src/read_utf8.c
+++ b/src/read_utf8.c
@@ -10,22 +10,11 @@
 
 #define MAX_UTF8_BYTES 4U
 
-static SerdStatus
-skip_invalid_utf8(SerdReader* const reader)
-{
-  for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
-    skip_byte(reader, b);
-    b = peek_byte(reader);
-  }
-
-  return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
-}
-
 static SerdStatus
 bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
 {
   r_err(reader, SERD_BAD_SYNTAX, fmt, c);
-  return skip_invalid_utf8(reader);
+  return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
 }
 
 static SerdStatus
@@ -48,7 +37,7 @@ read_utf8_continuation_bytes(SerdReader* const reader,
     }
 
     const uint8_t byte = (uint8_t)b;
-    if (!(byte & 0x80U)) {
+    if (!is_utf8_continuation(byte)) {
       return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
     }
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 8f7ea083..9de03fa0 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -48,6 +48,26 @@ is_xdigit(const int c)
   return is_hexdig(c) || in_range(c, 'a', 'f');
 }
 
+/** UTF-8: Leading bytes are 0xxxxxxx, 110xxxxx, 1110xxxx, or 11110xxx */
+static inline bool
+is_utf8_leading(const uint8_t c)
+{
+  static const uint8_t m1 = 0x80U; // 10000000: high bit, clear for 1-byte
+  static const uint8_t m2 = 0xC0U; // 11000000: 2-byte lead pattern
+  static const uint8_t m3 = 0xE0U; // 11100000: 2-byte mask, 3-byte pattern
+  static const uint8_t m4 = 0xF0U; // 11110000: 3-byte mask, 4-byte pattern
+  static const uint8_t m5 = 0xF8U; // 11111000: 4-byte mask
+
+  return (c & m1) == 0U || (c & m3) == m2 || (c & m4) == m3 || (c & m5) == m4;
+}
+
+/** UTF-8: Continuation bytes have the form 10xxxxxx (top two bits are 10) */
+static inline bool
+is_utf8_continuation(const uint8_t c)
+{
+  return (c & 0xC0U) == 0x80U;
+}
+
 static inline bool
 is_space(const char c)
 {
diff --git a/src/writer.c b/src/writer.c
index be199af4..482721f9 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -369,7 +369,7 @@ write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
     if (size == 0) {
       // Corrupt input, write percent-encoded bytes and scan to next start
       char escape[4] = {0, 0, 0, 0};
-      for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
+      for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
         snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]);
         len += sink(escape, 3, writer);
       }
@@ -590,7 +590,7 @@ write_text(SerdWriter* writer,
       if (size == 0) {
         // Corrupt input, write replacement character and scan to the next start
         st = esink(replacement_char, sizeof(replacement_char), writer);
-        for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
+        for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
         }
       } else {
         i += size - 1U;
-- 
cgit v1.2.1