3 files changed, 153 insertions, 86 deletions
diff --git a/src/n3.c b/src/n3.c
index e2a7a3a8..582beae4 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -18,6 +18,7 @@
 #include "env.h"
 #include "namespaces.h"
 #include "node.h"
+#include "read_utf8.h"
 #include "reader.h"
 #include "stack.h"
 #include "string_utils.h"
@@ -178,86 +179,6 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest)
   }
 }
 
-static SerdStatus
-bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
-{
-  // Skip bytes until the next start byte
-  for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
-    eat_byte_safe(reader, b);
-    b = peek_byte(reader);
-  }
-
-  r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
-  return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE;
-}
-
-static SerdStatus
-read_utf8_bytes(SerdReader* const reader,
-                uint8_t           bytes[4],
-                uint32_t* const   size,
-                const uint8_t     c)
-{
-  *size = utf8_num_bytes(c);
-  if (*size <= 1 || *size > 4) {
-    return bad_char(reader, "invalid UTF-8 start 0x%X", c);
-  }
-
-  bytes[0] = c;
-  for (unsigned i = 1; i < *size; ++i) {
-    const int b = peek_byte(reader);
-    if (b == EOF || ((uint8_t)b & 0x80) == 0) {
-      return bad_char(reader, "invalid UTF-8 continuation 0x%X", (uint8_t)b);
-    }
-
-    eat_byte_safe(reader, b);
-    bytes[i] = (uint8_t)b;
-  }
-
-  return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_utf8_character(SerdReader* const reader,
-                    SerdNode* const   dest,
-                    const uint8_t     c)
-{
-  uint32_t   size     = 0;
-  uint8_t    bytes[4] = {0, 0, 0, 0};
-  SerdStatus st       = read_utf8_bytes(reader, bytes, &size, c);
-
-  if (!tolerate_status(reader, st)) {
-    return st;
-  }
-
-  if (st) {
-    const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
-    return rst ? rst : st;
-  }
-
-  return push_bytes(reader, dest, bytes, size);
-}
-
-static SerdStatus
-read_utf8_code(SerdReader* const reader,
-               SerdNode* const   dest,
-               uint32_t* const   code,
-               const uint8_t     c)
-{
-  uint32_t   size     = 0;
-  uint8_t    bytes[4] = {0, 0, 0, 0};
-  SerdStatus st       = read_utf8_bytes(reader, bytes, &size, c);
-  if (st) {
-    const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
-    return rst ? rst : st;
-  }
-
-  if (!(st = push_bytes(reader, dest, bytes, size))) {
-    *code = parse_counted_utf8_char(bytes, size);
-  }
-
-  return st;
-}
-
 // Read one character (possibly multi-byte)
 // The first byte, c, has already been eaten by caller
 static SerdStatus
@@ -279,7 +200,8 @@ read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
 
     return push_byte(reader, dest, c);
   }
-  return read_utf8_character(reader, dest, c);
+
+  return read_utf8_continuation(reader, dest, c);
 }
 
 // [10] comment ::= '#' ( [^#xA #xD] )*
@@ -472,8 +394,7 @@ read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
     st = push_byte(reader, dest, eat_byte_safe(reader, c));
   } else if (c == EOF || !(c & 0x80)) {
     return SERD_FAILURE;
-  } else if ((st = read_utf8_code(
-                reader, dest, &code, (uint8_t)eat_byte_safe(reader, c)))) {
+  } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
     return st;
   } else if (!is_PN_CHARS_BASE(code)) {
     r_err(
@@ -502,8 +423,7 @@ read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
     st = push_byte(reader, dest, eat_byte_safe(reader, c));
   } else if (c == EOF || !(c & 0x80)) {
     return SERD_FAILURE;
-  } else if ((st = read_utf8_code(
-                reader, dest, &code, (uint8_t)eat_byte_safe(reader, c)))) {
+  } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
     return st;
   } else if (!is_PN_CHARS(code)) {
     return r_err(
@@ -861,7 +781,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
       } else if (!(c & 0x80)) {
         st = push_byte(reader, *dest, c);
       } else {
-        st = read_utf8_character(reader, *dest, (uint8_t)c);
+        st = read_utf8_continuation(reader, *dest, (uint8_t)c);
       }
     }
   }
diff --git a/src/read_utf8.c b/src/read_utf8.c
new file mode 100644
index 00000000..614ea14f
--- /dev/null
+++ b/src/read_utf8.c
@@ -0,0 +1,112 @@
+/*
+  Copyright 2011-2021 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "read_utf8.h"
+#include "reader.h"
+
+#include "string_utils.h"
+
+#include <stdio.h>
+
+static SerdStatus
+skip_invalid_utf8(SerdReader* const reader)
+{
+  // Discard bytes with the high bit set until EOF or a 7-bit byte is next
+  int next = 0;
+  while ((next = peek_byte(reader)) != EOF && ((uint8_t)next & 0x80)) {
+    eat_byte_safe(reader, next);
+  }
+  return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE;
+}
+
+static SerdStatus
+bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
+{
+  r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); // Report c via fmt
+  return skip_invalid_utf8(reader); // Then skip the following high-bit bytes
+}
+
+// Read the bytes that follow lead into bytes[1..] and set size to the total.
+// Returns SERD_SUCCESS, or the status from bad_char()/r_err() on bad input.
+static SerdStatus
+read_utf8_continuation_bytes(SerdReader* const reader,
+                             uint8_t           bytes[4],
+                             uint32_t* const   size,
+                             const uint8_t     lead)
+{
+  *size = utf8_num_bytes(lead);
+  if (*size < 1 || *size > 4) {
+    return bad_char(reader, "0x%X is not a UTF-8 leading byte", lead);
+  }
+
+  bytes[0] = lead;
+  for (uint32_t i = 1u; i < *size; ++i) {
+    const int b = peek_byte(reader);
+    if (b == EOF) {
+      return r_err(reader, SERD_ERR_NO_DATA, "unexpected end of input");
+    }
+    // A continuation byte is 10xxxxxx, so a set high bit alone is not enough
+    const uint8_t byte = (uint8_t)b;
+    if ((byte & 0xC0u) != 0x80u) {
+      return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
+    }
+    eat_byte_safe(reader, b); // Consume the byte only once it is accepted
+    bytes[i] = byte;
+  }
+
+  return SERD_SUCCESS;
+}
+
+SerdStatus
+read_utf8_continuation(SerdReader* const reader,
+                       SerdNode* const   dest,
+                       const uint8_t     lead)
+{
+  uint8_t  buf[4] = {0, 0, 0, 0};
+  uint32_t len    = 0;
+
+  const SerdStatus st = read_utf8_continuation_bytes(reader, buf, &len, lead);
+  if (!st) {
+    return push_bytes(reader, dest, buf, len); // Valid: push the bytes as read
+  }
+  // Invalid: fail if strict, otherwise push the replacement character instead
+  return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3);
+}
+
+SerdStatus
+read_utf8_code_point(SerdReader* const reader,
+                     SerdNode* const   dest,
+                     uint32_t* const   code,
+                     const uint8_t     lead)
+{
+  uint8_t  buf[4] = {lead, 0u, 0u, 0u};
+  uint32_t len    = 0u;
+
+  *code = 0u; // Stays zero unless a character is decoded below
+  eat_byte_safe(reader, lead); // Consume lead before reading what follows it
+
+  SerdStatus st = read_utf8_continuation_bytes(reader, buf, &len, lead);
+  if (st) {
+    return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3);
+  }
+
+  st = push_bytes(reader, dest, buf, len);
+  if (!st) {
+    *code = parse_counted_utf8_char(buf, len);
+  }
+
+  return st;
+}
diff --git a/src/read_utf8.h b/src/read_utf8.h
new file mode 100644
index 00000000..eb78be74
--- /dev/null
+++ b/src/read_utf8.h
@@ -0,0 +1,35 @@
+/*
+  Copyright 2011-2021 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef SERD_READ_UTF8_H
+#define SERD_READ_UTF8_H
+
+#include "serd/serd.h"
+
+#include <stdint.h>
+
+/// Push the UTF-8 character that starts with the already-eaten lead to dest
+SerdStatus
+read_utf8_continuation(SerdReader* reader, SerdNode* dest, uint8_t lead);
+
+/// Eat lead, push its UTF-8 character to dest, and set code to its code point
+SerdStatus
+read_utf8_code_point(SerdReader* reader,
+                     SerdNode*   dest,
+                     uint32_t*   code,
+                     uint8_t     lead);
+
+#endif // SERD_READ_UTF8_H