Factor out UTF-8 reading utilities

author: David Robillard <d@drobilla.net> 2021-06-30 13:34:31 -0400
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit: 469034ec4ae5c0b5230ca30c40aaa9b1432c13a2 (patch)
tree: b6d2c350e3eebb6a1ce0cdff740a8c488bfbb3bc /src/n3.c
parent: 3d79b6ee36b250644e6cf70eee8e3076d94cbb7f (diff)
download: serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.tar.gz
serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.tar.bz2
serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.zip
1 files changed, 10 insertions, 85 deletions
diff --git a/src/n3.c b/src/n3.c
index 7512cf7e..967f5162 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -3,6 +3,7 @@
 
 #include "namespaces.h"
 #include "node.h"
+#include "read_utf8.h"
 #include "reader.h"
 #include "stack.h"
 #include "string_utils.h"
@@ -164,85 +165,6 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest)
   }
 }
 
-static SerdStatus
-bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
-{
-  // Skip bytes until the next start byte
-  for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
-    skip_byte(reader, b);
-    b = peek_byte(reader);
-  }
-
-  r_err(reader, SERD_BAD_SYNTAX, fmt, c);
-  return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
-}
-
-static SerdStatus
-read_utf8_bytes(SerdReader* const reader,
-                uint8_t           bytes[4],
-                uint32_t* const   size,
-                const uint8_t     c)
-{
-  *size = utf8_num_bytes(c);
-  if (*size <= 1 || *size > 4) {
-    return bad_char(reader, "invalid UTF-8 start 0x%X", c);
-  }
-
-  bytes[0] = c;
-  for (unsigned i = 1; i < *size; ++i) {
-    const int b = peek_byte(reader);
-    if (b == EOF || ((uint8_t)b & 0x80) == 0) {
-      return bad_char(reader, "invalid UTF-8 continuation 0x%X", (uint8_t)b);
-    }
-
-    bytes[i] = (uint8_t)eat_byte_safe(reader, b);
-  }
-
-  return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_utf8_character(SerdReader* const reader,
-                    SerdNode* const   dest,
-                    const uint8_t     c)
-{
-  uint32_t   size     = 0;
-  uint8_t    bytes[4] = {0, 0, 0, 0};
-  SerdStatus st       = read_utf8_bytes(reader, bytes, &size, c);
-
-  if (!tolerate_status(reader, st)) {
-    return st;
-  }
-
-  if (st) {
-    const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
-    return rst ? rst : st;
-  }
-
-  return push_bytes(reader, dest, bytes, size);
-}
-
-static SerdStatus
-read_utf8_code(SerdReader* const reader,
-               SerdNode* const   dest,
-               uint32_t* const   code,
-               const uint8_t     c)
-{
-  uint32_t   size     = 0;
-  uint8_t    bytes[4] = {0, 0, 0, 0};
-  SerdStatus st       = read_utf8_bytes(reader, bytes, &size, c);
-  if (st) {
-    const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
-    return rst ? rst : st;
-  }
-
-  if (!(st = push_bytes(reader, dest, bytes, size))) {
-    *code = parse_counted_utf8_char(bytes, size);
-  }
-
-  return st;
-}
-
 // Read one character (possibly multi-byte)
 // The first byte, c, has already been eaten by caller
 static SerdStatus
@@ -264,7 +186,8 @@ read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
 
     return push_byte(reader, dest, c);
   }
-  return read_utf8_character(reader, dest, c);
+
+  return read_utf8_continuation(reader, dest, c);
 }
 
 // [10] comment ::= '#' ( [^#xA #xD] )*
@@ -470,8 +393,9 @@ read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
     return SERD_FAILURE;
   }
 
-  skip_byte(reader, c);
-  TRY(st, read_utf8_code(reader, dest, &code, (uint8_t)c));
+  if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
+    return st;
+  }
 
   if (!is_PN_CHARS_BASE(code)) {
     r_err(reader, SERD_BAD_SYNTAX, "invalid character U+%04X in name", code);
@@ -505,8 +429,9 @@ read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
     return SERD_FAILURE;
   }
 
-  skip_byte(reader, c);
-  TRY(st, read_utf8_code(reader, dest, &code, (uint8_t)c));
+  if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
+    return st;
+  }
 
   if (!is_PN_CHARS(code)) {
     return r_err(
@@ -797,7 +722,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
       } else if (!(c & 0x80)) {
         st = push_byte(reader, *dest, c);
       } else {
-        st = read_utf8_character(reader, *dest, (uint8_t)c);
+        st = read_utf8_continuation(reader, *dest, (uint8_t)c);
       }
     }
   }
author	David Robillard <d@drobilla.net>	2021-06-30 13:34:31 -0400
committer	David Robillard <d@drobilla.net>	2023-12-02 18:49:07 -0500
commit	469034ec4ae5c0b5230ca30c40aaa9b1432c13a2 (patch)
tree	b6d2c350e3eebb6a1ce0cdff740a8c488bfbb3bc /src/n3.c
parent	3d79b6ee36b250644e6cf70eee8e3076d94cbb7f (diff)
download	serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.tar.gz serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.tar.bz2 serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.zip