about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2021-06-30 13:34:31 -0400
committer David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit 469034ec4ae5c0b5230ca30c40aaa9b1432c13a2 (patch)
tree b6d2c350e3eebb6a1ce0cdff740a8c488bfbb3bc
parent 3d79b6ee36b250644e6cf70eee8e3076d94cbb7f (diff)
download serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.tar.gz
serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.tar.bz2
serd-469034ec4ae5c0b5230ca30c40aaa9b1432c13a2.zip
Factor out UTF-8 reading utilities
-rw-r--r-- meson.build 1
-rw-r--r-- src/n3.c 95
-rw-r--r-- src/read_utf8.c 99
-rw-r--r-- src/read_utf8.h 24
4 files changed, 134 insertions, 85 deletions
diff --git a/meson.build b/meson.build
index c5748d89..09a170e8 100644
--- a/meson.build
+++ b/meson.build
@@ -157,6 +157,7 @@ sources = files(
'src/env.c',
'src/n3.c',
'src/node.c',
+ 'src/read_utf8.c',
'src/reader.c',
'src/sink.c',
'src/statement.c',
diff --git a/src/n3.c b/src/n3.c
index 7512cf7e..967f5162 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -3,6 +3,7 @@
#include "namespaces.h"
#include "node.h"
+#include "read_utf8.h"
#include "reader.h"
#include "stack.h"
#include "string_utils.h"
@@ -164,85 +165,6 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest)
}
}
-static SerdStatus // old n3.c helper: skip ahead in the input, then log a syntax error built from fmt and c
-bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
-{
- // Skip every pending byte that has its high bit set, stopping at EOF or at the first byte without it
- for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
- skip_byte(reader, b);
- b = peek_byte(reader);
- }
- // Log the error (r_err's return value is unused), then choose the status from reader->strict
- r_err(reader, SERD_BAD_SYNTAX, fmt, c);
- return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
-}
-// Old n3.c helper: collect the bytes of the character led by c into bytes and store their count in *size
-static SerdStatus
-read_utf8_bytes(SerdReader* const reader,
- uint8_t bytes[4], // out: c followed by the bytes taken from the reader
- uint32_t* const size, // out: set to utf8_num_bytes(c)
- const uint8_t c) // lead byte; this function does not read it from the input
-{
- *size = utf8_num_bytes(c);
- if (*size <= 1 || *size > 4) { // only sizes 2 through 4 get past this check
- return bad_char(reader, "invalid UTF-8 start 0x%X", c);
- }
- // Peek at each following byte before eating it, so a rejected byte is not consumed by this loop
- bytes[0] = c;
- for (unsigned i = 1; i < *size; ++i) {
- const int b = peek_byte(reader);
- if (b == EOF || ((uint8_t)b & 0x80) == 0) { // EOF or a byte without the high bit ends the character early
- return bad_char(reader, "invalid UTF-8 continuation 0x%X", (uint8_t)b);
- }
-
- bytes[i] = (uint8_t)eat_byte_safe(reader, b);
- }
-
- return SERD_SUCCESS;
-}
-// Old n3.c helper: read the character led by c (via read_utf8_bytes) and append it to dest
-static SerdStatus
-read_utf8_character(SerdReader* const reader,
- SerdNode* const dest,
- const uint8_t c)
-{
- uint32_t size = 0;
- uint8_t bytes[4] = {0, 0, 0, 0};
- SerdStatus st = read_utf8_bytes(reader, bytes, &size, c);
- // tolerate_status() is defined elsewhere; when it returns false, st is returned unchanged
- if (!tolerate_status(reader, st)) {
- return st;
- }
- // A tolerated non-zero status: push 3 bytes of replacement_char instead of the bytes read
- if (st) {
- const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
- return rst ? rst : st; // a push failure takes precedence; otherwise the read status is still returned
- }
-
- return push_bytes(reader, dest, bytes, size);
-}
-// Old n3.c helper: read the character led by c, append it to dest, and store its code point in *code
-static SerdStatus
-read_utf8_code(SerdReader* const reader,
- SerdNode* const dest,
- uint32_t* const code, // out: written only when the bytes were read and pushed successfully
- const uint8_t c)
-{
- uint32_t size = 0;
- uint8_t bytes[4] = {0, 0, 0, 0};
- SerdStatus st = read_utf8_bytes(reader, bytes, &size, c);
- if (st) { // any read error: push 3 bytes of replacement_char, without consulting tolerate_status()
- const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
- return rst ? rst : st; // a push failure takes precedence; otherwise the read status is returned
- }
- // Parse the code point only if the bytes were appended successfully
- if (!(st = push_bytes(reader, dest, bytes, size))) {
- *code = parse_counted_utf8_char(bytes, size);
- }
-
- return st;
-}
-
// Read one character (possibly multi-byte)
// The first byte, c, has already been eaten by caller
static SerdStatus
@@ -264,7 +186,8 @@ read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
return push_byte(reader, dest, c);
}
- return read_utf8_character(reader, dest, c);
+
+ return read_utf8_continuation(reader, dest, c);
}
// [10] comment ::= '#' ( [^#xA #xD] )*
@@ -470,8 +393,9 @@ read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
return SERD_FAILURE;
}
- skip_byte(reader, c);
- TRY(st, read_utf8_code(reader, dest, &code, (uint8_t)c));
+ if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
+ return st;
+ }
if (!is_PN_CHARS_BASE(code)) {
r_err(reader, SERD_BAD_SYNTAX, "invalid character U+%04X in name", code);
@@ -505,8 +429,9 @@ read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
return SERD_FAILURE;
}
- skip_byte(reader, c);
- TRY(st, read_utf8_code(reader, dest, &code, (uint8_t)c));
+ if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
+ return st;
+ }
if (!is_PN_CHARS(code)) {
return r_err(
@@ -797,7 +722,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
} else if (!(c & 0x80)) {
st = push_byte(reader, *dest, c);
} else {
- st = read_utf8_character(reader, *dest, (uint8_t)c);
+ st = read_utf8_continuation(reader, *dest, (uint8_t)c);
}
}
}
diff --git a/src/read_utf8.c b/src/read_utf8.c
new file mode 100644
index 00000000..c6a24778
--- /dev/null
+++ b/src/read_utf8.c
@@ -0,0 +1,99 @@
+// Copyright 2011-2021 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#include "read_utf8.h"
+#include "reader.h"
+
+#include "string_utils.h"
+
+#include <stdio.h>
+// Skip input bytes while the next one has its high bit set (stopping at EOF), then pick the status to report
+static SerdStatus
+skip_invalid_utf8(SerdReader* const reader)
+{
+ for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
+ skip_byte(reader, b);
+ b = peek_byte(reader);
+ }
+ // Readers with the strict flag set get SERD_BAD_SYNTAX; otherwise SERD_FAILURE is returned
+ return reader->strict ? SERD_BAD_SYNTAX : SERD_FAILURE;
+}
+// Log a syntax error formatted from fmt and the byte c, then skip ahead with skip_invalid_utf8()
+static SerdStatus
+bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
+{
+ r_err(reader, SERD_BAD_SYNTAX, fmt, c); // the value returned by r_err is not used here
+ return skip_invalid_utf8(reader); // the returned status comes from skip_invalid_utf8()
+}
+// Collect the bytes of the character that starts with lead: bytes[0] is lead, the rest are taken from the reader
+static SerdStatus
+read_utf8_continuation_bytes(SerdReader* const reader,
+ uint8_t bytes[4], // out: lead followed by the continuation bytes that were read
+ uint32_t* const size, // out: set to utf8_num_bytes(lead)
+ const uint8_t lead) // lead byte; this function does not read it from the input
+{
+ *size = utf8_num_bytes(lead);
+ if (*size < 1 || *size > 4) { // a size of 1 is accepted here; the loop below then reads nothing
+ return bad_char(reader, "0x%X is not a UTF-8 leading byte", lead);
+ }
+
+ bytes[0] = lead;
+ // Peek at each following byte before skipping it, so a rejected byte is not consumed by this loop
+ for (uint32_t i = 1U; i < *size; ++i) {
+ const int b = peek_byte(reader);
+ if (b == EOF) { // end of input goes through r_err directly (presumably yielding SERD_NO_DATA), not bad_char()
+ return r_err(reader, SERD_NO_DATA, "unexpected end of input");
+ }
+
+ const uint8_t byte = (uint8_t)b;
+ if (!(byte & 0x80U)) { // NOTE(review): only the high bit is tested, so any byte >= 0x80 passes, not just 10xxxxxx — confirm intended
+ return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
+ }
+
+ skip_byte(reader, b);
+ bytes[i] = byte;
+ }
+
+ return SERD_SUCCESS;
+}
+// Read the bytes that follow lead (which this function does not consume) and append the character to dest
+SerdStatus
+read_utf8_continuation(SerdReader* const reader,
+ SerdNode* const dest,
+ const uint8_t lead)
+{
+ uint32_t size = 0;
+ uint8_t bytes[8] = {lead, 0U, 0U, 0U, 0U, 0U, 0U, 0U}; // 8 bytes of storage; the helper checks that size is at most 4
+
+ SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead);
+ if (st) { // strict: return the error; otherwise push 3 bytes of replacement_char and return that push status
+ return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3);
+ }
+ // Success: append the size bytes that were collected
+ return push_bytes(reader, dest, bytes, size);
+}
+// Skip lead in the input, read the rest of its character, append it to dest, and store the code point in *code
+SerdStatus
+read_utf8_code_point(SerdReader* const reader,
+ SerdNode* const dest,
+ uint32_t* const code, // out: zeroed on entry; set only after a successful push
+ const uint8_t lead)
+{
+ uint32_t size = 0U;
+ uint8_t bytes[8] = {lead, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+ // Default result, kept on every path that does not reach parse_counted_utf8_char()
+ *code = 0U;
+ // Unlike read_utf8_continuation(), the lead byte is skipped here rather than by the caller
+ skip_byte(reader, lead);
+
+ SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead);
+ if (st) { // strict: return the error; otherwise push 3 bytes of replacement_char and return that push status
+ return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3); // *code is still 0 on this path
+ }
+ // Parse the code point only if the bytes were appended successfully
+ if (!(st = push_bytes(reader, dest, bytes, size))) {
+ *code = parse_counted_utf8_char(bytes, size);
+ }
+
+ return st;
+}
diff --git a/src/read_utf8.h b/src/read_utf8.h
new file mode 100644
index 00000000..ce1be85f
--- /dev/null
+++ b/src/read_utf8.h
@@ -0,0 +1,24 @@
+// Copyright 2011-2021 David Robillard <d@drobilla.net>
+// SPDX-License-Identifier: ISC
+
+#ifndef SERD_SRC_READ_UTF8_H
+#define SERD_SRC_READ_UTF8_H
+
+#include "serd/node.h"
+#include "serd/reader.h"
+#include "serd/status.h"
+
+#include <stdint.h>
+
+/// Read the rest of a UTF-8 character after its lead byte (lead itself is not consumed here) and append it to dest
+SerdStatus // on a read error: that error if reader->strict is set, otherwise the status of pushing replacement_char
+read_utf8_continuation(SerdReader* reader, SerdNode* dest, uint8_t lead);
+
+/// Read a single UTF-8 character (skipping lead in the input first), append it to dest, and parse it to a code point
+SerdStatus // on a read error: that error if reader->strict is set, otherwise the status of pushing replacement_char
+read_utf8_code_point(SerdReader* reader,
+ SerdNode* dest, // node that receives the character's bytes
+ uint32_t* code, // out: zeroed first; set to the parsed code point only on success
+ uint8_t lead); // first byte of the character
+
+#endif // SERD_SRC_READ_UTF8_H