about | summary | refs | log | tree | commit | diff | stats
path: root/src/n3.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/n3.c')
-rw-r--r-- src/n3.c 92
1 files changed, 6 insertions, 86 deletions
diff --git a/src/n3.c b/src/n3.c
index e2a7a3a8..582beae4 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -18,6 +18,7 @@
#include "env.h"
#include "namespaces.h"
#include "node.h"
+#include "read_utf8.h"
#include "reader.h"
#include "stack.h"
#include "string_utils.h"
@@ -178,86 +179,6 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest)
}
}
-static SerdStatus
-bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
-{
- // Skip bytes until the next start byte
- for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) {
- eat_byte_safe(reader, b);
- b = peek_byte(reader);
- }
-
- r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
- return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE;
-}
-
-static SerdStatus
-read_utf8_bytes(SerdReader* const reader,
- uint8_t bytes[4],
- uint32_t* const size,
- const uint8_t c)
-{
- *size = utf8_num_bytes(c);
- if (*size <= 1 || *size > 4) {
- return bad_char(reader, "invalid UTF-8 start 0x%X", c);
- }
-
- bytes[0] = c;
- for (unsigned i = 1; i < *size; ++i) {
- const int b = peek_byte(reader);
- if (b == EOF || ((uint8_t)b & 0x80) == 0) {
- return bad_char(reader, "invalid UTF-8 continuation 0x%X", (uint8_t)b);
- }
-
- eat_byte_safe(reader, b);
- bytes[i] = (uint8_t)b;
- }
-
- return SERD_SUCCESS;
-}
-
-static SerdStatus
-read_utf8_character(SerdReader* const reader,
- SerdNode* const dest,
- const uint8_t c)
-{
- uint32_t size = 0;
- uint8_t bytes[4] = {0, 0, 0, 0};
- SerdStatus st = read_utf8_bytes(reader, bytes, &size, c);
-
- if (!tolerate_status(reader, st)) {
- return st;
- }
-
- if (st) {
- const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
- return rst ? rst : st;
- }
-
- return push_bytes(reader, dest, bytes, size);
-}
-
-static SerdStatus
-read_utf8_code(SerdReader* const reader,
- SerdNode* const dest,
- uint32_t* const code,
- const uint8_t c)
-{
- uint32_t size = 0;
- uint8_t bytes[4] = {0, 0, 0, 0};
- SerdStatus st = read_utf8_bytes(reader, bytes, &size, c);
- if (st) {
- const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3);
- return rst ? rst : st;
- }
-
- if (!(st = push_bytes(reader, dest, bytes, size))) {
- *code = parse_counted_utf8_char(bytes, size);
- }
-
- return st;
-}
-
// Read one character (possibly multi-byte)
// The first byte, c, has already been eaten by caller
static SerdStatus
@@ -279,7 +200,8 @@ read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
return push_byte(reader, dest, c);
}
- return read_utf8_character(reader, dest, c);
+
+ return read_utf8_continuation(reader, dest, c);
}
// [10] comment ::= '#' ( [^#xA #xD] )*
@@ -472,8 +394,7 @@ read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest)
st = push_byte(reader, dest, eat_byte_safe(reader, c));
} else if (c == EOF || !(c & 0x80)) {
return SERD_FAILURE;
- } else if ((st = read_utf8_code(
- reader, dest, &code, (uint8_t)eat_byte_safe(reader, c)))) {
+ } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
return st;
} else if (!is_PN_CHARS_BASE(code)) {
r_err(
@@ -502,8 +423,7 @@ read_PN_CHARS(SerdReader* const reader, SerdNode* const dest)
st = push_byte(reader, dest, eat_byte_safe(reader, c));
} else if (c == EOF || !(c & 0x80)) {
return SERD_FAILURE;
- } else if ((st = read_utf8_code(
- reader, dest, &code, (uint8_t)eat_byte_safe(reader, c)))) {
+ } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) {
return st;
} else if (!is_PN_CHARS(code)) {
return r_err(
@@ -861,7 +781,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest)
} else if (!(c & 0x80)) {
st = push_byte(reader, *dest, c);
} else {
- st = read_utf8_character(reader, *dest, (uint8_t)c);
+ st = read_utf8_continuation(reader, *dest, (uint8_t)c);
}
}
}