aboutsummaryrefslogtreecommitdiffstats
path: root/src/read_utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/read_utf8.c')
-rw-r--r-- src/read_utf8.c 112
1 files changed, 112 insertions, 0 deletions
diff --git a/src/read_utf8.c b/src/read_utf8.c
new file mode 100644
index 00000000..614ea14f
--- /dev/null
+++ b/src/read_utf8.c
@@ -0,0 +1,112 @@
+/*
+ Copyright 2011-2021 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "read_utf8.h"
+#include "reader.h"
+
+#include "string_utils.h"
+
+#include <stdio.h>
+
+/**
+   Consume input bytes for as long as the next byte has its high bit set.
+
+   Stops at end of input or at the first byte below 0x80, which is left
+   unread.
+
+   @param reader Reader to peek and consume bytes from.
+   @return SERD_ERR_BAD_SYNTAX if the reader is strict, otherwise SERD_FAILURE.
+*/
+static SerdStatus
+skip_invalid_utf8(SerdReader* const reader)
+{
+  int next = peek_byte(reader);
+
+  while (next != EOF && ((uint8_t)next & 0x80)) {
+    eat_byte_safe(reader, next);
+    next = peek_byte(reader);
+  }
+
+  if (reader->strict) {
+    return SERD_ERR_BAD_SYNTAX;
+  }
+
+  return SERD_FAILURE;
+}
+
+/**
+   Report a bad byte as a syntax error, then skip past the invalid input.
+
+   @param reader Reader to report the error on and skip bytes in.
+   @param fmt Error message format, given `c` as its only argument.
+   @param c The offending byte.
+   @return The status of skip_invalid_utf8().
+*/
+static SerdStatus
+bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
+{
+  // Only log here: the status handed back comes from the skip below
+  (void)r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c);
+
+  const SerdStatus skip_st = skip_invalid_utf8(reader);
+  return skip_st;
+}
+
+/**
+   Read the continuation bytes of a UTF-8 character with a known lead byte.
+
+   The lead byte itself is not consumed here, it is only stored in `bytes[0]`.
+   Each continuation byte is peeked, validated, and consumed only if valid.
+
+   @param reader Reader to peek and consume bytes from.
+   @param bytes Output buffer; receives the lead byte followed by the
+   continuation bytes.
+   @param size Set to the result of utf8_num_bytes() for `lead`, even when
+   an error is returned.
+   @param lead The leading byte of the character.
+
+   @return SERD_SUCCESS if every byte was read, the status of r_err() with
+   SERD_ERR_NO_DATA if the input ends early, or the status of bad_char() if
+   `lead` is not a valid leading byte or a following byte is not a
+   continuation byte.
+*/
+static SerdStatus
+read_utf8_continuation_bytes(SerdReader* const reader,
+                             uint8_t           bytes[4],
+                             uint32_t* const   size,
+                             const uint8_t     lead)
+{
+  *size = utf8_num_bytes(lead);
+  if (*size < 1 || *size > 4) {
+    return bad_char(reader, "0x%X is not a UTF-8 leading byte", lead);
+  }
+
+  bytes[0] = lead;
+
+  for (uint32_t i = 1u; i < *size; ++i) {
+    const int b = peek_byte(reader);
+    if (b == EOF) {
+      return r_err(reader, SERD_ERR_NO_DATA, "unexpected end of input");
+    }
+
+    // A continuation byte must match 10xxxxxx.  Testing only the high bit
+    // would also accept 0xC0..0xFF, which are never continuation bytes.
+    const uint8_t byte = (uint8_t)b;
+    if ((byte & 0xC0u) != 0x80u) {
+      return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
+    }
+
+    eat_byte_safe(reader, b);
+    bytes[i] = byte;
+  }
+
+  return SERD_SUCCESS;
+}
+
+/**
+   Read the rest of a UTF-8 character after `lead` and append it to `dest`.
+
+   This function does not consume `lead` itself.  If the character cannot
+   be read, a strict reader returns the error, while a non-strict reader
+   appends the first 3 bytes of `replacement_char` instead.
+
+   @param reader Reader to consume input from and push bytes with.
+   @param dest Node that receives the bytes.
+   @param lead The leading byte of the character.
+   @return The status of the push, or the read error in strict mode.
+*/
+SerdStatus
+read_utf8_continuation(SerdReader* const reader,
+                       SerdNode* const   dest,
+                       const uint8_t     lead)
+{
+  uint8_t  buf[4]  = {0, 0, 0, 0};
+  uint32_t n_bytes = 0;
+
+  const SerdStatus read_st =
+    read_utf8_continuation_bytes(reader, buf, &n_bytes, lead);
+
+  if (!read_st) {
+    return push_bytes(reader, dest, buf, n_bytes);
+  }
+
+  if (reader->strict) {
+    return read_st;
+  }
+
+  // Lax mode: substitute the replacement sequence for the bad input
+  return push_bytes(reader, dest, replacement_char, 3);
+}
+
+/**
+   Consume a UTF-8 character, append it to `dest`, and decode its code point.
+
+   This function consumes `lead` from the input before reading the
+   continuation bytes.  `*code` is reset to zero first and only set to the
+   decoded value when the character is both read and pushed successfully.
+   If the character cannot be read, a strict reader returns the error, while
+   a non-strict reader appends the first 3 bytes of `replacement_char`.
+
+   @param reader Reader to consume input from and push bytes with.
+   @param dest Node that receives the bytes.
+   @param code Set to the decoded code point, or zero on any failure.
+   @param lead The leading byte of the character, consumed here.
+   @return The status of the push, or the read error in strict mode.
+*/
+SerdStatus
+read_utf8_code_point(SerdReader* const reader,
+                     SerdNode* const   dest,
+                     uint32_t* const   code,
+                     const uint8_t     lead)
+{
+  uint8_t  buf[4]  = {lead, 0u, 0u, 0u};
+  uint32_t n_bytes = 0u;
+
+  *code = 0u;
+
+  eat_byte_safe(reader, lead);
+
+  const SerdStatus read_st =
+    read_utf8_continuation_bytes(reader, buf, &n_bytes, lead);
+
+  if (read_st) {
+    if (reader->strict) {
+      return read_st;
+    }
+
+    // Lax mode: substitute the replacement sequence, leaving *code at zero
+    return push_bytes(reader, dest, replacement_char, 3);
+  }
+
+  const SerdStatus push_st = push_bytes(reader, dest, buf, n_bytes);
+  if (push_st) {
+    return push_st;
+  }
+
+  *code = parse_counted_utf8_char(buf, n_bytes);
+  return push_st;
+}