diff options
author | David Robillard <d@drobilla.net> | 2021-06-30 13:34:31 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2022-01-14 19:37:51 -0500 |
commit | 840139223c77dba90b5ef92537e6c982b000c196 (patch) | |
tree | a74a28b585a090b58a60d8dcc7dc5095f403974e /src | |
parent | 88e9a65720f64788e99e0ff96d5e3b9e7db94e8c (diff) | |
download | serd-840139223c77dba90b5ef92537e6c982b000c196.tar.gz serd-840139223c77dba90b5ef92537e6c982b000c196.tar.bz2 serd-840139223c77dba90b5ef92537e6c982b000c196.zip |
Factor out UTF-8 reading utilities
Diffstat (limited to 'src')
-rw-r--r-- | src/n3.c | 92 | ||||
-rw-r--r-- | src/read_utf8.c | 112 | ||||
-rw-r--r-- | src/read_utf8.h | 35 |
3 files changed, 153 insertions, 86 deletions
@@ -18,6 +18,7 @@ #include "env.h" #include "namespaces.h" #include "node.h" +#include "read_utf8.h" #include "reader.h" #include "stack.h" #include "string_utils.h" @@ -178,86 +179,6 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest) } } -static SerdStatus -bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c) -{ - // Skip bytes until the next start byte - for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) { - eat_byte_safe(reader, b); - b = peek_byte(reader); - } - - r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); - return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE; -} - -static SerdStatus -read_utf8_bytes(SerdReader* const reader, - uint8_t bytes[4], - uint32_t* const size, - const uint8_t c) -{ - *size = utf8_num_bytes(c); - if (*size <= 1 || *size > 4) { - return bad_char(reader, "invalid UTF-8 start 0x%X", c); - } - - bytes[0] = c; - for (unsigned i = 1; i < *size; ++i) { - const int b = peek_byte(reader); - if (b == EOF || ((uint8_t)b & 0x80) == 0) { - return bad_char(reader, "invalid UTF-8 continuation 0x%X", (uint8_t)b); - } - - eat_byte_safe(reader, b); - bytes[i] = (uint8_t)b; - } - - return SERD_SUCCESS; -} - -static SerdStatus -read_utf8_character(SerdReader* const reader, - SerdNode* const dest, - const uint8_t c) -{ - uint32_t size = 0; - uint8_t bytes[4] = {0, 0, 0, 0}; - SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); - - if (!tolerate_status(reader, st)) { - return st; - } - - if (st) { - const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3); - return rst ? rst : st; - } - - return push_bytes(reader, dest, bytes, size); -} - -static SerdStatus -read_utf8_code(SerdReader* const reader, - SerdNode* const dest, - uint32_t* const code, - const uint8_t c) -{ - uint32_t size = 0; - uint8_t bytes[4] = {0, 0, 0, 0}; - SerdStatus st = read_utf8_bytes(reader, bytes, &size, c); - if (st) { - const SerdStatus rst = push_bytes(reader, dest, replacement_char, 3); - return rst ? rst : st; - } - - if (!(st = push_bytes(reader, dest, bytes, size))) { - *code = parse_counted_utf8_char(bytes, size); - } - - return st; -} - // Read one character (possibly multi-byte) // The first byte, c, has already been eaten by caller static SerdStatus @@ -279,7 +200,8 @@ read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c) return push_byte(reader, dest, c); } - return read_utf8_character(reader, dest, c); + + return read_utf8_continuation(reader, dest, c); } // [10] comment ::= '#' ( [^#xA #xD] )* @@ -472,8 +394,7 @@ read_PN_CHARS_BASE(SerdReader* const reader, SerdNode* const dest) st = push_byte(reader, dest, eat_byte_safe(reader, c)); } else if (c == EOF || !(c & 0x80)) { return SERD_FAILURE; - } else if ((st = read_utf8_code( - reader, dest, &code, (uint8_t)eat_byte_safe(reader, c)))) { + } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { return st; } else if (!is_PN_CHARS_BASE(code)) { r_err( @@ -502,8 +423,7 @@ read_PN_CHARS(SerdReader* const reader, SerdNode* const dest) st = push_byte(reader, dest, eat_byte_safe(reader, c)); } else if (c == EOF || !(c & 0x80)) { return SERD_FAILURE; - } else if ((st = read_utf8_code( - reader, dest, &code, (uint8_t)eat_byte_safe(reader, c)))) { + } else if ((st = read_utf8_code_point(reader, dest, &code, (uint8_t)c))) { return st; } else if (!is_PN_CHARS(code)) { return r_err( @@ -861,7 +781,7 @@ read_IRIREF(SerdReader* const reader, SerdNode** const dest) } else if (!(c & 0x80)) { st = push_byte(reader, *dest, c); } else { - st = read_utf8_character(reader, *dest, (uint8_t)c); + st = read_utf8_continuation(reader, *dest, (uint8_t)c); } } } diff --git a/src/read_utf8.c b/src/read_utf8.c new file mode 100644 index 00000000..614ea14f --- /dev/null +++ b/src/read_utf8.c @@ -0,0 +1,112 @@ +/* + Copyright 2011-2021 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "read_utf8.h" +#include "reader.h" + +#include "string_utils.h" + +#include <stdio.h> + +static SerdStatus +skip_invalid_utf8(SerdReader* const reader) +{ + for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80);) { + eat_byte_safe(reader, b); + b = peek_byte(reader); + } + + return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE; +} + +static SerdStatus +bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c) +{ + r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, c); + return skip_invalid_utf8(reader); +} + +static SerdStatus +read_utf8_continuation_bytes(SerdReader* const reader, + uint8_t bytes[4], + uint32_t* const size, + const uint8_t lead) +{ + *size = utf8_num_bytes(lead); + if (*size < 1 || *size > 4) { + return bad_char(reader, "0x%X is not a UTF-8 leading byte", lead); + } + + bytes[0] = lead; + + for (uint32_t i = 1u; i < *size; ++i) { + const int b = peek_byte(reader); + if (b == EOF) { + return r_err(reader, SERD_ERR_NO_DATA, "unexpected end of input"); + } + + const uint8_t byte = (uint8_t)b; + if (!(byte & 0x80u)) { + return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte); + } + + eat_byte_safe(reader, b); + bytes[i] = byte; + } + + return SERD_SUCCESS; +} + +SerdStatus +read_utf8_continuation(SerdReader* const reader, + SerdNode* const dest, + const uint8_t lead) +{ + uint32_t size = 0; + uint8_t bytes[4] = {0, 0, 0, 0}; + + SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead); + if (st) { + return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3); + } + + return push_bytes(reader, dest, bytes, size); +} + +SerdStatus +read_utf8_code_point(SerdReader* const reader, + SerdNode* const dest, + uint32_t* const code, + const uint8_t lead) +{ + uint32_t size = 0u; + uint8_t bytes[4] = {lead, 0u, 0u, 0u}; + + *code = 0u; + + eat_byte_safe(reader, lead); + + SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead); + if (st) { + return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3); + } + + if (!(st = push_bytes(reader, dest, bytes, size))) { + *code = parse_counted_utf8_char(bytes, size); + } + + return st; +} diff --git a/src/read_utf8.h b/src/read_utf8.h new file mode 100644 index 00000000..eb78be74 --- /dev/null +++ b/src/read_utf8.h @@ -0,0 +1,35 @@ +/* + Copyright 2011-2021 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#ifndef SERD_READ_UTF8_H +#define SERD_READ_UTF8_H + +#include "serd/serd.h" + +#include <stdint.h> + +/// Read a UTF-8 character continuation (starting after the lead byte) +SerdStatus +read_utf8_continuation(SerdReader* reader, SerdNode* dest, uint8_t lead); + +/// Read a single UTF-8 character and parse it to a code point +SerdStatus +read_utf8_code_point(SerdReader* reader, + SerdNode* dest, + uint32_t* code, + uint8_t lead); + +#endif // SERD_READ_UTF8_H |