/*
  Copyright 2011-2021 David Robillard

  Permission to use, copy, modify, and/or distribute this software for any
  purpose with or without fee is hereby granted, provided that the above
  copyright notice and this permission notice appear in all copies.

  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include "read_utf8.h"
#include "reader.h"
#include "string_utils.h"

#include <stdio.h>

/**
   Consume input for as long as the next byte has its high bit set.

   Used to step over the remainder of a malformed sequence after an error has
   been reported.  Stops at EOF or at the first byte below 0x80, which is left
   unread.

   @return SERD_ERR_BAD_SYNTAX if reader->strict is set, otherwise
   SERD_FAILURE.
*/
static SerdStatus
skip_invalid_utf8(SerdReader* const reader)
{
  for (int b = peek_byte(reader); b != EOF && ((uint8_t)b & 0x80u);) {
    eat_byte_safe(reader, b);
    b = peek_byte(reader);
  }

  return reader->strict ? SERD_ERR_BAD_SYNTAX : SERD_FAILURE;
}

/**
   Report an invalid byte and skip past the following high-bit bytes.

   @param reader Reader to report the error on and consume input from.
   @param fmt Message format taking a single `%X` argument.
   @param c The offending byte, passed as the format argument.
   @return The result of skip_invalid_utf8().
*/
static SerdStatus
bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
{
  /* %X expects an unsigned int, so convert explicitly rather than relying on
     the default promotion of uint8_t to (signed) int. */
  r_err(reader, SERD_ERR_BAD_SYNTAX, fmt, (unsigned)c);
  return skip_invalid_utf8(reader);
}

/**
   Read the bytes that follow a UTF-8 leading byte into `bytes`.

   The sequence length is taken from utf8_num_bytes(lead).  The leading byte
   itself is not consumed here, it is only stored in bytes[0].

   @param reader Reader to consume the continuation bytes from.
   @param bytes Output buffer, bytes[0] is set to `lead` and bytes[1..size-1]
   to the continuation bytes that were read.
   @param size Set to the sequence length reported for `lead`.
   @param lead The leading byte of the sequence.
   @return SERD_SUCCESS if a complete sequence was read, the result of r_err()
   if the input ended early, or the result of bad_char() if `lead` is not a
   leading byte or a following byte is not a continuation byte.
*/
static SerdStatus
read_utf8_continuation_bytes(SerdReader* const reader,
                             uint8_t           bytes[4],
                             uint32_t* const   size,
                             const uint8_t     lead)
{
  *size = utf8_num_bytes(lead);
  if (*size < 1 || *size > 4) {
    return bad_char(reader, "0x%X is not a UTF-8 leading byte", lead);
  }

  bytes[0] = lead;

  for (uint32_t i = 1u; i < *size; ++i) {
    const int b = peek_byte(reader);
    if (b == EOF) {
      return r_err(reader, SERD_ERR_NO_DATA, "unexpected end of input");
    }

    /* A continuation byte has the form 10xxxxxx.  Checking only the high bit
       would also accept 11xxxxxx bytes (leading bytes, 0xFE, 0xFF), which
       would let malformed sequences through to the decoder. */
    const uint8_t byte = (uint8_t)b;
    if ((byte & 0xC0u) != 0x80u) {
      return bad_char(reader, "0x%X is not a UTF-8 continuation byte", byte);
    }

    eat_byte_safe(reader, b);
    bytes[i] = byte;
  }

  return SERD_SUCCESS;
}

/**
   Read the rest of a UTF-8 sequence and append the whole sequence to `dest`.

   Unlike read_utf8_code_point(), this does not consume `lead` from the
   reader.

   @param reader Reader to consume the continuation bytes from.
   @param dest Node that the bytes are pushed to.
   @param lead The leading byte of the sequence.
   @return The result of pushing the sequence to `dest` on success.  If the
   sequence could not be read, the read error is returned when reader->strict
   is set, otherwise `replacement_char` (3 bytes) is pushed instead and the
   result of that push is returned.
*/
SerdStatus
read_utf8_continuation(SerdReader* const reader,
                       SerdNode* const   dest,
                       const uint8_t     lead)
{
  uint32_t size     = 0;
  uint8_t  bytes[4] = {0, 0, 0, 0};

  const SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead);
  if (st) {
    return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3);
  }

  return push_bytes(reader, dest, bytes, size);
}

/**
   Consume a UTF-8 sequence starting at `lead`, append it to `dest`, and
   decode it.

   `lead` is consumed from the reader with eat_byte_safe() before the
   continuation bytes are read.

   @param reader Reader to consume the sequence from.
   @param dest Node that the bytes are pushed to.
   @param code Set to 0 on entry, and to the result of
   parse_counted_utf8_char() only if the sequence was read and pushed
   successfully.
   @param lead The leading byte of the sequence.
   @return The result of pushing the sequence to `dest` on success.  If the
   sequence could not be read, the read error is returned when reader->strict
   is set, otherwise `replacement_char` (3 bytes) is pushed instead and the
   result of that push is returned (`*code` stays 0 in either case).
*/
SerdStatus
read_utf8_code_point(SerdReader* const reader,
                     SerdNode* const   dest,
                     uint32_t* const   code,
                     const uint8_t     lead)
{
  uint32_t size     = 0u;
  uint8_t  bytes[4] = {lead, 0u, 0u, 0u};

  *code = 0u;

  eat_byte_safe(reader, lead);

  SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead);
  if (st) {
    return reader->strict ? st : push_bytes(reader, dest, replacement_char, 3);
  }

  if (!(st = push_bytes(reader, dest, bytes, size))) {
    *code = parse_counted_utf8_char(bytes, size);
  }

  return st;
}