about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2023-02-27 11:17:42 -0500
committer David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit 3ae0e3a1f77e514dc7169e02d39964080e5e7ef9 (patch)
tree 36b9399f0578762c610e238fe8b0f05d7e5e9b06
parent 65cbb4a13f615658282677fcf04685bae63e893c (diff)
download serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.tar.gz
serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.tar.bz2
serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.zip
Use tighter types for UTF-8
-rw-r--r-- src/read_utf8.c 16
-rw-r--r-- src/string_utils.h 8
-rw-r--r-- src/writer.c 12
3 files changed, 19 insertions, 17 deletions
diff --git a/src/read_utf8.c b/src/read_utf8.c
index c6a24778..fb8ed0e2 100644
--- a/src/read_utf8.c
+++ b/src/read_utf8.c
@@ -8,6 +8,8 @@
#include <stdio.h>
+#define MAX_UTF8_BYTES 4U
+
static SerdStatus
skip_invalid_utf8(SerdReader* const reader)
{
@@ -28,8 +30,8 @@ bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
static SerdStatus
read_utf8_continuation_bytes(SerdReader* const reader,
- uint8_t bytes[4],
- uint32_t* const size,
+ uint8_t bytes[static MAX_UTF8_BYTES],
+ uint8_t* const size,
const uint8_t lead)
{
*size = utf8_num_bytes(lead);
@@ -39,7 +41,7 @@ read_utf8_continuation_bytes(SerdReader* const reader,
bytes[0] = lead;
- for (uint32_t i = 1U; i < *size; ++i) {
+ for (uint8_t i = 1U; i < *size; ++i) {
const int b = peek_byte(reader);
if (b == EOF) {
return r_err(reader, SERD_NO_DATA, "unexpected end of input");
@@ -62,8 +64,8 @@ read_utf8_continuation(SerdReader* const reader,
SerdNode* const dest,
const uint8_t lead)
{
- uint32_t size = 0;
- uint8_t bytes[8] = {lead, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+ uint8_t size = 0;
+ uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U};
SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead);
if (st) {
@@ -79,8 +81,8 @@ read_utf8_code_point(SerdReader* const reader,
uint32_t* const code,
const uint8_t lead)
{
- uint32_t size = 0U;
- uint8_t bytes[8] = {lead, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+ uint8_t size = 0U;
+ uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U};
*code = 0U;
diff --git a/src/string_utils.h b/src/string_utils.h
index 564c58ad..8f7ea083 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -104,7 +104,7 @@ serd_strncasecmp(const char* s1, const char* s2, size_t n)
return 0;
}
-static inline uint32_t
+static inline uint8_t
utf8_num_bytes(const uint8_t leading)
{
return ((leading & 0x80U) == 0x00U) ? 1U // Starts with `0'
@@ -114,7 +114,7 @@ utf8_num_bytes(const uint8_t leading)
: 0U; // Invalid
}
-static inline unsigned
+static inline uint8_t
utf8_num_bytes_for_codepoint(const uint32_t code)
{
return (code < 0x00000080) ? 1U
@@ -126,7 +126,7 @@ utf8_num_bytes_for_codepoint(const uint32_t code)
/// Return the code point of a UTF-8 character with known length
static inline uint32_t
-parse_counted_utf8_char(const uint8_t* const utf8, const size_t size)
+parse_counted_utf8_char(const uint8_t* const utf8, const uint8_t size)
{
uint32_t c = utf8[0] & ((1U << (8U - size)) - 1U);
@@ -139,7 +139,7 @@ parse_counted_utf8_char(const uint8_t* const utf8, const size_t size)
/// Parse a UTF-8 character, set *size to the length, and return the code point
static inline uint32_t
-parse_utf8_char(const uint8_t* const utf8, size_t* const size)
+parse_utf8_char(const uint8_t* const utf8, uint8_t* const size)
{
switch (*size = utf8_num_bytes(utf8[0])) {
case 1:
diff --git a/src/writer.c b/src/writer.c
index 60c17e11..be199af4 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -284,7 +284,7 @@ esink(const void* buf, size_t len, SerdWriter* writer)
static size_t
write_character(SerdWriter* const writer,
const uint8_t* const utf8,
- size_t* const size,
+ uint8_t* const size,
SerdStatus* const st)
{
char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@@ -359,7 +359,7 @@ write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
}
// Write UTF-8 character
- size_t size = 0;
+ uint8_t size = 0;
len += write_character(writer, (const uint8_t*)utf8 + i, &size, st);
i += size;
if (*st && !(writer->flags & SERD_WRITE_LAX)) {
@@ -447,7 +447,7 @@ write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
sets of valid characters. */
// Write first character
- size_t first_size = 0U;
+ uint8_t first_size = 0U;
const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
TRY(st, esink(utf8, first_size, writer));
@@ -457,7 +457,7 @@ write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
// Write middle and last characters
for (size_t i = first_size; i < n_bytes;) {
- size_t c_size = 0U;
+ uint8_t c_size = 0U;
const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) {
@@ -581,7 +581,7 @@ write_text(SerdWriter* writer,
if (escape_len == 0) {
// No special escape for this character, write full Unicode escape
- size_t size = 0;
+ uint8_t size = 0;
write_character(writer, (const uint8_t*)utf8 + i - 1, &size, &st);
if (st && !(writer->flags & SERD_WRITE_LAX)) {
return st;
@@ -593,7 +593,7 @@ write_text(SerdWriter* writer,
for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
}
} else {
- i += size - 1;
+ i += size - 1U;
}
}
}