Use tighter types for UTF-8

author: David Robillard <d@drobilla.net> 2023-02-27 11:17:42 -0500
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit: 3ae0e3a1f77e514dc7169e02d39964080e5e7ef9 (patch)
tree: 36b9399f0578762c610e238fe8b0f05d7e5e9b06
parent: 65cbb4a13f615658282677fcf04685bae63e893c (diff)
download: serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.tar.gz serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.tar.bz2 serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.zip
3 files changed, 19 insertions, 17 deletions
diff --git a/src/read_utf8.c b/src/read_utf8.c
index c6a24778..fb8ed0e2 100644
--- a/src/read_utf8.c
+++ b/src/read_utf8.c
@@ -8,6 +8,8 @@
 
 #include <stdio.h>
 
+#define MAX_UTF8_BYTES 4U
+
 static SerdStatus
 skip_invalid_utf8(SerdReader* const reader)
 {
@@ -28,8 +30,8 @@ bad_char(SerdReader* const reader, const char* const fmt, const uint8_t c)
 
 static SerdStatus
 read_utf8_continuation_bytes(SerdReader* const reader,
-                             uint8_t           bytes[4],
-                             uint32_t* const   size,
+                             uint8_t           bytes[static MAX_UTF8_BYTES],
+                             uint8_t* const    size,
                              const uint8_t     lead)
 {
   *size = utf8_num_bytes(lead);
@@ -39,7 +41,7 @@ read_utf8_continuation_bytes(SerdReader* const reader,
 
   bytes[0] = lead;
 
-  for (uint32_t i = 1U; i < *size; ++i) {
+  for (uint8_t i = 1U; i < *size; ++i) {
     const int b = peek_byte(reader);
     if (b == EOF) {
       return r_err(reader, SERD_NO_DATA, "unexpected end of input");
@@ -62,8 +64,8 @@ read_utf8_continuation(SerdReader* const reader,
                        SerdNode* const   dest,
                        const uint8_t     lead)
 {
-  uint32_t size     = 0;
-  uint8_t  bytes[8] = {lead, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+  uint8_t size                  = 0;
+  uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U};
 
   SerdStatus st = read_utf8_continuation_bytes(reader, bytes, &size, lead);
   if (st) {
@@ -79,8 +81,8 @@ read_utf8_code_point(SerdReader* const reader,
                      uint32_t* const   code,
                      const uint8_t     lead)
 {
-  uint32_t size     = 0U;
-  uint8_t  bytes[8] = {lead, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+  uint8_t size                  = 0U;
+  uint8_t bytes[MAX_UTF8_BYTES] = {lead, 0U, 0U, 0U};
 
   *code = 0U;
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 564c58ad..8f7ea083 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -104,7 +104,7 @@ serd_strncasecmp(const char* s1, const char* s2, size_t n)
   return 0;
 }
 
-static inline uint32_t
+static inline uint8_t
 utf8_num_bytes(const uint8_t leading)
 {
   return ((leading & 0x80U) == 0x00U)   ? 1U  // Starts with `0'
@@ -114,7 +114,7 @@ utf8_num_bytes(const uint8_t leading)
                                         : 0U; // Invalid
 }
 
-static inline unsigned
+static inline uint8_t
 utf8_num_bytes_for_codepoint(const uint32_t code)
 {
   return (code < 0x00000080)   ? 1U
@@ -126,7 +126,7 @@ utf8_num_bytes_for_codepoint(const uint32_t code)
 
 /// Return the code point of a UTF-8 character with known length
 static inline uint32_t
-parse_counted_utf8_char(const uint8_t* const utf8, const size_t size)
+parse_counted_utf8_char(const uint8_t* const utf8, const uint8_t size)
 {
   uint32_t c = utf8[0] & ((1U << (8U - size)) - 1U);
 
@@ -139,7 +139,7 @@ parse_counted_utf8_char(const uint8_t* const utf8, const size_t size)
 
 /// Parse a UTF-8 character, set *size to the length, and return the code point
 static inline uint32_t
-parse_utf8_char(const uint8_t* const utf8, size_t* const size)
+parse_utf8_char(const uint8_t* const utf8, uint8_t* const size)
 {
   switch (*size = utf8_num_bytes(utf8[0])) {
   case 1:
diff --git a/src/writer.c b/src/writer.c
index 60c17e11..be199af4 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -284,7 +284,7 @@ esink(const void* buf, size_t len, SerdWriter* writer)
 static size_t
 write_character(SerdWriter* const    writer,
                 const uint8_t* const utf8,
-                size_t* const        size,
+                uint8_t* const       size,
                 SerdStatus* const    st)
 {
   char           escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@@ -359,7 +359,7 @@ write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
     }
 
     // Write UTF-8 character
-    size_t size = 0;
+    uint8_t size = 0;
     len += write_character(writer, (const uint8_t*)utf8 + i, &size, st);
     i += size;
     if (*st && !(writer->flags & SERD_WRITE_LAX)) {
@@ -447,7 +447,7 @@ write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
      sets of valid characters. */
 
   // Write first character
-  size_t    first_size = 0U;
+  uint8_t   first_size = 0U;
   const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
   if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
     TRY(st, esink(utf8, first_size, writer));
@@ -457,7 +457,7 @@ write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
 
   // Write middle and last characters
   for (size_t i = first_size; i < n_bytes;) {
-    size_t    c_size = 0U;
+    uint8_t   c_size = 0U;
     const int c      = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
 
     if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) {
@@ -581,7 +581,7 @@ write_text(SerdWriter* writer,
 
     if (escape_len == 0) {
       // No special escape for this character, write full Unicode escape
-      size_t size = 0;
+      uint8_t size = 0;
       write_character(writer, (const uint8_t*)utf8 + i - 1, &size, &st);
       if (st && !(writer->flags & SERD_WRITE_LAX)) {
         return st;
@@ -593,7 +593,7 @@ write_text(SerdWriter* writer,
         for (; i < n_bytes && (utf8[i] & 0x80); ++i) {
         }
       } else {
-        i += size - 1;
+        i += size - 1U;
       }
     }
   }
author	David Robillard <d@drobilla.net>	2023-02-27 11:17:42 -0500
committer	David Robillard <d@drobilla.net>	2023-12-02 18:49:07 -0500
commit	3ae0e3a1f77e514dc7169e02d39964080e5e7ef9 (patch)
tree	36b9399f0578762c610e238fe8b0f05d7e5e9b06
parent	65cbb4a13f615658282677fcf04685bae63e893c (diff)
download	serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.tar.gz serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.tar.bz2 serd-3ae0e3a1f77e514dc7169e02d39964080e5e7ef9.zip