Factor out UTF-8 character size counting

author: David Robillard <d@drobilla.net> 2017-07-09 20:09:36 +0200
committer: David Robillard <d@drobilla.net> 2017-07-10 12:06:56 +0200
commit: 9c29938c172e2423f67925274a18b4f1c1bb42cf (patch)
tree: f75ce11c9827ef51c570646b4146793b17f70c90 /src
parent: 4d535bbe0390ed4f03c611e433145c9e49cbf3ad (diff)
download: serd-9c29938c172e2423f67925274a18b4f1c1bb42cf.tar.gz
serd-9c29938c172e2423f67925274a18b4f1c1bb42cf.tar.bz2
serd-9c29938c172e2423f67925274a18b4f1c1bb42cf.zip
3 files changed, 26 insertions, 31 deletions
diff --git a/src/reader.c b/src/reader.c
index dd7fa8b2..91e0e920 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -409,14 +409,8 @@ bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c)
 static SerdStatus
 read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
 {
-	unsigned size = 1;
-	if ((c & 0xE0) == 0xC0) {  // Starts with `110'
-		size = 2;
-	} else if ((c & 0xF0) == 0xE0) {  // Starts with `1110'
-		size = 3;
-	} else if ((c & 0xF8) == 0xF0) {  // Starts with `11110'
-		size = 4;
-	} else {
+	const uint32_t size = utf8_num_bytes(c);
+	if (size <= 1 || size > 4) {
 		return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c);
 	}
 
diff --git a/src/serd_internal.h b/src/serd_internal.h
index 55f6a6b6..814b9a5b 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -325,12 +325,35 @@ is_windows_path(const uint8_t* path)
 		&& (path[2] == '/' || path[2] == '\\');
 }
 
+/* String utilities */
+
 size_t
 serd_substrlen(const uint8_t* str,
                const size_t   len,
                size_t*        n_bytes,
                SerdNodeFlags* flags);
 
+/**
+   Return the size in bytes of the UTF-8 character with lead byte `c`, or 0
+   if `c` is a continuation byte or otherwise can not start a character.
+   Explicit masks are used rather than `__builtin_clz(~c << 24)`, since that
+   shifts a negative int (undefined) and computes clz(0) when `c` is 0xFF.
+*/
+static inline uint32_t
+utf8_num_bytes(const uint8_t c)
+{
+	if ((c & 0x80) == 0) {  // Starts with `0'
+		return 1;
+	} else if ((c & 0xE0) == 0xC0) {  // Starts with `110'
+		return 2;
+	} else if ((c & 0xF0) == 0xE0) {  // Starts with `1110'
+		return 3;
+	} else if ((c & 0xF8) == 0xF0) {  // Starts with `11110'
+		return 4;
+	}
+	return 0;
+}
+
 /* URI utilities */
 
 static inline bool
diff --git a/src/writer.c b/src/writer.c
index 63b8d5af..c293b4f8 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -158,33 +158,11 @@ sink(const void* buf, size_t len, SerdWriter* writer)
 	return serd_byte_sink_write(buf, len, &writer->byte_sink);
 }
 
-// Return the number of bytes in a UTF-8 character
-static inline uint32_t
-utf8_num_bytes(const uint8_t* utf8)
-{
-	if ((utf8[0] & 0x80) == 0) {  // Starts with `0'
-		return 1;
-	}
-
-#ifdef HAVE_BUILTIN_CLZ
-	return __builtin_clz(~utf8[0] << 24);
-#else
-	if ((utf8[0] & 0xE0) == 0xC0) {  // Starts with `110'
-		return 2;
-	} else if ((utf8[0] & 0xF0) == 0xE0) {  // Starts with `1110'
-		return 3;
-	} else if ((utf8[0] & 0xF8) == 0xF0) {  // Starts with `11110'
-		return 4;
-	}
-	return 0;
-#endif
-}
-
 // Parse a UTF-8 character, set *size to the length, and return the code point
 static inline uint32_t
 parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
 {
-	switch (*size = utf8_num_bytes(utf8)) {
+	switch (*size = utf8_num_bytes(utf8[0])) {
 	case 1: case 2: case 3: case 4:
 		break;
 	default:
author	David Robillard <d@drobilla.net>	2017-07-09 20:09:36 +0200
committer	David Robillard <d@drobilla.net>	2017-07-10 12:06:56 +0200
commit	9c29938c172e2423f67925274a18b4f1c1bb42cf (patch)
tree	f75ce11c9827ef51c570646b4146793b17f70c90 /src
parent	4d535bbe0390ed4f03c611e433145c9e49cbf3ad (diff)
download	serd-9c29938c172e2423f67925274a18b4f1c1bb42cf.tar.gz serd-9c29938c172e2423f67925274a18b4f1c1bb42cf.tar.bz2 serd-9c29938c172e2423f67925274a18b4f1c1bb42cf.zip