aboutsummaryrefslogtreecommitdiffstats
path: root/src/reader.c
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2011-12-11 06:21:43 +0000
committerDavid Robillard <d@drobilla.net>2011-12-11 06:21:43 +0000
commitb132ab7f84abae6fd337cc32f97df22ad5076ee5 (patch)
treecd4435aa71bf3c032f715f178cb39128d7c38d06 /src/reader.c
parent283390ef1bae87030c477d10c0fa43861609f56c (diff)
downloadserd-b132ab7f84abae6fd337cc32f97df22ad5076ee5.tar.gz
serd-b132ab7f84abae6fd337cc32f97df22ad5076ee5.tar.bz2
serd-b132ab7f84abae6fd337cc32f97df22ad5076ee5.zip
Better invalid string character handling.
git-svn-id: http://svn.drobilla.net/serd/trunk@236 490d8e77-9747-427b-9fa3-0b8f29cee8a0
Diffstat (limited to 'src/reader.c')
-rw-r--r--src/reader.c50
1 files changed, 40 insertions, 10 deletions
diff --git a/src/reader.c b/src/reader.c
index 9059236f..4a148b96 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -482,6 +482,25 @@ read_ucharacter_escape(SerdReader* reader, Ref dest)
}
}
+/**
+ * Recover from an invalid character encountered while reading a string.
+ *
+ * Issues a warning by passing `fmt` and `c` to warn(), appends the UTF-8
+ * encoding of U+FFFD REPLACEMENT CHARACTER to `dest`, then skips any UTF-8
+ * continuation bytes so that reading resumes at the next start byte.
+ *
+ * Always returns SERD_SUCCESS.
+ */
+static inline SerdStatus
+bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c)
+{
+ warn(reader, fmt, c);
+
+ // Emit replacement character (U+FFFD encoded as UTF-8)
+ push_byte(reader, dest, 0xEF);
+ push_byte(reader, dest, 0xBF);
+ push_byte(reader, dest, 0xBD);
+
+ // Skip continuation bytes (10xxxxxx) until the next start byte.  Testing
+ // only the high bit would also discard valid multi-byte start bytes.
+ for (uint8_t b = peek_byte(reader); (b & 0xC0) == 0x80;) {
+ eat_byte(reader, b);
+ b = peek_byte(reader);
+ }
+
+ return SERD_SUCCESS;
+}
+
// [38] character ::= '\u' hex hex hex hex
// | '\U' hex hex hex hex hex hex hex hex
// | '\\'
@@ -497,8 +516,9 @@ read_character(SerdReader* reader, Ref dest)
return SERD_ERR_BAD_SYNTAX;
default:
if (c < 0x20) { // ASCII control character
- error(reader, "unexpected control character\n");
- return SERD_ERR_BAD_SYNTAX;
+ return bad_char(reader, dest,
+ "unexpected control character 0x%X\n",
+ eat_byte(reader, c));
} else if (c <= 0x7E) { // Printable ASCII
push_byte(reader, dest, eat_byte(reader, c));
return SERD_SUCCESS;
@@ -511,16 +531,26 @@ read_character(SerdReader* reader, Ref dest)
} else if ((c & 0xF8) == 0xF0) { // Starts with `11110'
size = 4;
} else {
- warn(reader, "invalid character\n");
- // Push replacement character
- push_byte(reader, dest, 0xEF);
- push_byte(reader, dest, 0xBF);
- push_byte(reader, dest, 0xBD);
- eat_byte(reader, c);
- return SERD_SUCCESS;
+ return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n",
+ eat_byte(reader, c));
}
+
+ char bytes[size];
+ bytes[0] = eat_byte(reader, c);
+
+ // Check character validity
+ for (unsigned i = 1; i < size; ++i) {
+ if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) {
+ return bad_char(reader, dest,
+ "invalid UTF-8 continuation 0x%X\n",
+ bytes[i]);
+ }
+ eat_byte(reader, bytes[i]);
+ }
+
+ // Emit character
for (unsigned i = 0; i < size; ++i) {
- push_byte(reader, dest, eat_byte(reader, peek_byte(reader)));
+ push_byte(reader, dest, bytes[i]);
}
return SERD_SUCCESS;
}