From b132ab7f84abae6fd337cc32f97df22ad5076ee5 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 11 Dec 2011 06:21:43 +0000 Subject: Better invalid string character handling. git-svn-id: http://svn.drobilla.net/serd/trunk@236 490d8e77-9747-427b-9fa3-0b8f29cee8a0 --- src/reader.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'src/reader.c') diff --git a/src/reader.c b/src/reader.c index 9059236f..4a148b96 100644 --- a/src/reader.c +++ b/src/reader.c @@ -482,6 +482,25 @@ read_ucharacter_escape(SerdReader* reader, Ref dest) } } +static inline SerdStatus +bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c) +{ + warn(reader, fmt, c); + + // Emit replacement character + push_byte(reader, dest, 0xEF); + push_byte(reader, dest, 0xBF); + push_byte(reader, dest, 0xBD); + + // Skip bytes until the next start byte + for (uint8_t c = peek_byte(reader); (c & 0x80);) { + eat_byte(reader, c); + c = peek_byte(reader); + } + + return SERD_SUCCESS; +} + // [38] character ::= '\u' hex hex hex hex // | '\U' hex hex hex hex hex hex hex hex // | '\\' @@ -497,8 +516,9 @@ read_character(SerdReader* reader, Ref dest) return SERD_ERR_BAD_SYNTAX; default: if (c < 0x20) { // ASCII control character - error(reader, "unexpected control character\n"); - return SERD_ERR_BAD_SYNTAX; + return bad_char(reader, dest, + "unexpected control character 0x%X\n", + eat_byte(reader, c)); } else if (c <= 0x7E) { // Printable ASCII push_byte(reader, dest, eat_byte(reader, c)); return SERD_SUCCESS; @@ -511,16 +531,26 @@ read_character(SerdReader* reader, Ref dest) } else if ((c & 0xF8) == 0xF0) { // Starts with `11110' size = 4; } else { - warn(reader, "invalid character\n"); - // Push replacement character - push_byte(reader, dest, 0xEF); - push_byte(reader, dest, 0xBF); - push_byte(reader, dest, 0xBD); - eat_byte(reader, c); - return SERD_SUCCESS; + return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", + eat_byte(reader, c)); } + + char bytes[size]; + bytes[0] = eat_byte(reader, c); + + // Check character validity + for (unsigned i = 1; i < size; ++i) { + if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) { + return bad_char(reader, dest, + "invalid UTF-8 continuation 0x%X\n", + bytes[i]); + } + eat_byte(reader, bytes[i]); + } + + // Emit character for (unsigned i = 0; i < size; ++i) { - push_byte(reader, dest, eat_byte(reader, peek_byte(reader))); + push_byte(reader, dest, bytes[i]); } return SERD_SUCCESS; } -- cgit v1.2.1