diff options
author | David Robillard <d@drobilla.net> | 2011-12-11 06:21:43 +0000 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2011-12-11 06:21:43 +0000 |
commit | b132ab7f84abae6fd337cc32f97df22ad5076ee5 (patch) | |
tree | cd4435aa71bf3c032f715f178cb39128d7c38d06 /src/reader.c | |
parent | 283390ef1bae87030c477d10c0fa43861609f56c (diff) | |
download | serd-b132ab7f84abae6fd337cc32f97df22ad5076ee5.tar.gz serd-b132ab7f84abae6fd337cc32f97df22ad5076ee5.tar.bz2 serd-b132ab7f84abae6fd337cc32f97df22ad5076ee5.zip |
Better invalid string character handling.
git-svn-id: http://svn.drobilla.net/serd/trunk@236 490d8e77-9747-427b-9fa3-0b8f29cee8a0
Diffstat (limited to 'src/reader.c')
-rw-r--r-- | src/reader.c | 50 |
1 files changed, 40 insertions, 10 deletions
diff --git a/src/reader.c b/src/reader.c index 9059236f..4a148b96 100644 --- a/src/reader.c +++ b/src/reader.c @@ -482,6 +482,25 @@ read_ucharacter_escape(SerdReader* reader, Ref dest) } } +static inline SerdStatus +bad_char(SerdReader* reader, Ref dest, const char* fmt, uint8_t c) +{ + warn(reader, fmt, c); + + // Emit replacement character + push_byte(reader, dest, 0xEF); + push_byte(reader, dest, 0xBF); + push_byte(reader, dest, 0xBD); + + // Skip bytes until the next start byte + for (uint8_t c = peek_byte(reader); (c & 0x80);) { + eat_byte(reader, c); + c = peek_byte(reader); + } + + return SERD_SUCCESS; +} + // [38] character ::= '\u' hex hex hex hex // | '\U' hex hex hex hex hex hex hex hex // | '\\' @@ -497,8 +516,9 @@ read_character(SerdReader* reader, Ref dest) return SERD_ERR_BAD_SYNTAX; default: if (c < 0x20) { // ASCII control character - error(reader, "unexpected control character\n"); - return SERD_ERR_BAD_SYNTAX; + return bad_char(reader, dest, + "unexpected control character 0x%X\n", + eat_byte(reader, c)); } else if (c <= 0x7E) { // Printable ASCII push_byte(reader, dest, eat_byte(reader, c)); return SERD_SUCCESS; @@ -511,16 +531,26 @@ read_character(SerdReader* reader, Ref dest) } else if ((c & 0xF8) == 0xF0) { // Starts with `11110' size = 4; } else { - warn(reader, "invalid character\n"); - // Push replacement character - push_byte(reader, dest, 0xEF); - push_byte(reader, dest, 0xBF); - push_byte(reader, dest, 0xBD); - eat_byte(reader, c); - return SERD_SUCCESS; + return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", + eat_byte(reader, c)); } + + char bytes[size]; + bytes[0] = eat_byte(reader, c); + + // Check character validity + for (unsigned i = 1; i < size; ++i) { + if (((bytes[i] = peek_byte(reader)) & 0x80) == 0) { + return bad_char(reader, dest, + "invalid UTF-8 continuation 0x%X\n", + bytes[i]); + } + eat_byte(reader, bytes[i]); + } + + // Emit character for (unsigned i = 0; i < size; ++i) { - push_byte(reader, dest, eat_byte(reader, peek_byte(reader))); + push_byte(reader, dest, bytes[i]); } return SERD_SUCCESS; } |