From 8d87a6aa12745d01abb8ef8468b8e3f258af1996 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Thu, 30 Jan 2025 15:28:14 -0500 Subject: Unify invalid input character error reporting Puts all error messages that print input characters in the same place, so that potentially printing them can be handled properly. Most notably this avoids printing EOF and control characters in error messages. --- NEWS | 3 ++- src/n3.c | 31 ++++++++++--------------------- src/reader.c | 12 ++++++++++++ src/reader.h | 5 ++++- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/NEWS b/NEWS index fa649066..d5dce4fc 100644 --- a/NEWS +++ b/NEWS @@ -1,9 +1,10 @@ serd (0.32.5) unstable; urgency=medium * Fix handling of some invalid EOF cases in lax mode + * Fix invalid characters in error messages * Remove project and version number from man page OS field - -- David Robillard Thu, 30 Jan 2025 19:35:32 +0000 + -- David Robillard Thu, 30 Jan 2025 20:28:00 +0000 serd (0.32.4) stable; urgency=medium diff --git a/src/n3.c b/src/n3.c index 3d4de079..9750532b 100644 --- a/src/n3.c +++ b/src/n3.c @@ -46,7 +46,7 @@ read_HEX(SerdReader* const reader) return (uint8_t)eat_byte_safe(reader, c); } - r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid hexadecimal digit '%c'\n", c); + r_err_char(reader, "hexadecimal", c); return 0; } @@ -465,11 +465,7 @@ read_PN_CHARS_BASE(SerdReader* const reader, const Ref dest) read_utf8_code(reader, dest, &code, (uint8_t)c); if (!is_PN_CHARS_BASE(code)) { - r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name\n", code); - if (reader->strict) { - return SERD_ERR_BAD_SYNTAX; - } + st = r_err_char(reader, "name", (int)code); } return st; @@ -501,8 +497,7 @@ read_PN_CHARS(SerdReader* const reader, const Ref dest) TRY(st, read_utf8_code(reader, dest, &code, (uint8_t)c)); if (!is_PN_CHARS(code)) { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid character U+%04X in name\n", code); + st = r_err_char(reader, "name", (int)code); } return st; @@ -531,7 +526,7 @@ read_PN_LOCAL_ESC(SerdReader* const reader, const Ref dest) return ((c == '!') || in_range(c, '#', '/') || (c == ';') || (c == '=') || (c == '?') || (c == '@') || (c == '_') || (c == '~')) ? push_byte(reader, dest, eat_byte_safe(reader, c)) - : r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid escape\n"); + : r_err(reader, SERD_ERR_BAD_SYNTAX, "bad escape\n"); } static SerdStatus @@ -631,7 +626,7 @@ read_LANGTAG(SerdReader* const reader, Ref* const dest) { int c = peek_byte(reader); if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected '%c'\n", c); + return r_err_char(reader, "language", c); } *dest = push_node(reader, SERD_LITERAL, "", 0); @@ -657,7 +652,7 @@ read_IRIREF_scheme(SerdReader* const reader, const Ref dest) { int c = peek_byte(reader); if (!is_alpha(c)) { - return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad IRI scheme start '%c'\n", c); + return r_err_char(reader, "IRI scheme start", c); } while ((c = peek_byte(reader)) > 0) { @@ -666,11 +661,7 @@ read_IRIREF_scheme(SerdReader* const reader, const Ref dest) } if (!is_uri_scheme_char(c)) { - return r_err(reader, - SERD_ERR_BAD_SYNTAX, - "bad IRI scheme char U+%04X (%c)\n", - (unsigned)c, - (char)c); + return r_err_char(reader, "IRI scheme", c); } push_byte(reader, dest, eat_byte_safe(reader, c)); @@ -704,8 +695,7 @@ read_IRIREF(SerdReader* const reader, Ref* const dest) case '"': case '<': *dest = pop_node(reader, *dest); - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character '%c'\n", c); + return r_err_char(reader, "IRI", c); case '>': return SERD_SUCCESS; @@ -713,7 +703,7 @@ read_IRIREF(SerdReader* const reader, Ref* const dest) case '\\': if (read_UCHAR(reader, *dest, &code)) { *dest = pop_node(reader, *dest); - return r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid IRI escape\n"); + return r_err_char(reader, "IRI escape", c); } if (code == ' ' || code == '<' || code == '>') { @@ -731,8 +721,7 @@ read_IRIREF(SerdReader* const reader, Ref* const dest) case '|': case '}': *dest = pop_node(reader, *dest); - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "invalid IRI character '%c'\n", c); + return r_err_char(reader, "IRI", c); default: if (c <= 0) { diff --git a/src/reader.c b/src/reader.c index b2563c49..778913c8 100644 --- a/src/reader.c +++ b/src/reader.c @@ -30,6 +30,18 @@ r_err(SerdReader* const reader, const SerdStatus st, const char* const fmt, ...) return st; } +SerdStatus +r_err_char(SerdReader* const reader, const char* const kind, const int c) +{ + const SerdStatus st = SERD_ERR_BAD_SYNTAX; + + return (c < 0x20 || c == 0x7F || c > 0x10FFFF) + ? r_err(reader, st, "bad %s character\n", kind) + : (c == '\'' || c >= 0x80) + ? r_err(reader, st, "bad %s character U+%04X\n", kind, (uint32_t)c) + : r_err(reader, st, "bad %s character '%c'\n", kind, c); +} + void set_blank_id(SerdReader* const reader, const Ref ref, const size_t buf_size) { diff --git a/src/reader.h b/src/reader.h index d7b06a98..65c49f2c 100644 --- a/src/reader.h +++ b/src/reader.h @@ -69,6 +69,9 @@ SERD_LOG_FUNC(3, 4) SerdStatus r_err(SerdReader* reader, SerdStatus st, const char* fmt, ...); +SerdStatus +r_err_char(SerdReader* reader, const char* kind, int c); + Ref push_node_padded(SerdReader* reader, size_t maxlen, @@ -143,7 +146,7 @@ eat_byte_check(SerdReader* const reader, const int byte) { const int c = peek_byte(reader); if (c != byte) { - r_err(reader, SERD_ERR_BAD_SYNTAX, "expected '%c', not '%c'\n", byte, c); + r_err(reader, SERD_ERR_BAD_SYNTAX, "expected '%c'\n", byte); return 0; } return eat_byte_safe(reader, byte); -- cgit v1.2.1