From a6cd7dd91d93015ec118286b868c3fd43133f3ac Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 9 Apr 2023 11:54:11 -0400 Subject: Gracefully handle errors when writing text --- src/n3.c | 29 +++++---- src/serd_config.h | 2 +- src/serdi.c | 11 +++- src/string.c | 2 + src/writer.c | 184 ++++++++++++++++++++++++++++++++++-------------------- 5 files changed, 143 insertions(+), 85 deletions(-) (limited to 'src') diff --git a/src/n3.c b/src/n3.c index 573ffdda..011f57fa 100644 --- a/src/n3.c +++ b/src/n3.c @@ -608,8 +608,8 @@ read_PN_LOCAL(SerdReader* const reader, const Ref dest, bool* const ate_dot) return r_err(reader, st, "bad escape\n"); } - if (st != SERD_SUCCESS && read_PN_CHARS_BASE(reader, dest)) { - return SERD_FAILURE; + if (st != SERD_SUCCESS && (st = read_PN_CHARS_BASE(reader, dest))) { + return st; } } @@ -659,11 +659,9 @@ read_PN_PREFIX_tail(SerdReader* const reader, const Ref dest) static SerdStatus read_PN_PREFIX(SerdReader* const reader, const Ref dest) { - if (!read_PN_CHARS_BASE(reader, dest)) { - return read_PN_PREFIX_tail(reader, dest); - } + const SerdStatus st = read_PN_CHARS_BASE(reader, dest); - return SERD_FAILURE; + return st ? st : read_PN_PREFIX_tail(reader, dest); } static SerdStatus @@ -989,10 +987,11 @@ read_verb(SerdReader* const reader, Ref* const dest) return SERD_SUCCESS; } - if (st > SERD_FAILURE || read_PrefixedName(reader, *dest, false, &ate_dot) || - ate_dot) { + if (st > SERD_FAILURE || + (st = read_PrefixedName(reader, *dest, false, &ate_dot)) || ate_dot) { *dest = pop_node(reader, *dest); - return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad verb\n"); + st = st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX; + return r_err(reader, st, "bad verb\n"); } return SERD_SUCCESS; @@ -1125,8 +1124,9 @@ read_anon(SerdReader* const reader, *ctx.flags = old_flags; } - return (eat_byte_check(reader, ']') == ']') ? SERD_SUCCESS - : SERD_ERR_BAD_SYNTAX; + return st > SERD_FAILURE ? st + : (eat_byte_check(reader, ']') == ']') ? 
SERD_SUCCESS + : SERD_ERR_BAD_SYNTAX; } /* If emit is true: recurses, calling statement_sink for every statement @@ -1394,7 +1394,7 @@ read_subject(SerdReader* const reader, bool ate_dot = false; switch ((*s_type = peek_byte(reader))) { case '[': - read_anon(reader, ctx, true, dest); + st = read_anon(reader, ctx, true, dest); break; case '(': st = read_collection(reader, ctx, dest); @@ -1567,9 +1567,8 @@ read_wrappedGraph(SerdReader* const reader, ReadContext* const ctx) return r_err(reader, SERD_ERR_BAD_SYNTAX, "bad subject\n"); } - if (read_triples(reader, *ctx, &ate_dot) && s_type != '[') { - return r_err( - reader, SERD_ERR_BAD_SYNTAX, "missing predicate object list\n"); + if ((st = read_triples(reader, *ctx, &ate_dot)) && s_type != '[') { + return r_err(reader, st, "bad predicate object list\n"); } ctx->subject = pop_node(reader, ctx->subject); diff --git a/src/serd_config.h b/src/serd_config.h index 2a2c9b27..8dbfe28b 100644 --- a/src/serd_config.h +++ b/src/serd_config.h @@ -36,7 +36,7 @@ #define SERD_SRC_SERD_CONFIG_H // Define version unconditionally so a warning will catch a mismatch -#define SERD_VERSION "0.31.1" +#define SERD_VERSION "0.31.3" #if !defined(SERD_NO_DEFAULT_CONFIG) diff --git a/src/serdi.c b/src/serdi.c index d82198ad..f6a329bb 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -144,7 +144,8 @@ choose_style(const SerdSyntax input_syntax, const SerdSyntax output_syntax, const bool ascii, const bool bulk_write, - const bool full_uris) + const bool full_uris, + const bool lax) { unsigned output_style = 0U; if (output_syntax == SERD_NTRIPLES || ascii) { @@ -166,6 +167,10 @@ choose_style(const SerdSyntax input_syntax, output_style |= SERD_STYLE_BULK; } + if (!lax) { + output_style |= SERD_STYLE_STRICT; + } + return (SerdStyle)output_style; } @@ -301,8 +306,8 @@ main(int argc, char** argv) : SERD_NQUADS); } - const SerdStyle output_style = - choose_style(input_syntax, output_syntax, ascii, bulk_write, full_uris); + const SerdStyle output_style = 
choose_style( + input_syntax, output_syntax, ascii, bulk_write, full_uris, lax); SerdURI base_uri = SERD_URI_NULL; SerdNode base = SERD_NODE_NULL; diff --git a/src/string.c b/src/string.c index 07513739..ecba3463 100644 --- a/src/string.c +++ b/src/string.c @@ -39,6 +39,8 @@ serd_strerror(const SerdStatus status) return (const uint8_t*)"Internal error"; case SERD_ERR_BAD_WRITE: return (const uint8_t*)"Error writing to file/stream"; + case SERD_ERR_BAD_TEXT: + return (const uint8_t*)"Invalid text encoding"; } return (const uint8_t*)"Unknown error"; // never reached } diff --git a/src/writer.c b/src/writer.c index bc7146c1..07edc7f4 100644 --- a/src/writer.c +++ b/src/writer.c @@ -11,6 +11,7 @@ #include "serd/serd.h" +#include <errno.h> #include <stdarg.h> #include <stdbool.h> #include <stdint.h> @@ -141,13 +142,13 @@ write_node(SerdWriter* writer, Field field, SerdStatementFlags flags); -static bool +SERD_NODISCARD static bool supports_abbrev(const SerdWriter* writer) { return writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG; } -static bool +SERD_NODISCARD static bool supports_uriref(const SerdWriter* writer) { return writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG; @@ -227,10 +228,20 @@ pop_context(SerdWriter* writer) serd_stack_pop(&writer->anon_stack, sizeof(WriteContext)); } -static size_t +SERD_NODISCARD static size_t sink(const void* buf, size_t len, SerdWriter* writer) { - return serd_byte_sink_write(buf, len, &writer->byte_sink); + const size_t written = serd_byte_sink_write(buf, len, &writer->byte_sink); + if (written != len) { + if (errno) { + const char* const message = strerror(errno); + w_err(writer, SERD_ERR_BAD_WRITE, "write error (%s)\n", message); + } else { + w_err(writer, SERD_ERR_BAD_WRITE, "write error\n"); + } + } + + return written; } SERD_NODISCARD static inline SerdStatus @@ -242,13 +253,17 @@ esink(const void* buf, size_t len, SerdWriter* writer) // Write a single character, as an escape for single byte characters // (Caller prints any single byte characters
that don't need escaping) static size_t -write_character(SerdWriter* writer, const uint8_t* utf8, size_t* size) +write_character(SerdWriter* writer, + const uint8_t* utf8, + size_t* size, + SerdStatus* st) { char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; const uint32_t c = parse_utf8_char(utf8, size); switch (*size) { case 0: - w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8 start: %X\n", utf8[0]); + *st = + w_err(writer, SERD_ERR_BAD_TEXT, "invalid UTF-8 start: %X\n", utf8[0]); return 0; case 1: snprintf(escape, sizeof(escape), "\\u%04X", utf8[0]); @@ -271,7 +286,7 @@ write_character(SerdWriter* writer, const uint8_t* utf8, size_t* size) return sink(escape, 10, writer); } -static bool +SERD_NODISCARD static bool uri_must_escape(const uint8_t c) { switch (c) { @@ -292,7 +307,10 @@ uri_must_escape(const uint8_t c) } static size_t -write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) +write_uri(SerdWriter* writer, + const uint8_t* utf8, + size_t n_bytes, + SerdStatus* st) { size_t len = 0; for (size_t i = 0; i < n_bytes;) { @@ -304,15 +322,25 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) } // Bulk write all characters up to this special one - len += sink(&utf8[i], j - i, writer); + const size_t n_bulk = sink(&utf8[i], j - i, writer); + len += n_bulk; + if (n_bulk != j - i) { + *st = SERD_ERR_BAD_WRITE; + return len; + } + if ((i = j) == n_bytes) { break; // Reached end } // Write UTF-8 character size_t size = 0; - len += write_character(writer, utf8 + i, &size); + len += write_character(writer, utf8 + i, &size, st); i += size; + if (*st && (writer->style & SERD_STYLE_STRICT)) { + break; + } + if (size == 0) { // Corrupt input, write percent-encoded bytes and scan to next start char escape[4] = {0, 0, 0, 0}; @@ -326,6 +354,23 @@ write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) return len; } +SERD_NODISCARD static SerdStatus +ewrite_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) +{ + SerdStatus 
st = SERD_SUCCESS; + write_uri(writer, utf8, n_bytes, &st); + + return (st == SERD_ERR_BAD_WRITE || (writer->style & SERD_STYLE_STRICT)) + ? st + : SERD_SUCCESS; +} + +SERD_NODISCARD static SerdStatus +write_uri_from_node(SerdWriter* writer, const SerdNode* node) +{ + return ewrite_uri(writer, node->buf, node->n_bytes); +} + static bool lname_must_escape(const uint8_t c) { @@ -363,10 +408,10 @@ lname_must_escape(const uint8_t c) return false; } -static size_t +SERD_NODISCARD static SerdStatus write_lname(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) { - size_t len = 0; + SerdStatus st = SERD_SUCCESS; for (size_t i = 0; i < n_bytes; ++i) { size_t j = i; // Index of next character that must be escaped for (; j < n_bytes; ++j) { @@ -376,28 +421,28 @@ write_lname(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes) } // Bulk write all characters up to this special one - len += sink(&utf8[i], j - i, writer); + TRY(st, esink(&utf8[i], j - i, writer)); if ((i = j) == n_bytes) { break; // Reached end } // Write escape - len += sink("\\", 1, writer); - len += sink(&utf8[i], 1, writer); + TRY(st, esink("\\", 1, writer)); + TRY(st, esink(&utf8[i], 1, writer)); } - return len; + return st; } -static size_t +SERD_NODISCARD static SerdStatus write_text(SerdWriter* writer, TextContext ctx, const uint8_t* utf8, size_t n_bytes) { - size_t len = 0; - size_t n_consecutive_quotes = 0; - for (size_t i = 0; i < n_bytes;) { + size_t n_consecutive_quotes = 0; + SerdStatus st = SERD_SUCCESS; + for (size_t i = 0; !st && i < n_bytes;) { if (utf8[i] != '"') { n_consecutive_quotes = 0; } @@ -411,7 +456,7 @@ write_text(SerdWriter* writer, } } - len += sink(&utf8[i], j - i, writer); + st = esink(&utf8[i], j - i, writer); if ((i = j) == n_bytes) { break; // Reached end } @@ -422,44 +467,44 @@ write_text(SerdWriter* writer, switch (in) { case '\\': - len += sink("\\\\", 2, writer); + st = esink("\\\\", 2, writer); continue; case '\b': - len += sink("\\b", 2, writer); + st = 
esink("\\b", 2, writer); continue; case '\n': case '\r': case '\t': case '\f': - len += sink(&in, 1, writer); // Write character as-is + st = esink(&in, 1, writer); // Write character as-is continue; case '\"': if (n_consecutive_quotes >= 3 || i == n_bytes) { // Two quotes in a row, or quote at string end, escape - len += sink("\\\"", 2, writer); + st = esink("\\\"", 2, writer); } else { - len += sink(&in, 1, writer); + st = esink(&in, 1, writer); } continue; default: break; } - } else if (ctx == WRITE_STRING) { + } else { switch (in) { case '\\': - len += sink("\\\\", 2, writer); + st = esink("\\\\", 2, writer); continue; case '\n': - len += sink("\\n", 2, writer); + st = esink("\\n", 2, writer); continue; case '\r': - len += sink("\\r", 2, writer); + st = esink("\\r", 2, writer); continue; case '\t': - len += sink("\\t", 2, writer); + st = esink("\\t", 2, writer); continue; case '"': - len += sink("\\\"", 2, writer); + st = esink("\\\"", 2, writer); continue; default: break; @@ -467,10 +512,10 @@ write_text(SerdWriter* writer, if (writer->syntax == SERD_TURTLE) { switch (in) { case '\b': - len += sink("\\b", 2, writer); + st = esink("\\b", 2, writer); continue; case '\f': - len += sink("\\f", 2, writer); + st = esink("\\f", 2, writer); continue; default: break; @@ -480,10 +525,14 @@ write_text(SerdWriter* writer, // Write UTF-8 character size_t size = 0; - len += write_character(writer, utf8 + i - 1, &size); + write_character(writer, utf8 + i - 1, &size, &st); + if (st && (writer->style & SERD_STYLE_STRICT)) { + return st; + } + if (size == 0) { // Corrupt input, write replacement character and scan to the next start - len += sink(replacement_char, sizeof(replacement_char), writer); + st = esink(replacement_char, sizeof(replacement_char), writer); for (; i < n_bytes && (utf8[i] & 0x80); ++i) { } } else { @@ -491,13 +540,21 @@ write_text(SerdWriter* writer, } } - return len; + return (writer->style & SERD_STYLE_STRICT) ? 
st : SERD_SUCCESS; } -static size_t +typedef struct { + SerdWriter* writer; + SerdStatus status; +} UriSinkContext; + +SERD_NODISCARD static size_t uri_sink(const void* buf, size_t len, void* stream) { - return write_uri((SerdWriter*)stream, (const uint8_t*)buf, len); + UriSinkContext* const context = (UriSinkContext*)stream; + SerdWriter* const writer = context->writer; + + return write_uri(writer, (const uint8_t*)buf, len, &context->status); } SERD_NODISCARD static SerdStatus @@ -596,14 +653,6 @@ reset_context(SerdWriter* writer, const unsigned flags) return SERD_SUCCESS; } -static bool -is_inline_start(const SerdWriter* writer, Field field, SerdStatementFlags flags) -{ - return (supports_abbrev(writer) && - ((field == FIELD_SUBJECT && (flags & SERD_ANON_S_BEGIN)) || - (field == FIELD_OBJECT && (flags & SERD_ANON_O_BEGIN)))); -} - SERD_NODISCARD static SerdStatus write_literal(SerdWriter* writer, const SerdNode* node, @@ -636,11 +685,11 @@ write_literal(SerdWriter* writer, if (supports_abbrev(writer) && (node->flags & (SERD_HAS_NEWLINE | SERD_HAS_QUOTE))) { TRY(st, esink("\"\"\"", 3, writer)); - write_text(writer, WRITE_LONG_STRING, node->buf, node->n_bytes); + TRY(st, write_text(writer, WRITE_LONG_STRING, node->buf, node->n_bytes)); TRY(st, esink("\"\"\"", 3, writer)); } else { TRY(st, esink("\"", 1, writer)); - write_text(writer, WRITE_STRING, node->buf, node->n_bytes); + TRY(st, write_text(writer, WRITE_STRING, node->buf, node->n_bytes)); TRY(st, esink("\"", 1, writer)); } if (lang && lang->buf) { @@ -650,7 +699,8 @@ write_literal(SerdWriter* writer, TRY(st, esink("^^", 2, writer)); return write_node(writer, datatype, NULL, NULL, FIELD_NONE, flags); } - return SERD_SUCCESS; + + return st; } // Return true iff `buf` is a valid prefixed name prefix or suffix @@ -691,10 +741,9 @@ write_uri_node(SerdWriter* const writer, serd_env_qualify(writer->env, node, &prefix, &suffix) && is_name(prefix.buf, prefix.n_bytes) && is_name(suffix.buf, suffix.len)) { - 
write_uri(writer, prefix.buf, prefix.n_bytes); + TRY(st, write_uri_from_node(writer, &prefix)); TRY(st, esink(":", 1, writer)); - write_uri(writer, suffix.buf, suffix.len); - return SERD_SUCCESS; + return ewrite_uri(writer, suffix.buf, suffix.len); } } @@ -707,6 +756,7 @@ write_uri_node(SerdWriter* const writer, } TRY(st, esink("<", 1, writer)); + if (writer->style & SERD_STYLE_RESOLVED) { SerdURI in_base_uri; SerdURI uri; @@ -714,21 +764,21 @@ write_uri_node(SerdWriter* const writer, serd_env_get_base_uri(writer->env, &in_base_uri); serd_uri_parse(node->buf, &uri); serd_uri_resolve(&uri, &in_base_uri, &abs_uri); - bool rooted = uri_is_under(&writer->base_uri, &writer->root_uri); - SerdURI* root = rooted ? &writer->root_uri : &writer->base_uri; + bool rooted = uri_is_under(&writer->base_uri, &writer->root_uri); + SerdURI* root = rooted ? &writer->root_uri : &writer->base_uri; + UriSinkContext ctx = {writer, SERD_SUCCESS}; if (!uri_is_under(&abs_uri, root) || writer->syntax == SERD_NTRIPLES || writer->syntax == SERD_NQUADS) { - serd_uri_serialise(&abs_uri, uri_sink, writer); + serd_uri_serialise(&abs_uri, uri_sink, &ctx); } else { serd_uri_serialise_relative( - &uri, &writer->base_uri, root, uri_sink, writer); + &uri, &writer->base_uri, root, uri_sink, &ctx); } } else { - write_uri(writer, node->buf, node->n_bytes); + TRY(st, write_uri_from_node(writer, node)); } - TRY(st, esink(">", 1, writer)); - return SERD_SUCCESS; + return esink(">", 1, writer); } SERD_NODISCARD static SerdStatus @@ -750,11 +800,11 @@ write_curie(SerdWriter* const writer, const SerdNode* const node) if (!supports_abbrev(writer)) { TRY(st, esink("<", 1, writer)); - write_uri(writer, prefix.buf, prefix.len); - write_uri(writer, suffix.buf, suffix.len); + TRY(st, ewrite_uri(writer, prefix.buf, prefix.len)); + TRY(st, ewrite_uri(writer, suffix.buf, suffix.len)); TRY(st, esink(">", 1, writer)); } else { - write_lname(writer, node->buf, node->n_bytes); + TRY(st, write_lname(writer, node->buf, 
node->n_bytes)); } return st; @@ -769,7 +819,8 @@ write_blank(SerdWriter* const writer, SerdStatus st = SERD_SUCCESS; if (supports_abbrev(writer)) { - if (is_inline_start(writer, field, flags)) { + if ((field == FIELD_SUBJECT && (flags & SERD_ANON_S_BEGIN)) || + (field == FIELD_OBJECT && (flags & SERD_ANON_O_BEGIN))) { return write_sep(writer, SEP_ANON_BEGIN); } @@ -808,6 +859,7 @@ write_node(SerdWriter* writer, SerdStatementFlags flags) { SerdStatus st = SERD_SUCCESS; + switch (node->type) { case SERD_NOTHING: break; @@ -1130,7 +1182,7 @@ serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri) serd_env_get_base_uri(writer->env, &writer->base_uri); - if (writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG) { + if (uri && (writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG)) { TRY(st, terminate_context(writer)); TRY(st, esink("@base <", 7, writer)); TRY(st, esink(uri->buf, uri->n_bytes, writer)); @@ -1172,7 +1224,7 @@ serd_writer_set_prefix(SerdWriter* writer, TRY(st, esink("@prefix ", 8, writer)); TRY(st, esink(name->buf, name->n_bytes, writer)); TRY(st, esink(": <", 3, writer)); - write_uri(writer, uri->buf, uri->n_bytes); + TRY(st, ewrite_uri(writer, uri->buf, uri->n_bytes)); TRY(st, esink(">", 1, writer)); writer->last_sep = SEP_NODE; TRY(st, write_sep(writer, SEP_END_DIRECT)); -- cgit v1.2.1