diff options
author | David Robillard <d@drobilla.net> | 2023-02-05 18:39:49 -0500 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 343124df71010055c2c1e6cdcadd13d23b2c013a (patch) | |
tree | 7c2de6a72021adaac89e9c4fa97e7cc5503e0657 /src | |
parent | 530edb265fbbed20e6d3a6fd7a36461ff83d9b46 (diff) | |
download | serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.gz serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.bz2 serd-343124df71010055c2c1e6cdcadd13d23b2c013a.zip |
[WIP] Add support for URI hex escape decoding
Diffstat (limited to 'src')
-rw-r--r-- | src/read_ntriples.c | 66 | ||||
-rw-r--r-- | src/writer.c | 351 |
2 files changed, 290 insertions, 127 deletions
diff --git a/src/read_ntriples.c b/src/read_ntriples.c index dd5c28fc..5c02abfe 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -108,6 +108,67 @@ read_IRI_scheme(SerdReader* const reader, SerdNode* const dest) return st ? st : SERD_BAD_SYNTAX; } +static SerdStatus +read_hex_byte(SerdReader* const reader, uint8_t digits[const 2]) +{ + for (unsigned i = 0U; i < 2U; ++i) { + if (!(digits[i] = read_HEX(reader))) { + return SERD_BAD_SYNTAX; + } + } + + return SERD_SUCCESS; +} + +static uint8_t +hex_byte_value(const uint8_t c0, const uint8_t c1) +{ + return (uint8_t)((hex_digit_value(c0) << 4U) | hex_digit_value(c1)); +} + +/// RFC3986 S2.1: pct-encoded = "%" HEXDIG HEXDIG +static SerdStatus +read_pct_encoded(SerdReader* const reader, SerdNode* const node) +{ + SerdStatus st = SERD_SUCCESS; + uint8_t hex[9] = {0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U}; + + // Read first percent-encoded byte + TRY(st, read_hex_byte(reader, hex)); + + // Parse the leading byte and get the encoded size from it + uint8_t byte = hex_byte_value(hex[0], hex[1]); + const uint32_t size = utf8_num_bytes(byte); + if (!size) { + return SERD_BAD_TEXT; + } + + // Avoid decoding '%' itself + if (byte == '%') { + return push_bytes(reader, node, (const uint8_t*)"%25", 3); + } + + // Push the leading byte to the node + TRY(st, push_byte(reader, node, byte)); + + // Read remaining hex-encoded bytes + for (unsigned i = 1; i < size; ++i) { + const unsigned offset = 2U * i; + uint8_t* const digits = hex + offset; + TRY(st, eat_byte_check(reader, '%')); + TRY(st, read_hex_byte(reader, digits)); + + byte = hex_byte_value(digits[0], digits[1]); + if (!is_utf8_continuation(byte)) { + return SERD_BAD_TEXT; + } + + TRY(st, push_byte(reader, node, byte)); + } + + return st; +} + SerdStatus read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) { @@ -131,6 +192,11 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) case '>': return SERD_SUCCESS; + case '%': + st = (reader->flags & SERD_READ_DECODED) ? read_pct_encoded(reader, node) + : push_byte(reader, node, c); + break; + case '\\': if (!(st = read_UCHAR(reader, node, &code)) && (code == ' ' || code == '<' || code == '>')) { diff --git a/src/writer.c b/src/writer.c index fd52a123..ccb24e5f 100644 --- a/src/writer.c +++ b/src/writer.c @@ -53,6 +53,13 @@ typedef struct { bool comma_indented; } WriteContext; +/// A status for an operation that reads/writes variable numbers of bytes +typedef struct { + SerdStatus status; + size_t read_count; + size_t write_count; +} VariableResult; + static const WriteContext WRITE_CONTEXT_NULL = {CTX_NAMED, 0U, NULL, NULL, NULL, 0U, 0U}; @@ -280,47 +287,127 @@ esink(const void* buf, size_t len, SerdWriter* writer) return sink(buf, len, writer) == len ? SERD_SUCCESS : SERD_BAD_WRITE; } -// Write a single character as a Unicode escape -// (Caller prints any single byte characters that don't need escaping) -static size_t -write_character(SerdWriter* const writer, - const uint8_t* const utf8, - uint8_t* const size, - SerdStatus* const st) +static VariableResult +write_UCHAR(SerdWriter* const writer, const uint8_t* const utf8) { + VariableResult result = {SERD_SUCCESS, 0U, 0U}; char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const uint32_t c = parse_utf8_char(utf8, size); - switch (*size) { - case 0: - *st = w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]); - return 0; - case 1: - snprintf(escape, sizeof(escape), "\\u%04X", utf8[0]); - return sink(escape, 6, writer); - default: - break; + uint8_t c_size = 0U; + const uint32_t c = parse_utf8_char(utf8, &c_size); + + result.read_count = c_size; + if (result.read_count == 0U) { + result.status = + w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]); + } else if (c <= 0xFFFF) { + // Write short (4 digit) escape + snprintf(escape, sizeof(escape), "\\u%04X", c); + result.write_count = sink(escape, 6, writer); + } else { + // Write long (6 digit) escape + snprintf(escape, sizeof(escape), "\\U%08X", c); + result.write_count = sink(escape, 10, writer); } - if (!(writer->flags & SERD_WRITE_ASCII)) { - // Write UTF-8 character directly to UTF-8 output - return sink(utf8, *size, writer); + return result; +} + +SERD_NODISCARD static VariableResult +write_percent_encoded_bytes(SerdWriter* const writer, + const size_t size, + const uint8_t* const data) +{ + static const char hex_chars[] = "0123456789ABCDEF"; + + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + char escape[4] = {'%', 0, 0, 0}; + + for (size_t i = 0U; !result.status && i < size; ++i) { + const uint8_t byte = data[i]; + escape[1] = hex_chars[byte >> 4U]; + escape[2] = hex_chars[byte & 0x0FU]; + + const size_t n_written = sink(escape, 3U, writer); + result.write_count += n_written; + if (n_written != 3U) { + result.status = SERD_BAD_WRITE; + } + + ++result.read_count; } - if (c <= 0xFFFF) { - snprintf(escape, sizeof(escape), "\\u%04X", c); - return sink(escape, 6, writer); + return result; +} + +static VariableResult +write_text_character(SerdWriter* const writer, const uint8_t* const utf8) +{ + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + const uint8_t c = utf8[0]; + + if ((writer->flags & (SERD_WRITE_ASCII | SERD_WRITE_ESCAPES)) || c < 0x20U || + c == 0x7FU) { + // Write ASCII-compatible UCHAR escape like "\u1234" + return write_UCHAR(writer, utf8); + } + + // Parse the leading byte to get the UTF-8 encoding size + if (!(result.read_count = utf8_num_bytes(c))) { + result.status = SERD_BAD_TEXT; + return result; + } + + // Write the UTF-8 encoding directly to the output + result.write_count = sink(utf8, result.read_count, writer); + if (result.write_count != result.read_count) { + result.status = SERD_BAD_WRITE; + } + + return result; +} + +static VariableResult +write_uri_character(SerdWriter* const writer, const uint8_t* const utf8) +{ + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + const uint8_t c = utf8[0]; + + if ((writer->flags & SERD_WRITE_ESCAPES)) { + return write_UCHAR(writer, utf8); + } + + if (c == '%') { + // Avoid encoding '%' itself + result.read_count = 1; + result.write_count = sink("%25", 3, writer); + return result; + } + + if ((c & 0x80U) && !(writer->flags & SERD_WRITE_ASCII)) { + // Parse the leading byte to get the UTF-8 encoding size + if (!(result.read_count = utf8_num_bytes(c))) { + result.status = SERD_BAD_TEXT; + } else { + // Write the UTF-8 encoding directly to the output + result.write_count = sink(utf8, result.read_count, writer); + if (result.write_count != result.read_count) { + result.status = SERD_BAD_WRITE; + } + } + + return result; } - snprintf(escape, sizeof(escape), "\\U%08X", c); - return sink(escape, 10, writer); + return write_percent_encoded_bytes(writer, 1U, utf8); } static bool -uri_must_escape(const int c) +uri_must_escape(const uint8_t c) { switch (c) { case ' ': case '"': + // case '%': case '<': case '>': case '\\': @@ -336,58 +423,60 @@ uri_must_escape(const int c) } static size_t -write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st) +next_text_index(const char* utf8, + const size_t begin, + const size_t end, + bool (*const predicate)(uint8_t)) { - size_t len = 0; - for (size_t i = 0; i < n_bytes;) { - size_t j = i; // Index of next character that must be escaped - for (; j < n_bytes; ++j) { - if (uri_must_escape(utf8[j])) { - break; - } - } - - // Bulk write all characters up to this special one - const size_t n_bulk = sink(&utf8[i], j - i, writer); - len += n_bulk; - if (n_bulk != j - i) { - *st = SERD_BAD_WRITE; - return len; - } + size_t i = begin; + while (i < end && !predicate((uint8_t)utf8[i])) { + ++i; + } + return i; +} +static VariableResult +write_uri(SerdWriter* writer, const char* utf8, const size_t n_bytes) +{ + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + for (size_t i = 0; i < n_bytes;) { + // Write leading chunk as a single fast bulk write + const size_t j = next_text_index(utf8, i, n_bytes, uri_must_escape); + result.status = esink(&utf8[i], j - i, writer); if ((i = j) == n_bytes) { break; // Reached end } - // Write UTF-8 character - uint8_t size = 0; - len += write_character(writer, (const uint8_t*)utf8 + i, &size, st); - i += size; - if (*st && !(writer->flags & SERD_WRITE_LAX)) { + // Write character (escape or UTF-8) + const VariableResult r = + write_uri_character(writer, (const uint8_t*)utf8 + i); + i += r.read_count; + result.write_count += r.write_count; + if (r.status && !(writer->flags & SERD_WRITE_LAX)) { + result.status = r.status; break; } - if (size == 0) { + if (r.read_count == 0) { // Corrupt input, write percent-encoded bytes and scan to next start char escape[4] = {0, 0, 0, 0}; for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) { snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]); - len += sink(escape, 3, writer); + result.write_count += sink(escape, 3, writer); } } } - return len; + return result; } SERD_NODISCARD static SerdStatus ewrite_uri(SerdWriter* writer, const char* utf8, size_t n_bytes) { - SerdStatus st = SERD_SUCCESS; - write_uri(writer, utf8, n_bytes, &st); + const VariableResult r = write_uri(writer, utf8, n_bytes); - return (st == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX)) - ? st + return (r.status == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX)) + ? r.status : SERD_SUCCESS; } @@ -398,27 +487,6 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node) } SERD_NODISCARD static SerdStatus -write_utf8_percent_escape(SerdWriter* const writer, - const char* const utf8, - const size_t n_bytes) -{ - static const char hex_chars[] = "0123456789ABCDEF"; - - SerdStatus st = SERD_SUCCESS; - char escape[4] = {'%', 0, 0, 0}; - - for (size_t i = 0U; i < n_bytes; ++i) { - const uint8_t byte = (uint8_t)utf8[i]; - escape[1] = hex_chars[byte >> 4U]; - escape[2] = hex_chars[byte & 0x0FU]; - - TRY(st, esink(escape, 3, writer)); - } - - return st; -} - -SERD_NODISCARD static SerdStatus write_PN_LOCAL_ESC(SerdWriter* const writer, const char c) { const char buf[2] = {'\\', c}; @@ -431,7 +499,8 @@ write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes) { return is_PN_LOCAL_ESC(utf8[0]) ? write_PN_LOCAL_ESC(writer, utf8[0]) - : write_utf8_percent_escape(writer, utf8, n_bytes); + : write_percent_encoded_bytes(writer, n_bytes, (const uint8_t*)utf8) + .status; } SERD_NODISCARD static SerdStatus @@ -518,14 +587,16 @@ write_short_string_escape(SerdWriter* const writer, const char c) case '\r': return sink("\\r", 2, writer); case '\t': - return sink("\\t", 2, writer); + return (writer->flags & SERD_WRITE_ESCAPES) ? sink("\\t", 2, writer) + : sink("\t", 1, writer); case '"': return sink("\\\"", 2, writer); default: break; } - if (writer->syntax == SERD_TURTLE) { + if (!(writer->flags & SERD_WRITE_ESCAPES)) { + // These are written with UCHAR in pre-NTriples test cases format switch (c) { case '\b': return sink("\\b", 2, writer); @@ -539,63 +610,84 @@ write_short_string_escape(SerdWriter* const writer, const char c) return 0; } -static bool -text_must_escape(const char c) +SERD_NODISCARD static bool +text_must_escape(const uint8_t c) { return c == '\\' || c == '"' || !in_range(c, 0x20, 0x7E); } SERD_NODISCARD static SerdStatus -write_text(SerdWriter* writer, - TextContext ctx, - const char* utf8, - size_t n_bytes) +write_short_text(SerdWriter* writer, const char* utf8, size_t n_bytes) { - size_t n_consecutive_quotes = 0; - SerdStatus st = SERD_SUCCESS; - for (size_t i = 0; !st && i < n_bytes;) { - if (utf8[i] != '"') { - n_consecutive_quotes = 0; + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + for (size_t i = 0; !result.status && i < n_bytes;) { + // Write leading chunk as a single fast bulk write + const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape); + result.status = esink(&utf8[i], j - i, writer); + if ((i = j) == n_bytes) { + break; // Reached end + } + + // Try to write character as a special short escape (newline and friends) + const char in = utf8[i]; + const size_t escape_len = write_short_string_escape(writer, in); + + if (!escape_len) { + // No special escape for this character, write full Unicode escape + result = write_text_character(writer, (const uint8_t*)utf8 + i); + i += result.read_count; + + if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) { + // Corrupt input, write replacement character and scan to the next start + result.status = + esink(replacement_char, sizeof(replacement_char), writer); + i += next_text_index(utf8, i, n_bytes, is_utf8_leading); + } + } else { + ++i; } + } + + return result.status; +} - // Scan for the longest chunk of characters that can be written directly - size_t j = i; - for (; j < n_bytes && !text_must_escape(utf8[j]); ++j) { +SERD_NODISCARD static SerdStatus +write_long_text(SerdWriter* writer, const char* utf8, size_t n_bytes) +{ + size_t n_quotes = 0; + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + for (size_t i = 0; !result.status && i < n_bytes;) { + if (utf8[i] != '"') { + n_quotes = 0; } - // Write chunk as a single fast bulk write - st = esink(&utf8[i], j - i, writer); + // Write leading chunk as a single fast bulk write + const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape); + result.status = esink(&utf8[i], j - i, writer); if ((i = j) == n_bytes) { break; // Reached end } - // Try to write character as a special short escape (newline and friends) - const char in = utf8[i++]; - size_t escape_len = 0; - if (ctx == WRITE_LONG_STRING) { - n_consecutive_quotes = (in == '\"') ? (n_consecutive_quotes + 1) : 0; - escape_len = write_long_string_escape( - writer, n_consecutive_quotes, i == n_bytes, in); - } else { - escape_len = write_short_string_escape(writer, in); - } + // Try to write character as a special long escape (newline and friends) + const char in = utf8[i]; + n_quotes = (in == '\"') ? (n_quotes + 1U) : 0; + const size_t escape_len = + write_long_string_escape(writer, n_quotes, i + 1U == n_bytes, in); - if (escape_len == 0) { + if (!escape_len) { // No special escape for this character, write full Unicode escape - uint8_t size = 0; - write_character(writer, (const uint8_t*)utf8 + i - 1, &size, &st); - if (st && !(writer->flags & SERD_WRITE_LAX)) { - return st; - } - - if (size == 0) { - // Corrupt input, write replacement character and scan to the next start - st = esink(replacement_char, sizeof(replacement_char), writer); - for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) { - } - } else { - i += size - 1U; + result = write_UCHAR(writer, (const uint8_t*)utf8 + i); + i += result.read_count; + + if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) { + // Corrupt input, write replacement character and scan to the next + // start + result.status = + esink(replacement_char, sizeof(replacement_char), writer); + i += next_text_index(utf8, i, n_bytes, is_utf8_leading); } + } else { + ++i; } } @@ -615,8 +707,10 @@ uri_sink(const void* buf, size_t size, size_t nmemb, void* stream) UriSinkContext* const context = (UriSinkContext*)stream; SerdWriter* const writer = context->writer; + const VariableResult r = write_uri(writer, (const char*)buf, nmemb); - return write_uri(writer, (const char*)buf, nmemb, &context->status); + context->status = r.status; + return r.write_count; } SERD_NODISCARD static SerdStatus @@ -774,11 +868,11 @@ write_literal(SerdWriter* const writer, if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) { TRY(st, esink("\"\"\"", 3, writer)); - TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length)); + TRY(st, write_long_text(writer, node_str, node->length)); TRY(st, esink("\"\"\"", 3, writer)); } else { TRY(st, esink("\"", 1, writer)); - TRY(st, write_text(writer, WRITE_STRING, node_str, node->length)); + TRY(st, write_short_text(writer, node_str, node->length)); TRY(st, esink("\"", 1, writer)); } if (lang && serd_node_string(lang)) { @@ -1390,11 +1484,14 @@ serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri) if (uri && (writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG)) { TRY(st, terminate_context(writer)); - TRY(st, esink("@base <", 7, writer)); - TRY(st, esink(uri_string.data, uri_string.length, writer)); - TRY(st, esink(">", 1, writer)); - writer->last_sep = SEP_NODE; - TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT)); + + if (!(writer->flags & SERD_WRITE_CONTEXTUAL)) { + TRY(st, esink("@base <", 7, writer)); + TRY(st, esink(uri_string.data, uri_string.length, writer)); + TRY(st, esink(">", 1, writer)); + writer->last_sep = SEP_NODE; + TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT)); + } } return reset_context(writer, RESET_GRAPH | RESET_INDENT); |