aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2023-02-05 18:39:49 -0500
committerDavid Robillard <d@drobilla.net>2023-12-02 18:49:08 -0500
commit343124df71010055c2c1e6cdcadd13d23b2c013a (patch)
tree7c2de6a72021adaac89e9c4fa97e7cc5503e0657 /src
parent530edb265fbbed20e6d3a6fd7a36461ff83d9b46 (diff)
downloadserd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.gz
serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.bz2
serd-343124df71010055c2c1e6cdcadd13d23b2c013a.zip
[WIP] Add support for URI hex escape decoding
Diffstat (limited to 'src')
-rw-r--r--src/read_ntriples.c66
-rw-r--r--src/writer.c351
2 files changed, 290 insertions, 127 deletions
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
index dd5c28fc..5c02abfe 100644
--- a/src/read_ntriples.c
+++ b/src/read_ntriples.c
@@ -108,6 +108,67 @@ read_IRI_scheme(SerdReader* const reader, SerdNode* const dest)
return st ? st : SERD_BAD_SYNTAX;
}
+static SerdStatus
+read_hex_byte(SerdReader* const reader, uint8_t digits[const 2])
+{
+ for (unsigned i = 0U; i < 2U; ++i) {
+ if (!(digits[i] = read_HEX(reader))) {
+ return SERD_BAD_SYNTAX;
+ }
+ }
+
+ return SERD_SUCCESS;
+}
+
+static uint8_t
+hex_byte_value(const uint8_t c0, const uint8_t c1)
+{
+ return (uint8_t)((hex_digit_value(c0) << 4U) | hex_digit_value(c1));
+}
+
+/// RFC3986 S2.1: pct-encoded = "%" HEXDIG HEXDIG
+static SerdStatus
+read_pct_encoded(SerdReader* const reader, SerdNode* const node)
+{
+ SerdStatus st = SERD_SUCCESS;
+ uint8_t hex[9] = {0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+
+ // Read first percent-encoded byte
+ TRY(st, read_hex_byte(reader, hex));
+
+ // Parse the leading byte and get the encoded size from it
+ uint8_t byte = hex_byte_value(hex[0], hex[1]);
+ const uint32_t size = utf8_num_bytes(byte);
+ if (!size) {
+ return SERD_BAD_TEXT;
+ }
+
+ // Avoid decoding '%' itself
+ if (byte == '%') {
+ return push_bytes(reader, node, (const uint8_t*)"%25", 3);
+ }
+
+ // Push the leading byte to the node
+ TRY(st, push_byte(reader, node, byte));
+
+ // Read remaining hex-encoded bytes
+ for (unsigned i = 1; i < size; ++i) {
+ const unsigned offset = 2U * i;
+ uint8_t* const digits = hex + offset;
+ TRY(st, eat_byte_check(reader, '%'));
+ TRY(st, read_hex_byte(reader, digits));
+
+ byte = hex_byte_value(digits[0], digits[1]);
+ if (!is_utf8_continuation(byte)) {
+ return SERD_BAD_TEXT;
+ }
+
+ TRY(st, push_byte(reader, node, byte));
+ }
+
+ return st;
+}
+
SerdStatus
read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
{
@@ -131,6 +192,11 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
case '>':
return SERD_SUCCESS;
+ case '%':
+ st = (reader->flags & SERD_READ_DECODED) ? read_pct_encoded(reader, node)
+ : push_byte(reader, node, c);
+ break;
+
case '\\':
if (!(st = read_UCHAR(reader, node, &code)) &&
(code == ' ' || code == '<' || code == '>')) {
diff --git a/src/writer.c b/src/writer.c
index fd52a123..ccb24e5f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -53,6 +53,13 @@ typedef struct {
bool comma_indented;
} WriteContext;
+/// A status for an operation that reads/writes variable numbers of bytes
+typedef struct {
+ SerdStatus status;
+ size_t read_count;
+ size_t write_count;
+} VariableResult;
+
static const WriteContext WRITE_CONTEXT_NULL =
{CTX_NAMED, 0U, NULL, NULL, NULL, 0U, 0U};
@@ -280,47 +287,127 @@ esink(const void* buf, size_t len, SerdWriter* writer)
return sink(buf, len, writer) == len ? SERD_SUCCESS : SERD_BAD_WRITE;
}
-// Write a single character as a Unicode escape
-// (Caller prints any single byte characters that don't need escaping)
-static size_t
-write_character(SerdWriter* const writer,
- const uint8_t* const utf8,
- uint8_t* const size,
- SerdStatus* const st)
+static VariableResult
+write_UCHAR(SerdWriter* const writer, const uint8_t* const utf8)
{
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- const uint32_t c = parse_utf8_char(utf8, size);
- switch (*size) {
- case 0:
- *st = w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]);
- return 0;
- case 1:
- snprintf(escape, sizeof(escape), "\\u%04X", utf8[0]);
- return sink(escape, 6, writer);
- default:
- break;
+ uint8_t c_size = 0U;
+ const uint32_t c = parse_utf8_char(utf8, &c_size);
+
+ result.read_count = c_size;
+ if (result.read_count == 0U) {
+ result.status =
+ w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]);
+ } else if (c <= 0xFFFF) {
+ // Write short (4 digit) escape
+ snprintf(escape, sizeof(escape), "\\u%04X", c);
+ result.write_count = sink(escape, 6, writer);
+ } else {
+ // Write long (6 digit) escape
+ snprintf(escape, sizeof(escape), "\\U%08X", c);
+ result.write_count = sink(escape, 10, writer);
}
- if (!(writer->flags & SERD_WRITE_ASCII)) {
- // Write UTF-8 character directly to UTF-8 output
- return sink(utf8, *size, writer);
+ return result;
+}
+
+SERD_NODISCARD static VariableResult
+write_percent_encoded_bytes(SerdWriter* const writer,
+ const size_t size,
+ const uint8_t* const data)
+{
+ static const char hex_chars[] = "0123456789ABCDEF";
+
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
+ char escape[4] = {'%', 0, 0, 0};
+
+ for (size_t i = 0U; !result.status && i < size; ++i) {
+ const uint8_t byte = data[i];
+ escape[1] = hex_chars[byte >> 4U];
+ escape[2] = hex_chars[byte & 0x0FU];
+
+ const size_t n_written = sink(escape, 3U, writer);
+ result.write_count += n_written;
+ if (n_written != 3U) {
+ result.status = SERD_BAD_WRITE;
+ }
+
+ ++result.read_count;
}
- if (c <= 0xFFFF) {
- snprintf(escape, sizeof(escape), "\\u%04X", c);
- return sink(escape, 6, writer);
+ return result;
+}
+
+static VariableResult
+write_text_character(SerdWriter* const writer, const uint8_t* const utf8)
+{
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
+ const uint8_t c = utf8[0];
+
+ if ((writer->flags & (SERD_WRITE_ASCII | SERD_WRITE_ESCAPES)) || c < 0x20U ||
+ c == 0x7FU) {
+ // Write ASCII-compatible UCHAR escape like "\u1234"
+ return write_UCHAR(writer, utf8);
+ }
+
+ // Parse the leading byte to get the UTF-8 encoding size
+ if (!(result.read_count = utf8_num_bytes(c))) {
+ result.status = SERD_BAD_TEXT;
+ return result;
+ }
+
+ // Write the UTF-8 encoding directly to the output
+ result.write_count = sink(utf8, result.read_count, writer);
+ if (result.write_count != result.read_count) {
+ result.status = SERD_BAD_WRITE;
+ }
+
+ return result;
+}
+
+static VariableResult
+write_uri_character(SerdWriter* const writer, const uint8_t* const utf8)
+{
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
+ const uint8_t c = utf8[0];
+
+ if ((writer->flags & SERD_WRITE_ESCAPES)) {
+ return write_UCHAR(writer, utf8);
+ }
+
+ if (c == '%') {
+ // Avoid encoding '%' itself
+ result.read_count = 1;
+ result.write_count = sink("%25", 3, writer);
+ return result;
+ }
+
+ if ((c & 0x80U) && !(writer->flags & SERD_WRITE_ASCII)) {
+ // Parse the leading byte to get the UTF-8 encoding size
+ if (!(result.read_count = utf8_num_bytes(c))) {
+ result.status = SERD_BAD_TEXT;
+ } else {
+ // Write the UTF-8 encoding directly to the output
+ result.write_count = sink(utf8, result.read_count, writer);
+ if (result.write_count != result.read_count) {
+ result.status = SERD_BAD_WRITE;
+ }
+ }
+
+ return result;
}
- snprintf(escape, sizeof(escape), "\\U%08X", c);
- return sink(escape, 10, writer);
+ return write_percent_encoded_bytes(writer, 1U, utf8);
}
static bool
-uri_must_escape(const int c)
+uri_must_escape(const uint8_t c)
{
switch (c) {
case ' ':
case '"':
+ // case '%':
case '<':
case '>':
case '\\':
@@ -336,58 +423,60 @@ uri_must_escape(const int c)
}
static size_t
-write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
+next_text_index(const char* utf8,
+ const size_t begin,
+ const size_t end,
+ bool (*const predicate)(uint8_t))
{
- size_t len = 0;
- for (size_t i = 0; i < n_bytes;) {
- size_t j = i; // Index of next character that must be escaped
- for (; j < n_bytes; ++j) {
- if (uri_must_escape(utf8[j])) {
- break;
- }
- }
-
- // Bulk write all characters up to this special one
- const size_t n_bulk = sink(&utf8[i], j - i, writer);
- len += n_bulk;
- if (n_bulk != j - i) {
- *st = SERD_BAD_WRITE;
- return len;
- }
+ size_t i = begin;
+ while (i < end && !predicate((uint8_t)utf8[i])) {
+ ++i;
+ }
+ return i;
+}
+static VariableResult
+write_uri(SerdWriter* writer, const char* utf8, const size_t n_bytes)
+{
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
+ for (size_t i = 0; i < n_bytes;) {
+ // Write leading chunk as a single fast bulk write
+ const size_t j = next_text_index(utf8, i, n_bytes, uri_must_escape);
+ result.status = esink(&utf8[i], j - i, writer);
if ((i = j) == n_bytes) {
break; // Reached end
}
- // Write UTF-8 character
- uint8_t size = 0;
- len += write_character(writer, (const uint8_t*)utf8 + i, &size, st);
- i += size;
- if (*st && !(writer->flags & SERD_WRITE_LAX)) {
+ // Write character (escape or UTF-8)
+ const VariableResult r =
+ write_uri_character(writer, (const uint8_t*)utf8 + i);
+ i += r.read_count;
+ result.write_count += r.write_count;
+ if (r.status && !(writer->flags & SERD_WRITE_LAX)) {
+ result.status = r.status;
break;
}
- if (size == 0) {
+ if (r.read_count == 0) {
// Corrupt input, write percent-encoded bytes and scan to next start
char escape[4] = {0, 0, 0, 0};
for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]);
- len += sink(escape, 3, writer);
+ result.write_count += sink(escape, 3, writer);
}
}
}
- return len;
+ return result;
}
SERD_NODISCARD static SerdStatus
ewrite_uri(SerdWriter* writer, const char* utf8, size_t n_bytes)
{
- SerdStatus st = SERD_SUCCESS;
- write_uri(writer, utf8, n_bytes, &st);
+ const VariableResult r = write_uri(writer, utf8, n_bytes);
- return (st == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX))
- ? st
+ return (r.status == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX))
+ ? r.status
: SERD_SUCCESS;
}
@@ -398,27 +487,6 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
}
SERD_NODISCARD static SerdStatus
-write_utf8_percent_escape(SerdWriter* const writer,
- const char* const utf8,
- const size_t n_bytes)
-{
- static const char hex_chars[] = "0123456789ABCDEF";
-
- SerdStatus st = SERD_SUCCESS;
- char escape[4] = {'%', 0, 0, 0};
-
- for (size_t i = 0U; i < n_bytes; ++i) {
- const uint8_t byte = (uint8_t)utf8[i];
- escape[1] = hex_chars[byte >> 4U];
- escape[2] = hex_chars[byte & 0x0FU];
-
- TRY(st, esink(escape, 3, writer));
- }
-
- return st;
-}
-
-SERD_NODISCARD static SerdStatus
write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
{
const char buf[2] = {'\\', c};
@@ -431,7 +499,8 @@ write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
{
return is_PN_LOCAL_ESC(utf8[0])
? write_PN_LOCAL_ESC(writer, utf8[0])
- : write_utf8_percent_escape(writer, utf8, n_bytes);
+ : write_percent_encoded_bytes(writer, n_bytes, (const uint8_t*)utf8)
+ .status;
}
SERD_NODISCARD static SerdStatus
@@ -518,14 +587,16 @@ write_short_string_escape(SerdWriter* const writer, const char c)
case '\r':
return sink("\\r", 2, writer);
case '\t':
- return sink("\\t", 2, writer);
+ return (writer->flags & SERD_WRITE_ESCAPES) ? sink("\\t", 2, writer)
+ : sink("\t", 1, writer);
case '"':
return sink("\\\"", 2, writer);
default:
break;
}
- if (writer->syntax == SERD_TURTLE) {
+ if (!(writer->flags & SERD_WRITE_ESCAPES)) {
+ // These are written with UCHAR in pre-NTriples test cases format
switch (c) {
case '\b':
return sink("\\b", 2, writer);
@@ -539,63 +610,84 @@ write_short_string_escape(SerdWriter* const writer, const char c)
return 0;
}
-static bool
-text_must_escape(const char c)
+SERD_NODISCARD static bool
+text_must_escape(const uint8_t c)
{
return c == '\\' || c == '"' || !in_range(c, 0x20, 0x7E);
}
SERD_NODISCARD static SerdStatus
-write_text(SerdWriter* writer,
- TextContext ctx,
- const char* utf8,
- size_t n_bytes)
+write_short_text(SerdWriter* writer, const char* utf8, size_t n_bytes)
{
- size_t n_consecutive_quotes = 0;
- SerdStatus st = SERD_SUCCESS;
- for (size_t i = 0; !st && i < n_bytes;) {
- if (utf8[i] != '"') {
- n_consecutive_quotes = 0;
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
+ for (size_t i = 0; !result.status && i < n_bytes;) {
+ // Write leading chunk as a single fast bulk write
+ const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape);
+ result.status = esink(&utf8[i], j - i, writer);
+ if ((i = j) == n_bytes) {
+ break; // Reached end
+ }
+
+ // Try to write character as a special short escape (newline and friends)
+ const char in = utf8[i];
+ const size_t escape_len = write_short_string_escape(writer, in);
+
+ if (!escape_len) {
+ // No special escape for this character, write full Unicode escape
+ result = write_text_character(writer, (const uint8_t*)utf8 + i);
+ i += result.read_count;
+
+ if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) {
+ // Corrupt input, write replacement character and scan to the next start
+ result.status =
+ esink(replacement_char, sizeof(replacement_char), writer);
+ i += next_text_index(utf8, i, n_bytes, is_utf8_leading);
+ }
+ } else {
+ ++i;
}
+ }
+
+ return result.status;
+}
- // Scan for the longest chunk of characters that can be written directly
- size_t j = i;
- for (; j < n_bytes && !text_must_escape(utf8[j]); ++j) {
+SERD_NODISCARD static SerdStatus
+write_long_text(SerdWriter* writer, const char* utf8, size_t n_bytes)
+{
+ size_t n_quotes = 0;
+ VariableResult result = {SERD_SUCCESS, 0U, 0U};
+ for (size_t i = 0; !result.status && i < n_bytes;) {
+ if (utf8[i] != '"') {
+ n_quotes = 0;
}
- // Write chunk as a single fast bulk write
- st = esink(&utf8[i], j - i, writer);
+ // Write leading chunk as a single fast bulk write
+ const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape);
+ result.status = esink(&utf8[i], j - i, writer);
if ((i = j) == n_bytes) {
break; // Reached end
}
- // Try to write character as a special short escape (newline and friends)
- const char in = utf8[i++];
- size_t escape_len = 0;
- if (ctx == WRITE_LONG_STRING) {
- n_consecutive_quotes = (in == '\"') ? (n_consecutive_quotes + 1) : 0;
- escape_len = write_long_string_escape(
- writer, n_consecutive_quotes, i == n_bytes, in);
- } else {
- escape_len = write_short_string_escape(writer, in);
- }
+ // Try to write character as a special long escape (newline and friends)
+ const char in = utf8[i];
+ n_quotes = (in == '\"') ? (n_quotes + 1U) : 0;
+ const size_t escape_len =
+ write_long_string_escape(writer, n_quotes, i + 1U == n_bytes, in);
- if (escape_len == 0) {
+ if (!escape_len) {
// No special escape for this character, write full Unicode escape
- uint8_t size = 0;
- write_character(writer, (const uint8_t*)utf8 + i - 1, &size, &st);
- if (st && !(writer->flags & SERD_WRITE_LAX)) {
- return st;
- }
-
- if (size == 0) {
- // Corrupt input, write replacement character and scan to the next start
- st = esink(replacement_char, sizeof(replacement_char), writer);
- for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
- }
- } else {
- i += size - 1U;
+ result = write_UCHAR(writer, (const uint8_t*)utf8 + i);
+ i += result.read_count;
+
+ if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) {
+ // Corrupt input, write replacement character and scan to the next
+ // start
+ result.status =
+ esink(replacement_char, sizeof(replacement_char), writer);
+ i += next_text_index(utf8, i, n_bytes, is_utf8_leading);
}
+ } else {
+ ++i;
}
}
@@ -615,8 +707,10 @@ uri_sink(const void* buf, size_t size, size_t nmemb, void* stream)
UriSinkContext* const context = (UriSinkContext*)stream;
SerdWriter* const writer = context->writer;
+ const VariableResult r = write_uri(writer, (const char*)buf, nmemb);
- return write_uri(writer, (const char*)buf, nmemb, &context->status);
+ context->status = r.status;
+ return r.write_count;
}
SERD_NODISCARD static SerdStatus
@@ -774,11 +868,11 @@ write_literal(SerdWriter* const writer,
if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) {
TRY(st, esink("\"\"\"", 3, writer));
- TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length));
+ TRY(st, write_long_text(writer, node_str, node->length));
TRY(st, esink("\"\"\"", 3, writer));
} else {
TRY(st, esink("\"", 1, writer));
- TRY(st, write_text(writer, WRITE_STRING, node_str, node->length));
+ TRY(st, write_short_text(writer, node_str, node->length));
TRY(st, esink("\"", 1, writer));
}
if (lang && serd_node_string(lang)) {
@@ -1390,11 +1484,14 @@ serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri)
if (uri && (writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG)) {
TRY(st, terminate_context(writer));
- TRY(st, esink("@base <", 7, writer));
- TRY(st, esink(uri_string.data, uri_string.length, writer));
- TRY(st, esink(">", 1, writer));
- writer->last_sep = SEP_NODE;
- TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT));
+
+ if (!(writer->flags & SERD_WRITE_CONTEXTUAL)) {
+ TRY(st, esink("@base <", 7, writer));
+ TRY(st, esink(uri_string.data, uri_string.length, writer));
+ TRY(st, esink(">", 1, writer));
+ writer->last_sep = SEP_NODE;
+ TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT));
+ }
}
return reset_context(writer, RESET_GRAPH | RESET_INDENT);