aboutsummaryrefslogtreecommitdiffstats
path: root/src/read_ntriples.c
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2023-02-05 18:39:49 -0500
committerDavid Robillard <d@drobilla.net>2023-12-02 18:49:08 -0500
commit343124df71010055c2c1e6cdcadd13d23b2c013a (patch)
tree7c2de6a72021adaac89e9c4fa97e7cc5503e0657 /src/read_ntriples.c
parent530edb265fbbed20e6d3a6fd7a36461ff83d9b46 (diff)
downloadserd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.gz
serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.bz2
serd-343124df71010055c2c1e6cdcadd13d23b2c013a.zip
[WIP] Add support for URI hex escape decoding
Diffstat (limited to 'src/read_ntriples.c')
-rw-r--r--src/read_ntriples.c66
1 files changed, 66 insertions, 0 deletions
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
index dd5c28fc..5c02abfe 100644
--- a/src/read_ntriples.c
+++ b/src/read_ntriples.c
@@ -108,6 +108,67 @@ read_IRI_scheme(SerdReader* const reader, SerdNode* const dest)
return st ? st : SERD_BAD_SYNTAX;
}
+/**
+   Read the two hexadecimal digits of a single escaped byte.
+
+   Each digit is stored in `digits` exactly as read_HEX() returned it, so the
+   caller can convert the pair to a byte value afterwards.
+
+   @param reader Reader to consume input from.
+   @param digits Output for the two digit characters.
+   @return #SERD_SUCCESS, or #SERD_BAD_SYNTAX if read_HEX() returns zero for
+   either digit.
+*/
+static SerdStatus
+read_hex_byte(SerdReader* const reader, uint8_t digits[const 2])
+{
+  for (unsigned i = 0U; i < 2U; ++i) {
+    const uint8_t digit = read_HEX(reader);
+
+    // Store first, so the slot holds zero on failure as well
+    digits[i] = digit;
+    if (!digit) {
+      return SERD_BAD_SYNTAX;
+    }
+  }
+
+  return SERD_SUCCESS;
+}
+
+/// Return the byte value of a pair of hex digits, `c0` being the high nibble
+static uint8_t
+hex_byte_value(const uint8_t c0, const uint8_t c1)
+{
+  const uint8_t high = hex_digit_value(c0);
+  const uint8_t low  = hex_digit_value(c1);
+
+  return (uint8_t)((high << 4U) | low);
+}
+
+/**
+   Read a percent-encoded character and push its decoded bytes to a node.
+
+   RFC3986 S2.1: pct-encoded = "%" HEXDIG HEXDIG
+
+   The '%' of the first escape must already have been consumed by the caller;
+   this function starts at its two hex digits.  The first decoded byte is
+   treated as a UTF-8 leading byte, and one further "%XX" escape is read for
+   every continuation byte it announces.
+
+   The escapes "%25" ('%') and "%00" (NUL) are not decoded, but pushed to the
+   node in escaped form.
+
+   @param reader Reader to consume input from.
+   @param node Node that the resulting bytes are appended to.
+   @return #SERD_SUCCESS, #SERD_BAD_TEXT if the decoded bytes do not have the
+   shape of a UTF-8 sequence, or the error from a failed read or push.
+*/
+static SerdStatus
+read_pct_encoded(SerdReader* const reader, SerdNode* const node)
+{
+  SerdStatus st        = SERD_SUCCESS;
+  uint8_t    digits[2] = {0U, 0U};
+
+  // Read first percent-encoded byte
+  TRY(st, read_hex_byte(reader, digits));
+
+  // Parse the leading byte and get the encoded size from it
+  uint8_t        byte = hex_byte_value(digits[0], digits[1]);
+  const uint32_t size = utf8_num_bytes(byte);
+  if (!size) {
+    return SERD_BAD_TEXT;
+  }
+
+  // Avoid decoding '%' itself
+  if (byte == '%') {
+    return push_bytes(reader, node, (const uint8_t*)"%25", 3);
+  }
+
+  // Likewise keep NUL escaped rather than embedding a zero byte in the text.
+  // NOTE(review): assumes node text is consumed as a NUL-terminated string,
+  // which a raw zero byte would cut short -- confirm against the node API.
+  if (!byte) {
+    return push_bytes(reader, node, (const uint8_t*)"%00", 3);
+  }
+
+  // Push the leading byte to the node
+  TRY(st, push_byte(reader, node, byte));
+
+  // Read remaining hex-encoded bytes, each of which needs its own '%'
+  for (uint32_t i = 1U; i < size; ++i) {
+    TRY(st, eat_byte_check(reader, '%'));
+    TRY(st, read_hex_byte(reader, digits));
+
+    byte = hex_byte_value(digits[0], digits[1]);
+    if (!is_utf8_continuation(byte)) {
+      return SERD_BAD_TEXT;
+    }
+
+    TRY(st, push_byte(reader, node, byte));
+  }
+
+  return st;
+}
+
SerdStatus
read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
{
@@ -131,6 +192,11 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
case '>':
return SERD_SUCCESS;
+ case '%':
+ st = (reader->flags & SERD_READ_DECODED) ? read_pct_encoded(reader, node)
+ : push_byte(reader, node, c);
+ break;
+
case '\\':
if (!(st = read_UCHAR(reader, node, &code)) &&
(code == ' ' || code == '<' || code == '>')) {