[WIP] Add support for URI hex escape decoding

author: David Robillard <d@drobilla.net> 2023-02-05 18:39:49 -0500
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:08 -0500
commit: 343124df71010055c2c1e6cdcadd13d23b2c013a (patch)
tree: 7c2de6a72021adaac89e9c4fa97e7cc5503e0657 /src
parent: 530edb265fbbed20e6d3a6fd7a36461ff83d9b46 (diff)
download: serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.gz
serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.bz2
serd-343124df71010055c2c1e6cdcadd13d23b2c013a.zip
2 files changed, 290 insertions, 127 deletions
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
index dd5c28fc..5c02abfe 100644
--- a/src/read_ntriples.c
+++ b/src/read_ntriples.c
@@ -108,6 +108,67 @@ read_IRI_scheme(SerdReader* const reader, SerdNode* const dest)
   return st ? st : SERD_BAD_SYNTAX;
 }
 
+/**
+   Read two hexadecimal digits from the input into `digits`.
+
+   Each digit is read with read_HEX(), and reading stops at the first call
+   that returns zero.
+
+   @param reader Reader to consume input from.
+   @param digits Output for the two values returned by read_HEX().
+   @return #SERD_SUCCESS, or #SERD_BAD_SYNTAX if either read_HEX() call
+   returns zero.
+*/
+static SerdStatus
+read_hex_byte(SerdReader* const reader, uint8_t digits[const 2])
+{
+  for (unsigned i = 0U; i < 2U; ++i) {
+    // A zero result from read_HEX() is treated as a failed read
+    if (!(digits[i] = read_HEX(reader))) {
+      return SERD_BAD_SYNTAX;
+    }
+  }
+
+  return SERD_SUCCESS;
+}
+
+/**
+   Combine two hex digits into a single byte value.
+
+   The value of `c0` (as given by hex_digit_value()) is shifted into the high
+   four bits, and the value of `c1` is OR'd into the result.
+*/
+static uint8_t
+hex_byte_value(const uint8_t c0, const uint8_t c1)
+{
+  return (uint8_t)((hex_digit_value(c0) << 4U) | hex_digit_value(c1));
+}
+
+/**
+   Read the hex digits of a percent-encoded UTF-8 character and push the
+   decoded bytes to `node`.
+
+   RFC3986 S2.1: pct-encoded = "%" HEXDIG HEXDIG
+
+   The first pair of hex digits is read straight away, and its value is
+   treated as a UTF-8 leading byte.  For every further byte of the encoding, a
+   '%' followed by two more hex digits is read, and the value must be a UTF-8
+   continuation byte.  Decoded bytes are pushed to `node` one at a time, and
+   nothing here removes bytes that were already pushed if a later step fails.
+   An encoded '%' is not decoded: the text "%25" is pushed instead.
+
+   NOTE(review): no '%' is consumed before the first pair of digits, so the
+   caller has presumably already eaten it; confirm in read_IRIREF_suffix().
+
+   @return #SERD_BAD_TEXT if utf8_num_bytes() returns zero for the first byte
+   or a later byte is not a continuation byte, otherwise the status of the
+   read and push operations (checked with TRY, which presumably returns early
+   on failure).
+*/
+static SerdStatus
+read_pct_encoded(SerdReader* const reader, SerdNode* const node)
+{
+  SerdStatus st     = SERD_SUCCESS;
+  // Digit pair i is stored at hex[2 * i]
+  // (assumes utf8_num_bytes() never returns more than 4 -- TODO confirm)
+  uint8_t    hex[9] = {0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U};
+
+  // Read first percent-encoded byte
+  TRY(st, read_hex_byte(reader, hex));
+
+  // Parse the leading byte and get the encoded size from it
+  uint8_t        byte = hex_byte_value(hex[0], hex[1]);
+  const uint32_t size = utf8_num_bytes(byte);
+  if (!size) {
+    return SERD_BAD_TEXT;
+  }
+
+  // Avoid decoding '%' itself
+  if (byte == '%') {
+    return push_bytes(reader, node, (const uint8_t*)"%25", 3);
+  }
+
+  // Push the leading byte to the node
+  TRY(st, push_byte(reader, node, byte));
+
+  // Read remaining hex-encoded bytes
+  for (unsigned i = 1; i < size; ++i) {
+    const unsigned offset = 2U * i;
+    uint8_t* const digits = hex + offset;
+    // Each following byte must be its own "%XX" escape
+    TRY(st, eat_byte_check(reader, '%'));
+    TRY(st, read_hex_byte(reader, digits));
+
+    byte = hex_byte_value(digits[0], digits[1]);
+    if (!is_utf8_continuation(byte)) {
+      return SERD_BAD_TEXT;
+    }
+
+    TRY(st, push_byte(reader, node, byte));
+  }
+
+  return st;
+}
+
 SerdStatus
 read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
 {
@@ -131,6 +192,11 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node)
     case '>':
       return SERD_SUCCESS;
 
+    case '%':
+      st = (reader->flags & SERD_READ_DECODED) ? read_pct_encoded(reader, node)
+                                               : push_byte(reader, node, c);
+      break;
+
     case '\\':
       if (!(st = read_UCHAR(reader, node, &code)) &&
           (code == ' ' || code == '<' || code == '>')) {
diff --git a/src/writer.c b/src/writer.c
index fd52a123..ccb24e5f 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -53,6 +53,13 @@ typedef struct {
   bool               comma_indented;
 } WriteContext;
 
+/// A status for an operation that reads/writes variable numbers of bytes
+typedef struct {
+  SerdStatus status;      ///< Status of the operation
+  size_t     read_count;  ///< Number of input bytes consumed
+  size_t     write_count; ///< Number of output bytes written to the sink
+} VariableResult;
+
 static const WriteContext WRITE_CONTEXT_NULL =
   {CTX_NAMED, 0U, NULL, NULL, NULL, 0U, 0U};
 
@@ -280,47 +287,127 @@ esink(const void* buf, size_t len, SerdWriter* writer)
   return sink(buf, len, writer) == len ? SERD_SUCCESS : SERD_BAD_WRITE;
 }
 
-// Write a single character as a Unicode escape
-// (Caller prints any single byte characters that don't need escaping)
-static size_t
-write_character(SerdWriter* const    writer,
-                const uint8_t* const utf8,
-                uint8_t* const       size,
-                SerdStatus* const    st)
+/**
+   Write a single UTF-8 character as a UCHAR escape like "\u1234".
+
+   The character at `utf8` is parsed with parse_utf8_char().  Code points up
+   to 0xFFFF are written as a backslash, 'u', and 4 hex digits, anything
+   larger as a backslash, 'U', and 8 hex digits.
+
+   @param writer Writer to write the escape to.
+   @param utf8 Start of the UTF-8 encoded character.
+   @return A result where `read_count` is the size reported by
+   parse_utf8_char() and `write_count` is the number of bytes accepted by the
+   sink.  The status is #SERD_BAD_TEXT (reported via w_err()) if the size is
+   zero, or #SERD_BAD_WRITE if the escape was not completely written.
+*/
+static VariableResult
+write_UCHAR(SerdWriter* const writer, const uint8_t* const utf8)
 {
+  VariableResult result     = {SERD_SUCCESS, 0U, 0U};
   char           escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const uint32_t c          = parse_utf8_char(utf8, size);
-  switch (*size) {
-  case 0:
-    *st = w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]);
-    return 0;
-  case 1:
-    snprintf(escape, sizeof(escape), "\\u%04X", utf8[0]);
-    return sink(escape, 6, writer);
-  default:
-    break;
+  uint8_t        c_size     = 0U;
+  const uint32_t c          = parse_utf8_char(utf8, &c_size);
+  size_t         escape_len = 0U;
+
+  result.read_count = c_size;
+  if (result.read_count == 0U) {
+    result.status =
+      w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]);
+  } else if (c <= 0xFFFF) {
+    // Format short escape: backslash, 'u', and 4 hex digits
+    snprintf(escape, sizeof(escape), "\\u%04X", (unsigned)c);
+    escape_len = 6U;
+  } else {
+    // Format long escape: backslash, 'U', and 8 hex digits
+    snprintf(escape, sizeof(escape), "\\U%08X", (unsigned)c);
+    escape_len = 10U;
   }
 
-  if (!(writer->flags & SERD_WRITE_ASCII)) {
-    // Write UTF-8 character directly to UTF-8 output
-    return sink(utf8, *size, writer);
+  if (escape_len) {
+    // Report a short write, like the other character writers do
+    result.write_count = sink(escape, escape_len, writer);
+    if (result.write_count != escape_len) {
+      result.status = SERD_BAD_WRITE;
+    }
+  }
+
+  return result;
+}
+
+/**
+   Write bytes as percent-encoded escapes like "%C3".
+
+   Each of the `size` bytes in `data` is written as a '%' followed by two
+   uppercase hex digits.  Writing stops after the first escape that the sink
+   does not accept completely.
+
+   @return A result where `read_count` is the number of bytes attempted
+   (including one whose write failed), `write_count` is the total number of
+   bytes accepted by the sink, and the status is #SERD_BAD_WRITE if an escape
+   was not completely written.
+*/
+SERD_NODISCARD static VariableResult
+write_percent_encoded_bytes(SerdWriter* const    writer,
+                            const size_t         size,
+                            const uint8_t* const data)
+{
+  static const char hex_chars[] = "0123456789ABCDEF";
+
+  VariableResult result    = {SERD_SUCCESS, 0U, 0U};
+  char           escape[4] = {'%', 0, 0, 0};
+
+  for (size_t i = 0U; !result.status && i < size; ++i) {
+    // High nibble first, then low nibble
+    const uint8_t byte = data[i];
+    escape[1]          = hex_chars[byte >> 4U];
+    escape[2]          = hex_chars[byte & 0x0FU];
+
+    const size_t n_written = sink(escape, 3U, writer);
+    result.write_count += n_written;
+    if (n_written != 3U) {
+      result.status = SERD_BAD_WRITE;
+    }
+
+    ++result.read_count;
   }
 
-  if (c <= 0xFFFF) {
-    snprintf(escape, sizeof(escape), "\\u%04X", c);
-    return sink(escape, 6, writer);
+  return result;
+}
+
+/**
+   Write a single character of literal text.
+
+   The character is written as a UCHAR escape via write_UCHAR() if the writer
+   has #SERD_WRITE_ASCII or #SERD_WRITE_ESCAPES set, or if its first byte is
+   below 0x20 or is 0x7F.  Otherwise, its UTF-8 bytes are written directly.
+
+   @return A result with the number of bytes read and written.  For a direct
+   write, the status is #SERD_BAD_TEXT if utf8_num_bytes() returns zero for
+   the first byte (nothing is written), or #SERD_BAD_WRITE if the sink did not
+   accept every byte.
+*/
+static VariableResult
+write_text_character(SerdWriter* const writer, const uint8_t* const utf8)
+{
+  VariableResult result = {SERD_SUCCESS, 0U, 0U};
+  const uint8_t  c      = utf8[0];
+
+  if ((writer->flags & (SERD_WRITE_ASCII | SERD_WRITE_ESCAPES)) || c < 0x20U ||
+      c == 0x7FU) {
+    // Write ASCII-compatible UCHAR escape like "\u1234"
+    return write_UCHAR(writer, utf8);
+  }
+
+  // Parse the leading byte to get the UTF-8 encoding size
+  if (!(result.read_count = utf8_num_bytes(c))) {
+    result.status = SERD_BAD_TEXT;
+    return result;
+  }
+
+  // Write the UTF-8 encoding directly to the output
+  result.write_count = sink(utf8, result.read_count, writer);
+  if (result.write_count != result.read_count) {
+    result.status = SERD_BAD_WRITE;
+  }
+
+  return result;
+}
+
+/**
+   Write a single character of a URI that can not be written as-is.
+
+   In order of precedence: with #SERD_WRITE_ESCAPES set, the character is
+   written as a UCHAR escape via write_UCHAR(); a '%' is written as "%25"; a
+   byte with the high bit set is written directly as UTF-8 unless
+   #SERD_WRITE_ASCII is set; anything else is written as a single
+   percent-encoded byte.
+
+   @return A result with the number of bytes read and written.  For a direct
+   UTF-8 write, the status is #SERD_BAD_TEXT if utf8_num_bytes() returns zero
+   for the first byte, or #SERD_BAD_WRITE if the sink did not accept every
+   byte.
+*/
+static VariableResult
+write_uri_character(SerdWriter* const writer, const uint8_t* const utf8)
+{
+  VariableResult result = {SERD_SUCCESS, 0U, 0U};
+  const uint8_t  c      = utf8[0];
+
+  if ((writer->flags & SERD_WRITE_ESCAPES)) {
+    return write_UCHAR(writer, utf8);
+  }
+
+  if (c == '%') {
+    // Avoid encoding '%' itself
+    // (the status is left as success here, whatever the sink returns)
+    result.read_count  = 1;
+    result.write_count = sink("%25", 3, writer);
+    return result;
+  }
+
+  if ((c & 0x80U) && !(writer->flags & SERD_WRITE_ASCII)) {
+    // Parse the leading byte to get the UTF-8 encoding size
+    if (!(result.read_count = utf8_num_bytes(c))) {
+      result.status = SERD_BAD_TEXT;
+    } else {
+      // Write the UTF-8 encoding directly to the output
+      result.write_count = sink(utf8, result.read_count, writer);
+      if (result.write_count != result.read_count) {
+        result.status = SERD_BAD_WRITE;
+      }
+    }
+
+    return result;
   }
 
-  snprintf(escape, sizeof(escape), "\\U%08X", c);
-  return sink(escape, 10, writer);
+  // Only the first byte is encoded here, so one byte is consumed per call
+  return write_percent_encoded_bytes(writer, 1U, utf8);
 }
 
 static bool
-uri_must_escape(const int c)
+uri_must_escape(const uint8_t c)
 {
   switch (c) {
   case ' ':
   case '"':
+    //  case '%':
   case '<':
   case '>':
   case '\\':
@@ -336,58 +423,60 @@ uri_must_escape(const int c)
 }
 
+/**
+   Return the index of the next byte that matches a predicate.
+
+   Bytes of `utf8` are tested in order starting at `begin`, each cast to
+   uint8_t before being passed to `predicate`.
+
+   @return The index of the first byte in [begin, end) for which `predicate`
+   returns true, or `end` if there is no such byte.  This is an absolute index
+   into `utf8`, not an offset from `begin`.
+*/
 static size_t
-write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st)
+next_text_index(const char*  utf8,
+                const size_t begin,
+                const size_t end,
+                bool (*const predicate)(uint8_t))
 {
-  size_t len = 0;
-  for (size_t i = 0; i < n_bytes;) {
-    size_t j = i; // Index of next character that must be escaped
-    for (; j < n_bytes; ++j) {
-      if (uri_must_escape(utf8[j])) {
-        break;
-      }
-    }
-
-    // Bulk write all characters up to this special one
-    const size_t n_bulk = sink(&utf8[i], j - i, writer);
-    len += n_bulk;
-    if (n_bulk != j - i) {
-      *st = SERD_BAD_WRITE;
-      return len;
-    }
+  size_t i = begin;
+  while (i < end && !predicate((uint8_t)utf8[i])) {
+    ++i;
+  }
+  return i;
+}
 
+/**
+   Write a URI string, escaping characters where necessary.
+
+   Runs of bytes that uri_must_escape() accepts are written to the sink as
+   they are, and each remaining character is written with
+   write_uri_character().  If that reads nothing (corrupt input), the
+   following bytes are written as percent-encoded escapes up to the next byte
+   that is_utf8_leading() accepts.
+
+   @param writer Writer to write to.
+   @param utf8 URI string to write.
+   @param n_bytes Number of bytes in `utf8` to write.
+   @return A result where `write_count` is the total number of bytes accepted
+   by the sink, including bytes written as-is.  The status is #SERD_BAD_WRITE
+   if a run of bytes was not completely written, or the status of a failed
+   write_uri_character() call unless #SERD_WRITE_LAX is set.  Writing stops at
+   the first such error.
+*/
+static VariableResult
+write_uri(SerdWriter* writer, const char* utf8, const size_t n_bytes)
+{
+  VariableResult result = {SERD_SUCCESS, 0U, 0U};
+  for (size_t i = 0; i < n_bytes;) {
+    // Write leading chunk as a single fast bulk write
+    const size_t j      = next_text_index(utf8, i, n_bytes, uri_must_escape);
+    const size_t n_bulk = sink(&utf8[i], j - i, writer);
+
+    // Count these bytes too, and stop at the first failed write so that the
+    // error can not be overwritten by a later successful one
+    result.write_count += n_bulk;
+    if (n_bulk != j - i) {
+      result.status = SERD_BAD_WRITE;
+      break;
+    }
+
     if ((i = j) == n_bytes) {
       break; // Reached end
     }
 
-    // Write UTF-8 character
-    uint8_t size = 0;
-    len += write_character(writer, (const uint8_t*)utf8 + i, &size, st);
-    i += size;
-    if (*st && !(writer->flags & SERD_WRITE_LAX)) {
+    // Write character (escape or UTF-8)
+    const VariableResult r =
+      write_uri_character(writer, (const uint8_t*)utf8 + i);
+    i += r.read_count;
+    result.write_count += r.write_count;
+    if (r.status && !(writer->flags & SERD_WRITE_LAX)) {
+      result.status = r.status;
       break;
     }
 
-    if (size == 0) {
+    if (r.read_count == 0) {
       // Corrupt input, write percent-encoded bytes and scan to next start
       char escape[4] = {0, 0, 0, 0};
       for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
         snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]);
-        len += sink(escape, 3, writer);
+        result.write_count += sink(escape, 3, writer);
       }
     }
   }
 
-  return len;
+  return result;
 }
 
+/**
+   Write a URI string with write_uri() and return only a status.
+
+   #SERD_BAD_WRITE is always returned.  Any other status from write_uri() is
+   returned only if #SERD_WRITE_LAX is not set, otherwise it is replaced with
+   #SERD_SUCCESS.
+*/
 SERD_NODISCARD static SerdStatus
 ewrite_uri(SerdWriter* writer, const char* utf8, size_t n_bytes)
 {
-  SerdStatus st = SERD_SUCCESS;
-  write_uri(writer, utf8, n_bytes, &st);
+  const VariableResult r = write_uri(writer, utf8, n_bytes);
 
-  return (st == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX))
-           ? st
+  return (r.status == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX))
+           ? r.status
            : SERD_SUCCESS;
 }
 
@@ -398,27 +487,6 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
 }
 
 SERD_NODISCARD static SerdStatus
-write_utf8_percent_escape(SerdWriter* const writer,
-                          const char* const utf8,
-                          const size_t      n_bytes)
-{
-  static const char hex_chars[] = "0123456789ABCDEF";
-
-  SerdStatus st        = SERD_SUCCESS;
-  char       escape[4] = {'%', 0, 0, 0};
-
-  for (size_t i = 0U; i < n_bytes; ++i) {
-    const uint8_t byte = (uint8_t)utf8[i];
-    escape[1]          = hex_chars[byte >> 4U];
-    escape[2]          = hex_chars[byte & 0x0FU];
-
-    TRY(st, esink(escape, 3, writer));
-  }
-
-  return st;
-}
-
-SERD_NODISCARD static SerdStatus
 write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
 {
   const char buf[2] = {'\\', c};
@@ -431,7 +499,8 @@ write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
 {
   return is_PN_LOCAL_ESC(utf8[0])
            ? write_PN_LOCAL_ESC(writer, utf8[0])
-           : write_utf8_percent_escape(writer, utf8, n_bytes);
+           : write_percent_encoded_bytes(writer, n_bytes, (const uint8_t*)utf8)
+               .status;
 }
 
 SERD_NODISCARD static SerdStatus
@@ -518,14 +587,16 @@ write_short_string_escape(SerdWriter* const writer, const char c)
   case '\r':
     return sink("\\r", 2, writer);
   case '\t':
-    return sink("\\t", 2, writer);
+    return (writer->flags & SERD_WRITE_ESCAPES) ? sink("\\t", 2, writer)
+                                                : sink("\t", 1, writer);
   case '"':
     return sink("\\\"", 2, writer);
   default:
     break;
   }
 
-  if (writer->syntax == SERD_TURTLE) {
+  if (!(writer->flags & SERD_WRITE_ESCAPES)) {
+    // These are written with UCHAR in pre-NTriples test cases format
     switch (c) {
     case '\b':
       return sink("\\b", 2, writer);
@@ -539,63 +610,84 @@ write_short_string_escape(SerdWriter* const writer, const char c)
   return 0;
 }
 
-static bool
-text_must_escape(const char c)
+/// Return true if `c` is a backslash, a double quote, or not in_range() of
+/// 0x20 to 0x7E, so it can not be written as part of a bulk chunk of text
+SERD_NODISCARD static bool
+text_must_escape(const uint8_t c)
 {
   return c == '\\' || c == '"' || !in_range(c, 0x20, 0x7E);
 }
 
 SERD_NODISCARD static SerdStatus
-write_text(SerdWriter* writer,
-           TextContext ctx,
-           const char* utf8,
-           size_t      n_bytes)
+write_short_text(SerdWriter* writer, const char* utf8, size_t n_bytes)
 {
-  size_t     n_consecutive_quotes = 0;
-  SerdStatus st                   = SERD_SUCCESS;
-  for (size_t i = 0; !st && i < n_bytes;) {
-    if (utf8[i] != '"') {
-      n_consecutive_quotes = 0;
+  VariableResult result = {SERD_SUCCESS, 0U, 0U};
+  for (size_t i = 0; !result.status && i < n_bytes;) {
+    // Write leading chunk as a single fast bulk write
+    const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape);
+    result.status  = esink(&utf8[i], j - i, writer);
+    if ((i = j) == n_bytes) {
+      break; // Reached end
+    }
+
+    // Try to write character as a special short escape (newline and friends)
+    const char   in         = utf8[i];
+    const size_t escape_len = write_short_string_escape(writer, in);
+
+    if (!escape_len) {
+      // No special escape for this character, write full Unicode escape
+      result = write_text_character(writer, (const uint8_t*)utf8 + i);
+      i += result.read_count;
+
+      if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) {
+        // Corrupt input, write replacement character and scan to the next start
+        result.status =
+          esink(replacement_char, sizeof(replacement_char), writer);
+        i += next_text_index(utf8, i, n_bytes, is_utf8_leading);
+      }
+    } else {
+      ++i;
     }
+  }
+
+  return result.status;
+}
 
-    // Scan for the longest chunk of characters that can be written directly
-    size_t j = i;
-    for (; j < n_bytes && !text_must_escape(utf8[j]); ++j) {
+SERD_NODISCARD static SerdStatus
+write_long_text(SerdWriter* writer, const char* utf8, size_t n_bytes)
+{
+  size_t         n_quotes = 0;
+  VariableResult result   = {SERD_SUCCESS, 0U, 0U};
+  for (size_t i = 0; !result.status && i < n_bytes;) {
+    if (utf8[i] != '"') {
+      n_quotes = 0;
     }
 
-    // Write chunk as a single fast bulk write
-    st = esink(&utf8[i], j - i, writer);
+    // Write leading chunk as a single fast bulk write
+    const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape);
+    result.status  = esink(&utf8[i], j - i, writer);
     if ((i = j) == n_bytes) {
       break; // Reached end
     }
 
-    // Try to write character as a special short escape (newline and friends)
-    const char in         = utf8[i++];
-    size_t     escape_len = 0;
-    if (ctx == WRITE_LONG_STRING) {
-      n_consecutive_quotes = (in == '\"') ? (n_consecutive_quotes + 1) : 0;
-      escape_len           = write_long_string_escape(
-        writer, n_consecutive_quotes, i == n_bytes, in);
-    } else {
-      escape_len = write_short_string_escape(writer, in);
-    }
+    // Try to write character as a special long escape (newline and friends)
+    const char in = utf8[i];
+    n_quotes      = (in == '\"') ? (n_quotes + 1U) : 0;
+    const size_t escape_len =
+      write_long_string_escape(writer, n_quotes, i + 1U == n_bytes, in);
 
-    if (escape_len == 0) {
+    if (!escape_len) {
       // No special escape for this character, write full Unicode escape
-      uint8_t size = 0;
-      write_character(writer, (const uint8_t*)utf8 + i - 1, &size, &st);
-      if (st && !(writer->flags & SERD_WRITE_LAX)) {
-        return st;
-      }
-
-      if (size == 0) {
-        // Corrupt input, write replacement character and scan to the next start
-        st = esink(replacement_char, sizeof(replacement_char), writer);
-        for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) {
-        }
-      } else {
-        i += size - 1U;
+      result = write_UCHAR(writer, (const uint8_t*)utf8 + i);
+      i += result.read_count;
+
+      if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) {
+        // Corrupt input, write replacement character and scan to the next
+        // start
+        result.status =
+          esink(replacement_char, sizeof(replacement_char), writer);
+        i += next_text_index(utf8, i, n_bytes, is_utf8_leading);
       }
+    } else {
+      ++i;
     }
   }
 
@@ -615,8 +707,10 @@ uri_sink(const void* buf, size_t size, size_t nmemb, void* stream)
 
   UriSinkContext* const context = (UriSinkContext*)stream;
   SerdWriter* const     writer  = context->writer;
+  const VariableResult  r       = write_uri(writer, (const char*)buf, nmemb);
 
-  return write_uri(writer, (const char*)buf, nmemb, &context->status);
+  context->status = r.status;
+  return r.write_count;
 }
 
 SERD_NODISCARD static SerdStatus
@@ -774,11 +868,11 @@ write_literal(SerdWriter* const        writer,
 
   if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) {
     TRY(st, esink("\"\"\"", 3, writer));
-    TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length));
+    TRY(st, write_long_text(writer, node_str, node->length));
     TRY(st, esink("\"\"\"", 3, writer));
   } else {
     TRY(st, esink("\"", 1, writer));
-    TRY(st, write_text(writer, WRITE_STRING, node_str, node->length));
+    TRY(st, write_short_text(writer, node_str, node->length));
     TRY(st, esink("\"", 1, writer));
   }
   if (lang && serd_node_string(lang)) {
@@ -1390,11 +1484,14 @@ serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri)
 
   if (uri && (writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG)) {
     TRY(st, terminate_context(writer));
-    TRY(st, esink("@base <", 7, writer));
-    TRY(st, esink(uri_string.data, uri_string.length, writer));
-    TRY(st, esink(">", 1, writer));
-    writer->last_sep = SEP_NODE;
-    TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT));
+
+    if (!(writer->flags & SERD_WRITE_CONTEXTUAL)) {
+      TRY(st, esink("@base <", 7, writer));
+      TRY(st, esink(uri_string.data, uri_string.length, writer));
+      TRY(st, esink(">", 1, writer));
+      writer->last_sep = SEP_NODE;
+      TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT));
+    }
   }
 
   return reset_context(writer, RESET_GRAPH | RESET_INDENT);
author	David Robillard <d@drobilla.net>	2023-02-05 18:39:49 -0500
committer	David Robillard <d@drobilla.net>	2023-12-02 18:49:08 -0500
commit	343124df71010055c2c1e6cdcadd13d23b2c013a (patch)
tree	7c2de6a72021adaac89e9c4fa97e7cc5503e0657 /src
parent	530edb265fbbed20e6d3a6fd7a36461ff83d9b46 (diff)
download	serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.gz serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.bz2 serd-343124df71010055c2c1e6cdcadd13d23b2c013a.zip