Support writing all escapes in Turtle and TriG prefixed names

author: David Robillard <d@drobilla.net> 2023-03-24 20:59:54 -0400
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit: 65cbb4a13f615658282677fcf04685bae63e893c (patch)
tree: b9c66e757cf28ce96906d3426300811645753173 /src
parent: c661dbe50d7f634ec5b2863260f41f098fc9c882 (diff)
download: serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.gz
serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.bz2
serd-65cbb4a13f615658282677fcf04685bae63e893c.zip
3 files changed, 74 insertions, 74 deletions
diff --git a/src/.clang-tidy b/src/.clang-tidy
index 638041cc..c2df3e44 100644
--- a/src/.clang-tidy
+++ b/src/.clang-tidy
@@ -9,7 +9,6 @@ Checks: >
   -clang-analyzer-valist.Uninitialized,
   -clang-diagnostic-unused-function,
   -concurrency-mt-unsafe,
-  -google-readability-todo,
   -hicpp-multiway-paths-covered,
   -hicpp-signed-bitwise,
   -llvm-header-guard,
diff --git a/src/turtle.h b/src/turtle.h
index 6e7e3a8d..f794e1e8 100644
--- a/src/turtle.h
+++ b/src/turtle.h
@@ -8,7 +8,6 @@
 #include "string_utils.h"
 
 #include <stdbool.h>
-#include <string.h>
 
 static inline bool
 is_PN_CHARS_U(const int c)
@@ -26,7 +25,10 @@ is_PN_CHARS(const int c)
 static inline bool
 is_PN_LOCAL_ESC(const int c)
 {
-  return strchr("!#$%&\'()*+,-./;=?@_~", c) != NULL;
+  return c == '!' || c == '#' || c == '$' || c == '%' || c == '&' ||
+         c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' ||
+         c == ',' || c == '-' || c == '.' || c == '/' || c == ';' || c == '=' ||
+         c == '?' || c == '@' || c == '_' || c == '~'; // '\\' is not escapable
 }
 
 #endif // SERD_SRC_TURTLE_H
diff --git a/src/writer.c b/src/writer.c
index 7201c976..60c17e11 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -9,6 +9,7 @@
 #include "string_utils.h"
 #include "system.h"
 #include "try.h"
+#include "turtle.h"
 #include "uri_utils.h"
 #include "world.h"
 
@@ -281,10 +282,10 @@ esink(const void* buf, size_t len, SerdWriter* writer)
 // Write a single character as a Unicode escape
 // (Caller prints any single byte characters that don't need escaping)
 static size_t
-write_character(SerdWriter*    writer,
-                const uint8_t* utf8,
-                size_t*        size,
-                SerdStatus*    st)
+write_character(SerdWriter* const    writer,
+                const uint8_t* const utf8,
+                size_t* const        size,
+                SerdStatus* const    st)
 {
   char           escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
   const uint32_t c          = parse_utf8_char(utf8, size);
@@ -395,64 +396,77 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
   return ewrite_uri(writer, serd_node_string(node), serd_node_length(node));
 }
 
-static bool
-lname_must_escape(const char c)
+SERD_NODISCARD static SerdStatus
+write_utf8_percent_escape(SerdWriter* const writer,
+                          const char* const utf8,
+                          const size_t      n_bytes)
 {
-  /* This arbitrary list of characters, most of which have nothing to do with
-     Turtle, must be handled as special cases here because the RDF and SPARQL
-     WGs are apparently intent on making the once elegant Turtle a baroque
-     and inconsistent mess, throwing elegance and extensibility completely
-     out the window for no good reason.
+  static const char hex_chars[] = "0123456789ABCDEF";
 
-     Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped
-     in local names, so they are not escaped here. */
+  SerdStatus st        = SERD_SUCCESS;
+  char       escape[4] = {'%', 0, 0, 0};
 
-  switch (c) {
-  case '\'':
-  case '!':
-  case '#':
-  case '$':
-  case '%':
-  case '&':
-  case '(':
-  case ')':
-  case '*':
-  case '+':
-  case ',':
-  case '/':
-  case ';':
-  case '=':
-  case '?':
-  case '@':
-  case '~':
-    return true;
-  default:
-    break;
+  for (size_t i = 0U; i < n_bytes; ++i) {
+    const uint8_t byte = (uint8_t)utf8[i];
+    escape[1]          = hex_chars[byte >> 4U];
+    escape[2]          = hex_chars[byte & 0x0FU];
+
+    TRY(st, esink(escape, 3, writer));
   }
-  return false;
+
+  return st;
+}
+
+SERD_NODISCARD static SerdStatus
+write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
+{
+  const char buf[2] = {'\\', c};
+
+  return esink(buf, sizeof(buf), writer);
+}
+
+SERD_NODISCARD static SerdStatus
+write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
+{
+  return is_PN_LOCAL_ESC(utf8[0])
+           ? write_PN_LOCAL_ESC(writer, utf8[0])
+           : write_utf8_percent_escape(writer, utf8, n_bytes);
 }
 
 SERD_NODISCARD static SerdStatus
-write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes)
+write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
 {
   SerdStatus st = SERD_SUCCESS;
-  for (size_t i = 0; i < n_bytes; ++i) {
-    size_t j = i; // Index of next character that must be escaped
-    for (; j < n_bytes; ++j) {
-      if (lname_must_escape(utf8[j])) {
-        break;
-      }
-    }
+  if (!n_bytes) {
+    return st;
+  }
 
-    // Bulk write all characters up to this special one
-    TRY(st, esink(&utf8[i], j - i, writer));
-    if ((i = j) == n_bytes) {
-      break; // Reached end
+  /* Thanks to the horribly complicated Turtle grammar for prefixed names,
+     making sure we never write an invalid character is tedious.  We need to
+     handle the first and last characters separately since they have different
+     sets of valid characters. */
+
+  // Write first character
+  size_t    first_size = 0U;
+  const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
+  if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
+    TRY(st, esink(utf8, first_size, writer));
+  } else {
+    TRY(st, write_lname_escape(writer, utf8, first_size));
+  }
+
+  // Write middle and last characters
+  for (size_t i = first_size; i < n_bytes;) {
+    size_t    c_size = 0U;
+    const int c      = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
+
+    if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) {
+      TRY(st, esink(&utf8[i], c_size, writer));
+    } else {
+      TRY(st, write_lname_escape(writer, &utf8[i], c_size));
     }
 
-    // Write escape
-    TRY(st, esink("\\", 1, writer));
-    TRY(st, esink(&utf8[i], 1, writer));
+    i += c_size;
   }
 
   return st;
@@ -780,20 +794,6 @@ write_literal(SerdWriter* const        writer,
   return st;
 }
 
-// Return true iff `buf` is a valid prefixed name prefix or suffix
-static bool
-is_name(const char* buf, const size_t len)
-{
-  // TODO: This is more strict than it should be
-  for (size_t i = 0; i < len; ++i) {
-    if (!(is_alpha(buf[i]) || is_digit(buf[i]))) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 SERD_NODISCARD static SerdStatus
 write_full_uri_node(SerdWriter* const writer, const SerdNode* const node)
 {
@@ -837,12 +837,12 @@ write_uri_node(SerdWriter* const     writer,
                const SerdField       field)
 {
   SerdStatus        st         = SERD_SUCCESS;
-  const SerdNode*   prefix     = NULL;
-  SerdStringView    suffix     = {NULL, 0};
   const char* const node_str   = serd_node_string(node);
   const bool        has_scheme = serd_uri_string_has_scheme(node_str);
 
   if (supports_abbrev(writer)) {
+    const SerdNode* prefix_node = NULL;
+    SerdStringView  suffix      = {NULL, 0};
     if (field == SERD_PREDICATE && !strcmp(node_str, NS_RDF "type")) {
       return esink("a", 1, writer);
     }
@@ -852,12 +852,11 @@ write_uri_node(SerdWriter* const     writer,
     }
 
     if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) &&
-        serd_env_qualify_in_place(writer->env, node, &prefix, &suffix) &&
-        is_name(serd_node_string(prefix), serd_node_length(prefix)) &&
-        is_name(suffix.data, suffix.length)) {
-      TRY(st, write_uri_from_node(writer, prefix));
+        serd_env_qualify_in_place(writer->env, node, &prefix_node, &suffix)) {
+      const SerdStringView prefix = serd_node_string_view(prefix_node);
+      TRY(st, write_lname(writer, prefix.data, prefix.length));
       TRY(st, esink(":", 1, writer));
-      return ewrite_uri(writer, suffix.data, suffix.length);
+      return write_lname(writer, suffix.data, suffix.length);
     }
   }
author	David Robillard <d@drobilla.net>	2023-03-24 20:59:54 -0400
committer	David Robillard <d@drobilla.net>	2023-12-02 18:49:07 -0500
commit	65cbb4a13f615658282677fcf04685bae63e893c (patch)
tree	b9c66e757cf28ce96906d3426300811645753173 /src
parent	c661dbe50d7f634ec5b2863260f41f098fc9c882 (diff)
download	serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.gz serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.bz2 serd-65cbb4a13f615658282677fcf04685bae63e893c.zip