aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2023-03-24 20:59:54 -0400
committer David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit 65cbb4a13f615658282677fcf04685bae63e893c (patch)
tree b9c66e757cf28ce96906d3426300811645753173 /src
parent c661dbe50d7f634ec5b2863260f41f098fc9c882 (diff)
download serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.gz
serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.bz2
serd-65cbb4a13f615658282677fcf04685bae63e893c.zip
Support writing all escapes in Turtle and TriG prefixed names
Diffstat (limited to 'src')
-rw-r--r-- src/.clang-tidy 1
-rw-r--r-- src/turtle.h 6
-rw-r--r-- src/writer.c 141
3 files changed, 74 insertions, 74 deletions
diff --git a/src/.clang-tidy b/src/.clang-tidy
index 638041cc..c2df3e44 100644
--- a/src/.clang-tidy
+++ b/src/.clang-tidy
@@ -9,7 +9,6 @@ Checks: >
-clang-analyzer-valist.Uninitialized,
-clang-diagnostic-unused-function,
-concurrency-mt-unsafe,
- -google-readability-todo,
-hicpp-multiway-paths-covered,
-hicpp-signed-bitwise,
-llvm-header-guard,
diff --git a/src/turtle.h b/src/turtle.h
index 6e7e3a8d..f794e1e8 100644
--- a/src/turtle.h
+++ b/src/turtle.h
@@ -8,7 +8,6 @@
#include "string_utils.h"
#include <stdbool.h>
-#include <string.h>
static inline bool
is_PN_CHARS_U(const int c)
@@ -26,7 +25,10 @@ is_PN_CHARS(const int c)
static inline bool
is_PN_LOCAL_ESC(const int c)
{
- return strchr("!#$%&\'()*+,-./;=?@_~", c) != NULL;
+ return c == '!' || c == '#' || c == '$' || c == '%' || c == '&' ||
+ c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' ||
+ c == ',' || c == '-' || c == '.' || c == '/' || c == ';' || c == '=' ||
+ c == '?' || c == '@' || c == '\\' || c == '_' || c == '~';
}
#endif // SERD_SRC_TURTLE_H
diff --git a/src/writer.c b/src/writer.c
index 7201c976..60c17e11 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -9,6 +9,7 @@
#include "string_utils.h"
#include "system.h"
#include "try.h"
+#include "turtle.h"
#include "uri_utils.h"
#include "world.h"
@@ -281,10 +282,10 @@ esink(const void* buf, size_t len, SerdWriter* writer)
// Write a single character as a Unicode escape
// (Caller prints any single byte characters that don't need escaping)
static size_t
-write_character(SerdWriter* writer,
- const uint8_t* utf8,
- size_t* size,
- SerdStatus* st)
+write_character(SerdWriter* const writer,
+ const uint8_t* const utf8,
+ size_t* const size,
+ SerdStatus* const st)
{
char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
const uint32_t c = parse_utf8_char(utf8, size);
@@ -395,64 +396,77 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
return ewrite_uri(writer, serd_node_string(node), serd_node_length(node));
}
-static bool
-lname_must_escape(const char c)
+SERD_NODISCARD static SerdStatus
+write_utf8_percent_escape(SerdWriter* const writer,
+ const char* const utf8,
+ const size_t n_bytes)
{
- /* This arbitrary list of characters, most of which have nothing to do with
- Turtle, must be handled as special cases here because the RDF and SPARQL
- WGs are apparently intent on making the once elegant Turtle a baroque
- and inconsistent mess, throwing elegance and extensibility completely
- out the window for no good reason.
+ static const char hex_chars[] = "0123456789ABCDEF";
- Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped
- in local names, so they are not escaped here. */
+ SerdStatus st = SERD_SUCCESS;
+ char escape[4] = {'%', 0, 0, 0};
- switch (c) {
- case '\'':
- case '!':
- case '#':
- case '$':
- case '%':
- case '&':
- case '(':
- case ')':
- case '*':
- case '+':
- case ',':
- case '/':
- case ';':
- case '=':
- case '?':
- case '@':
- case '~':
- return true;
- default:
- break;
+ for (size_t i = 0U; i < n_bytes; ++i) {
+ const uint8_t byte = (uint8_t)utf8[i];
+ escape[1] = hex_chars[byte >> 4U];
+ escape[2] = hex_chars[byte & 0x0FU];
+
+ TRY(st, esink(escape, 3, writer));
}
- return false;
+
+ return st;
+}
+
+SERD_NODISCARD static SerdStatus
+write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
+{
+ const char buf[2] = {'\\', c};
+
+ return esink(buf, sizeof(buf), writer);
+}
+
+SERD_NODISCARD static SerdStatus
+write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
+{
+ return is_PN_LOCAL_ESC(utf8[0])
+ ? write_PN_LOCAL_ESC(writer, utf8[0])
+ : write_utf8_percent_escape(writer, utf8, n_bytes);
}
SERD_NODISCARD static SerdStatus
-write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes)
+write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
{
SerdStatus st = SERD_SUCCESS;
- for (size_t i = 0; i < n_bytes; ++i) {
- size_t j = i; // Index of next character that must be escaped
- for (; j < n_bytes; ++j) {
- if (lname_must_escape(utf8[j])) {
- break;
- }
- }
+ if (!n_bytes) {
+ return st;
+ }
- // Bulk write all characters up to this special one
- TRY(st, esink(&utf8[i], j - i, writer));
- if ((i = j) == n_bytes) {
- break; // Reached end
+ /* Thanks to the horribly complicated Turtle grammar for prefixed names,
+ making sure we never write an invalid character is tedious. We need to
+ handle the first and last characters separately since they have different
+ sets of valid characters. */
+
+ // Write first character
+ size_t first_size = 0U;
+ const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
+ if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
+ TRY(st, esink(utf8, first_size, writer));
+ } else {
+ TRY(st, write_lname_escape(writer, utf8, first_size));
+ }
+
+ // Write middle and last characters
+ for (size_t i = first_size; i < n_bytes;) {
+ size_t c_size = 0U;
+ const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
+
+ if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) {
+ TRY(st, esink(&utf8[i], c_size, writer));
+ } else {
+ TRY(st, write_lname_escape(writer, &utf8[i], c_size));
}
- // Write escape
- TRY(st, esink("\\", 1, writer));
- TRY(st, esink(&utf8[i], 1, writer));
+ i += c_size;
}
return st;
@@ -780,20 +794,6 @@ write_literal(SerdWriter* const writer,
return st;
}
-// Return true iff `buf` is a valid prefixed name prefix or suffix
-static bool
-is_name(const char* buf, const size_t len)
-{
- // TODO: This is more strict than it should be
- for (size_t i = 0; i < len; ++i) {
- if (!(is_alpha(buf[i]) || is_digit(buf[i]))) {
- return false;
- }
- }
-
- return true;
-}
-
SERD_NODISCARD static SerdStatus
write_full_uri_node(SerdWriter* const writer, const SerdNode* const node)
{
@@ -837,12 +837,12 @@ write_uri_node(SerdWriter* const writer,
const SerdField field)
{
SerdStatus st = SERD_SUCCESS;
- const SerdNode* prefix = NULL;
- SerdStringView suffix = {NULL, 0};
const char* const node_str = serd_node_string(node);
const bool has_scheme = serd_uri_string_has_scheme(node_str);
if (supports_abbrev(writer)) {
+ const SerdNode* prefix_node = NULL;
+ SerdStringView suffix = {NULL, 0};
if (field == SERD_PREDICATE && !strcmp(node_str, NS_RDF "type")) {
return esink("a", 1, writer);
}
@@ -852,12 +852,11 @@ write_uri_node(SerdWriter* const writer,
}
if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) &&
- serd_env_qualify_in_place(writer->env, node, &prefix, &suffix) &&
- is_name(serd_node_string(prefix), serd_node_length(prefix)) &&
- is_name(suffix.data, suffix.length)) {
- TRY(st, write_uri_from_node(writer, prefix));
+ serd_env_qualify_in_place(writer->env, node, &prefix_node, &suffix)) {
+ const SerdStringView prefix = serd_node_string_view(prefix_node);
+ TRY(st, write_lname(writer, prefix.data, prefix.length));
TRY(st, esink(":", 1, writer));
- return ewrite_uri(writer, suffix.data, suffix.length);
+ return write_lname(writer, suffix.data, suffix.length);
}
}