about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2023-03-24 20:59:54 -0400
committer David Robillard <d@drobilla.net> 2023-12-02 18:49:07 -0500
commit 65cbb4a13f615658282677fcf04685bae63e893c (patch)
tree b9c66e757cf28ce96906d3426300811645753173
parent c661dbe50d7f634ec5b2863260f41f098fc9c882 (diff)
download serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.gz
serd-65cbb4a13f615658282677fcf04685bae63e893c.tar.bz2
serd-65cbb4a13f615658282677fcf04685bae63e893c.zip
Support writing all escapes in Turtle and TriG prefixed names
-rw-r--r-- NEWS 3
-rw-r--r-- src/.clang-tidy 1
-rw-r--r-- src/turtle.h 6
-rw-r--r-- src/writer.c 141
-rw-r--r-- test/extra/qualify/qualify-in.ttl 3
-rw-r--r-- test/extra/qualify/qualify-out.ttl 7
-rw-r--r-- test/test_writer.c 74
7 files changed, 158 insertions, 77 deletions
diff --git a/NEWS b/NEWS
index baaeb8a0..7df3b70a 100644
--- a/NEWS
+++ b/NEWS
@@ -13,10 +13,11 @@ serd (1.1.1) unstable; urgency=medium
* Rename SerdChunk to SerdStringView
* Simplify statement flags
* Simplify writer style options and write UTF-8 by default
+ * Support writing all escapes in Turtle and TriG prefixed names
* Use a fixed-size reader stack
* Use char* for strings in public API
- -- David Robillard <d@drobilla.net> Wed, 13 Jul 2022 20:39:07 +0000
+ -- David Robillard <d@drobilla.net> Wed, 13 Jul 2022 21:43:56 +0000
serd (0.32.0) stable; urgency=medium
diff --git a/src/.clang-tidy b/src/.clang-tidy
index 638041cc..c2df3e44 100644
--- a/src/.clang-tidy
+++ b/src/.clang-tidy
@@ -9,7 +9,6 @@ Checks: >
-clang-analyzer-valist.Uninitialized,
-clang-diagnostic-unused-function,
-concurrency-mt-unsafe,
- -google-readability-todo,
-hicpp-multiway-paths-covered,
-hicpp-signed-bitwise,
-llvm-header-guard,
diff --git a/src/turtle.h b/src/turtle.h
index 6e7e3a8d..f794e1e8 100644
--- a/src/turtle.h
+++ b/src/turtle.h
@@ -8,7 +8,6 @@
#include "string_utils.h"
#include <stdbool.h>
-#include <string.h>
static inline bool
is_PN_CHARS_U(const int c)
@@ -26,7 +25,10 @@ is_PN_CHARS(const int c)
static inline bool
is_PN_LOCAL_ESC(const int c)
{
- return strchr("!#$%&\'()*+,-./;=?@_~", c) != NULL;
+ return c == '!' || c == '#' || c == '$' || c == '%' || c == '&' ||
+ c == '\'' || c == '(' || c == ')' || c == '*' || c == '+' ||
+ c == ',' || c == '-' || c == '.' || c == '/' || c == ';' || c == '=' ||
+ c == '?' || c == '@' || c == '\\' || c == '_' || c == '~';
}
#endif // SERD_SRC_TURTLE_H
diff --git a/src/writer.c b/src/writer.c
index 7201c976..60c17e11 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -9,6 +9,7 @@
#include "string_utils.h"
#include "system.h"
#include "try.h"
+#include "turtle.h"
#include "uri_utils.h"
#include "world.h"
@@ -281,10 +282,10 @@ esink(const void* buf, size_t len, SerdWriter* writer)
// Write a single character as a Unicode escape
// (Caller prints any single byte characters that don't need escaping)
static size_t
-write_character(SerdWriter* writer,
- const uint8_t* utf8,
- size_t* size,
- SerdStatus* st)
+write_character(SerdWriter* const writer,
+ const uint8_t* const utf8,
+ size_t* const size,
+ SerdStatus* const st)
{
char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
const uint32_t c = parse_utf8_char(utf8, size);
@@ -395,64 +396,77 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
return ewrite_uri(writer, serd_node_string(node), serd_node_length(node));
}
-static bool
-lname_must_escape(const char c)
+SERD_NODISCARD static SerdStatus
+write_utf8_percent_escape(SerdWriter* const writer,
+ const char* const utf8,
+ const size_t n_bytes)
{
- /* This arbitrary list of characters, most of which have nothing to do with
- Turtle, must be handled as special cases here because the RDF and SPARQL
- WGs are apparently intent on making the once elegant Turtle a baroque
- and inconsistent mess, throwing elegance and extensibility completely
- out the window for no good reason.
+ static const char hex_chars[] = "0123456789ABCDEF";
- Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped
- in local names, so they are not escaped here. */
+ SerdStatus st = SERD_SUCCESS;
+ char escape[4] = {'%', 0, 0, 0};
- switch (c) {
- case '\'':
- case '!':
- case '#':
- case '$':
- case '%':
- case '&':
- case '(':
- case ')':
- case '*':
- case '+':
- case ',':
- case '/':
- case ';':
- case '=':
- case '?':
- case '@':
- case '~':
- return true;
- default:
- break;
+ for (size_t i = 0U; i < n_bytes; ++i) {
+ const uint8_t byte = (uint8_t)utf8[i];
+ escape[1] = hex_chars[byte >> 4U];
+ escape[2] = hex_chars[byte & 0x0FU];
+
+ TRY(st, esink(escape, 3, writer));
}
- return false;
+
+ return st;
+}
+
+SERD_NODISCARD static SerdStatus
+write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
+{
+ const char buf[2] = {'\\', c};
+
+ return esink(buf, sizeof(buf), writer);
+}
+
+SERD_NODISCARD static SerdStatus
+write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
+{
+ return is_PN_LOCAL_ESC(utf8[0])
+ ? write_PN_LOCAL_ESC(writer, utf8[0])
+ : write_utf8_percent_escape(writer, utf8, n_bytes);
}
SERD_NODISCARD static SerdStatus
-write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes)
+write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
{
SerdStatus st = SERD_SUCCESS;
- for (size_t i = 0; i < n_bytes; ++i) {
- size_t j = i; // Index of next character that must be escaped
- for (; j < n_bytes; ++j) {
- if (lname_must_escape(utf8[j])) {
- break;
- }
- }
+ if (!n_bytes) {
+ return st;
+ }
- // Bulk write all characters up to this special one
- TRY(st, esink(&utf8[i], j - i, writer));
- if ((i = j) == n_bytes) {
- break; // Reached end
+ /* Thanks to the horribly complicated Turtle grammar for prefixed names,
+ making sure we never write an invalid character is tedious. We need to
+ handle the first and last characters separately since they have different
+ sets of valid characters. */
+
+ // Write first character
+ size_t first_size = 0U;
+ const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
+ if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
+ TRY(st, esink(utf8, first_size, writer));
+ } else {
+ TRY(st, write_lname_escape(writer, utf8, first_size));
+ }
+
+ // Write middle and last characters
+ for (size_t i = first_size; i < n_bytes;) {
+ size_t c_size = 0U;
+ const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
+
+ if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) {
+ TRY(st, esink(&utf8[i], c_size, writer));
+ } else {
+ TRY(st, write_lname_escape(writer, &utf8[i], c_size));
}
- // Write escape
- TRY(st, esink("\\", 1, writer));
- TRY(st, esink(&utf8[i], 1, writer));
+ i += c_size;
}
return st;
@@ -780,20 +794,6 @@ write_literal(SerdWriter* const writer,
return st;
}
-// Return true iff `buf` is a valid prefixed name prefix or suffix
-static bool
-is_name(const char* buf, const size_t len)
-{
- // TODO: This is more strict than it should be
- for (size_t i = 0; i < len; ++i) {
- if (!(is_alpha(buf[i]) || is_digit(buf[i]))) {
- return false;
- }
- }
-
- return true;
-}
-
SERD_NODISCARD static SerdStatus
write_full_uri_node(SerdWriter* const writer, const SerdNode* const node)
{
@@ -837,12 +837,12 @@ write_uri_node(SerdWriter* const writer,
const SerdField field)
{
SerdStatus st = SERD_SUCCESS;
- const SerdNode* prefix = NULL;
- SerdStringView suffix = {NULL, 0};
const char* const node_str = serd_node_string(node);
const bool has_scheme = serd_uri_string_has_scheme(node_str);
if (supports_abbrev(writer)) {
+ const SerdNode* prefix_node = NULL;
+ SerdStringView suffix = {NULL, 0};
if (field == SERD_PREDICATE && !strcmp(node_str, NS_RDF "type")) {
return esink("a", 1, writer);
}
@@ -852,12 +852,11 @@ write_uri_node(SerdWriter* const writer,
}
if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) &&
- serd_env_qualify_in_place(writer->env, node, &prefix, &suffix) &&
- is_name(serd_node_string(prefix), serd_node_length(prefix)) &&
- is_name(suffix.data, suffix.length)) {
- TRY(st, write_uri_from_node(writer, prefix));
+ serd_env_qualify_in_place(writer->env, node, &prefix_node, &suffix)) {
+ const SerdStringView prefix = serd_node_string_view(prefix_node);
+ TRY(st, write_lname(writer, prefix.data, prefix.length));
TRY(st, esink(":", 1, writer));
- return ewrite_uri(writer, suffix.data, suffix.length);
+ return write_lname(writer, suffix.data, suffix.length);
}
}
diff --git a/test/extra/qualify/qualify-in.ttl b/test/extra/qualify/qualify-in.ttl
index 04afc07f..b30e1721 100644
--- a/test/extra/qualify/qualify-in.ttl
+++ b/test/extra/qualify/qualify-in.ttl
@@ -6,5 +6,8 @@
<http://example.org/a-subject>
<http://example.org/a-predicate> <http://example.org/a-object> .
+<http://example.org/special-!#$%&'()*+,-./;=?@_~-chars>
+ <http://example.org/p> <http://example.org/o> .
+
<http://www.w3.org/1999/02/22-rdf-syntax-ns#nil>
<http://www.w3.org/2000/01/rdf-schema#label> "nil" .
diff --git a/test/extra/qualify/qualify-out.ttl b/test/extra/qualify/qualify-out.ttl
index f4dd15d4..79148017 100644
--- a/test/extra/qualify/qualify-out.ttl
+++ b/test/extra/qualify/qualify-out.ttl
@@ -3,8 +3,11 @@
eg:s
eg:p eg:o .
-<http://example.org/a-subject>
- <http://example.org/a-predicate> <http://example.org/a-object> .
+eg:a-subject
+ eg:a-predicate eg:a-object .
+
+eg:special-\!\#\$\%\&\'\(\)\*\+\,-.\/\;\=\?\@_\~-chars
+ eg:p eg:o .
()
<http://www.w3.org/2000/01/rdf-schema#label> "nil" .
diff --git a/test/test_writer.c b/test/test_writer.c
index 6066b6e3..a4d92c5b 100644
--- a/test/test_writer.c
+++ b/test/test_writer.c
@@ -19,6 +19,7 @@
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
static void
@@ -325,6 +326,78 @@ test_write_empty_syntax(void)
serd_world_free(world);
}
+static void
+check_pname_escape(const char* const lname, const char* const expected)
+{
+ SerdWorld* world = serd_world_new();
+ SerdEnv* env = serd_env_new(serd_empty_string());
+ SerdBuffer buffer = {NULL, 0};
+
+ SerdWriter* writer =
+ serd_writer_new(world, SERD_TURTLE, 0U, env, serd_buffer_sink, &buffer);
+
+ assert(writer);
+
+ static const char* const prefix = "http://example.org/";
+ const size_t prefix_len = strlen(prefix);
+
+ serd_env_set_prefix(env, serd_string("eg"), serd_string(prefix));
+
+ SerdNode* s = serd_new_uri(serd_string("http://example.org/s"));
+ SerdNode* p = serd_new_uri(serd_string("http://example.org/p"));
+
+ char* const uri = (char*)calloc(1, prefix_len + strlen(lname) + 1);
+ memcpy(uri, prefix, prefix_len + 1);
+ memcpy(uri + prefix_len, lname, strlen(lname) + 1);
+
+ SerdNode* node = serd_new_uri(serd_string(uri));
+ assert(!serd_sink_write(serd_writer_sink(writer), 0, s, p, node, NULL));
+ serd_node_free(node);
+
+ free(uri);
+ serd_node_free(p);
+ serd_node_free(s);
+ serd_writer_free(writer);
+ serd_env_free(env);
+
+ char* out = serd_buffer_sink_finish(&buffer);
+ assert(!strcmp((char*)out, expected));
+ serd_free(out);
+
+ serd_world_free(world);
+}
+
+static void
+test_write_pname_escapes(void)
+{
+ // Check that '.' is escaped only at the start and end
+ check_pname_escape(".xyz", "eg:s\n\teg:p eg:\\.xyz .\n");
+ check_pname_escape("w.yz", "eg:s\n\teg:p eg:w.yz .\n");
+ check_pname_escape("wx.z", "eg:s\n\teg:p eg:wx.z .\n");
+ check_pname_escape("wxy.", "eg:s\n\teg:p eg:wxy\\. .\n");
+
+ // Check that ':' is not escaped anywhere
+ check_pname_escape(":xyz", "eg:s\n\teg:p eg::xyz .\n");
+ check_pname_escape("w:yz", "eg:s\n\teg:p eg:w:yz .\n");
+ check_pname_escape("wx:z", "eg:s\n\teg:p eg:wx:z .\n");
+ check_pname_escape("wxy:", "eg:s\n\teg:p eg:wxy: .\n");
+
+ // Check that special characters like '~' are escaped everywhere
+ check_pname_escape("~xyz", "eg:s\n\teg:p eg:\\~xyz .\n");
+ check_pname_escape("w~yz", "eg:s\n\teg:p eg:w\\~yz .\n");
+ check_pname_escape("wx~z", "eg:s\n\teg:p eg:wx\\~z .\n");
+ check_pname_escape("wxy~", "eg:s\n\teg:p eg:wxy\\~ .\n");
+
+ // Check that out of range multi-byte characters are escaped everywhere
+ static const char first_escape[] = {(char)0xC3U, (char)0xB7U, 'y', 'z', 0};
+ static const char mid_escape[] = {'w', (char)0xC3U, (char)0xB7U, 'z', 0};
+ static const char last_escape[] = {'w', 'x', (char)0xC3U, (char)0xB7U, 0};
+
+ check_pname_escape((const char*)first_escape, "eg:s\n\teg:p eg:%C3%B7yz .\n");
+ check_pname_escape((const char*)mid_escape, "eg:s\n\teg:p eg:w%C3%B7z .\n");
+ check_pname_escape((const char*)last_escape, "eg:s\n\teg:p eg:wx%C3%B7 .\n");
+}
+
int
main(void)
{
@@ -336,6 +409,7 @@ main(void)
test_write_error();
test_writer_stack_overflow();
test_write_empty_syntax();
+ test_write_pname_escapes();
return 0;
}