about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: David Robillard <d@drobilla.net> 2021-07-31 19:32:21 -0400
committer: David Robillard <d@drobilla.net> 2022-01-28 21:57:07 -0500
commit: 8e30b617725a50a5b27d400247095d0577e5874e (patch)
tree: bb9c84cd1aef6c1499955fd4ae1a2587ee5a1929 /src
parent: 2bb0250be8297cc950d0036915ecdf61ab6f3700 (diff)
download: serd-8e30b617725a50a5b27d400247095d0577e5874e.tar.gz
serd-8e30b617725a50a5b27d400247095d0577e5874e.tar.bz2
serd-8e30b617725a50a5b27d400247095d0577e5874e.zip
Support writing all escapes in Turtle and TriG prefixed names
Diffstat (limited to 'src')
-rw-r--r-- src/.clang-tidy | 1
-rw-r--r-- src/n3.c | 42
-rw-r--r-- src/writer.c | 151
3 files changed, 107 insertions, 87 deletions
diff --git a/src/.clang-tidy b/src/.clang-tidy
index 6029eeaa..5cf5e873 100644
--- a/src/.clang-tidy
+++ b/src/.clang-tidy
@@ -10,7 +10,6 @@ Checks: >
-bugprone-suspicious-string-compare,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
-concurrency-mt-unsafe,
- -google-readability-todo,
-hicpp-multiway-paths-covered,
-hicpp-signed-bitwise,
-llvm-header-guard,
diff --git a/src/n3.c b/src/n3.c
index 08c7754d..00d5bc1d 100644
--- a/src/n3.c
+++ b/src/n3.c
@@ -658,6 +658,13 @@ read_anon(SerdReader* const reader,
return eat_byte_check(reader, ']');
}
+static bool
+node_has_string(const SerdNode* const node, const SerdStringView string)
+{
+ return node->length == string.len &&
+ !memcmp(serd_node_string(node), string.buf, string.len);
+}
+
// Read a "named" object: a boolean literal or a prefixed name
static SerdStatus
read_named_object(SerdReader* const reader,
@@ -672,38 +679,33 @@ read_named_object(SerdReader* const reader,
characters, so this is more tedious to deal with in a non-tokenizing
parser like this one.
- Deal with this here by first reading the prefix into a tentative node. If
- it turns out to be "true" or "false", switch it to a boolean literal after
- the fact. */
+ Deal with this here by trying to read a prefixed node, then if it turns
+ out to actually be "true" or "false", switch it to a boolean literal. */
if (!(*dest = push_node(reader, SERD_URI, "", 0))) {
return SERD_ERR_OVERFLOW;
}
- const size_t string_start_offset = reader->stack.size;
- SerdNode* const node = *dest;
- SerdStatus st = SERD_SUCCESS;
- while (!(st = read_PN_CHARS_BASE(reader, node))) {
- }
+ SerdNode* node = *dest;
+ SerdStatus st = SERD_SUCCESS;
- if (st > SERD_FAILURE) {
- return st;
- }
+ // Attempt to read a prefixed name
+ st = read_PrefixedName(reader, node, true, ate_dot, reader->stack.size);
- if ((node->length == 4 && !memcmp(serd_node_string(node), "true", 4)) ||
- (node->length == 5 && !memcmp(serd_node_string(node), "false", 5))) {
- node->flags |= SERD_HAS_DATATYPE;
- node->type = SERD_LITERAL;
+ // Check if this is actually a special boolean node
+ if (st == SERD_FAILURE && (node_has_string(node, SERD_STRING("true")) ||
+ node_has_string(node, SERD_STRING("false")))) {
+ node->flags = SERD_HAS_DATATYPE;
+ node->type = SERD_LITERAL;
return push_node(reader, SERD_URI, XSD_BOOLEAN, XSD_BOOLEAN_LEN)
? SERD_SUCCESS
: SERD_ERR_OVERFLOW;
}
- if ((st = read_PN_PREFIX_tail(reader, node)) > SERD_FAILURE ||
- (st = read_PrefixedName(
- reader, node, false, ate_dot, string_start_offset))) {
- st = (st > SERD_FAILURE) ? st : SERD_ERR_BAD_SYNTAX;
- return r_err(reader, st, "expected prefixed name");
+ // Any other failure is a syntax error
+ if (st) {
+ st = st > SERD_FAILURE ? st : SERD_ERR_BAD_SYNTAX;
+ return r_err(reader, st, "expected prefixed name or boolean");
}
return SERD_SUCCESS;
diff --git a/src/writer.c b/src/writer.c
index e73533e7..d139ef9d 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -23,6 +23,7 @@
#include "string_utils.h"
#include "system.h"
#include "try.h"
+#include "turtle.h"
#include "uri_utils.h"
#include "world.h"
@@ -395,64 +396,97 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
return ewrite_uri(writer, serd_node_string(node), node->length);
}
-static bool
-lname_must_escape(const char c)
+SERD_WARN_UNUSED_RESULT static SerdStatus
+write_utf8_percent_escape(SerdWriter* const writer,
+ const char* const utf8,
+ const size_t n_bytes)
{
- /* This arbitrary list of characters, most of which have nothing to do with
- Turtle, must be handled as special cases here because the RDF and SPARQL
- WGs are apparently intent on making the once elegant Turtle a baroque
- and inconsistent mess, throwing elegance and extensibility completely
- out the window for no good reason.
+ SerdStatus st = SERD_SUCCESS;
+ char escape[4] = {0, 0, 0, 0};
- Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped
- in local names, so they are not escaped here. */
+ for (size_t i = 0u; i < n_bytes; ++i) {
+ snprintf(escape, sizeof(escape), "%%%02X", (uint8_t)utf8[i]);
+ TRY(st, esink(escape, 3, writer));
+ }
- switch (c) {
- case '\'':
- case '!':
- case '#':
- case '$':
- case '%':
- case '&':
- case '(':
- case ')':
- case '*':
- case '+':
- case ',':
- case '/':
- case ';':
- case '=':
- case '?':
- case '@':
- case '~':
- return true;
- default:
- break;
+ return st;
+}
+
+SERD_WARN_UNUSED_RESULT static SerdStatus
+write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ if (!(st = esink("\\", 1, writer))) {
+ st = esink(&c, 1, writer);
}
- return false;
+
+ return st;
+}
+
+SERD_WARN_UNUSED_RESULT static SerdStatus
+write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
+{
+ SerdStatus st = SERD_SUCCESS;
+
+ if (is_PN_LOCAL_ESC(utf8[0])) {
+ st = write_PN_LOCAL_ESC(writer, utf8[0]);
+ } else {
+ st = write_utf8_percent_escape(writer, utf8, n_bytes);
+ }
+
+ return st;
}
SERD_WARN_UNUSED_RESULT static SerdStatus
write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes)
{
SerdStatus st = SERD_SUCCESS;
- for (size_t i = 0; i < n_bytes; ++i) {
- size_t j = i; // Index of next character that must be escaped
- for (; j < n_bytes; ++j) {
- if (lname_must_escape(utf8[j])) {
- break;
- }
+ if (!n_bytes) {
+ return st;
+ }
+
+ /* Thanks to the horribly complicated Turtle grammar for prefixed names,
+ making sure we never write an invalid character is tedious. We need to
+ handle the first and last characters separately since they have different
+ sets of valid characters. */
+
+ // Write first character
+ size_t first_size = 0u;
+ const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
+ if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
+ st = esink(utf8, first_size, writer);
+ } else {
+ st = write_lname_escape(writer, utf8, first_size);
+ }
+
+ // Write middle characters
+ size_t i = first_size;
+ while (!st && i < n_bytes - 1u) {
+ size_t c_size = 0u;
+ const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
+ if (i + c_size >= n_bytes) {
+ break;
}
- // Bulk write all characters up to this special one
- TRY(st, esink(&utf8[i], j - i, writer));
- if ((i = j) == n_bytes) {
- break; // Reached end
+ if (is_PN_CHARS(c) || c == '.' || c == ':') {
+ st = esink(&utf8[i], c_size, writer);
+ } else {
+ st = write_lname_escape(writer, &utf8[i], c_size);
}
- // Write escape
- TRY(st, esink("\\", 1, writer));
- TRY(st, esink(&utf8[i], 1, writer));
+ i += c_size;
+ }
+
+ // Write last character
+ if (!st && i < n_bytes) {
+ size_t last_size = 0u;
+ const int last = (int)parse_utf8_char((const uint8_t*)utf8 + i, &last_size);
+ if (is_PN_CHARS(last) || last == ':') {
+ st = esink(&utf8[i], last_size, writer);
+ } else {
+ st = write_lname_escape(writer, &utf8[i], last_size);
+ }
}
return st;
@@ -756,20 +790,6 @@ write_literal(SerdWriter* const writer,
return st;
}
-// Return true iff `buf` is a valid prefixed name prefix or suffix
-static bool
-is_name(const char* buf, const size_t len)
-{
- // TODO: This is more strict than it should be
- for (size_t i = 0; i < len; ++i) {
- if (!(is_alpha(buf[i]) || is_digit(buf[i]) || lname_must_escape(buf[i]))) {
- return false;
- }
- }
-
- return true;
-}
-
SERD_WARN_UNUSED_RESULT static SerdStatus
write_full_uri_node(SerdWriter* const writer, const SerdNode* const node)
{
@@ -812,11 +832,12 @@ write_uri_node(SerdWriter* const writer,
const SerdNode* const node,
const SerdField field)
{
- SerdStatus st = SERD_SUCCESS;
- SerdStringView prefix = {NULL, 0};
- SerdStringView suffix = {NULL, 0};
- const char* node_str = serd_node_string(node);
- const bool has_scheme = serd_uri_string_has_scheme(node_str);
+ SerdStatus st = SERD_SUCCESS;
+ SerdStringView prefix = {NULL, 0};
+ SerdStringView suffix = {NULL, 0};
+ const SerdStringView node_view = serd_node_string_view(node);
+ const char* node_str = serd_node_string(node);
+ const bool has_scheme = serd_uri_string_has_scheme(node_str);
if (supports_abbrev(writer)) {
if (field == SERD_PREDICATE &&
serd_node_equals(node, writer->world->rdf_type)) {
@@ -828,9 +849,7 @@ write_uri_node(SerdWriter* const writer,
}
if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) &&
- !serd_env_qualify(
- writer->env, serd_node_string_view(node), &prefix, &suffix) &&
- is_name(prefix.buf, prefix.len) && is_name(suffix.buf, suffix.len)) {
+ !serd_env_qualify(writer->env, node_view, &prefix, &suffix)) {
TRY(st, write_lname(writer, prefix.buf, prefix.len));
TRY(st, esink(":", 1, writer));
return write_lname(writer, suffix.buf, suffix.len);