aboutsummaryrefslogtreecommitdiffstats
path: root/src/writer.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/writer.c')
-rw-r--r--src/writer.c141
1 files changed, 70 insertions, 71 deletions
diff --git a/src/writer.c b/src/writer.c
index 7201c976..60c17e11 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -9,6 +9,7 @@
#include "string_utils.h"
#include "system.h"
#include "try.h"
+#include "turtle.h"
#include "uri_utils.h"
#include "world.h"
@@ -281,10 +282,10 @@ esink(const void* buf, size_t len, SerdWriter* writer)
// Write a single character as a Unicode escape
// (Caller prints any single byte characters that don't need escaping)
static size_t
-write_character(SerdWriter* writer,
- const uint8_t* utf8,
- size_t* size,
- SerdStatus* st)
+write_character(SerdWriter* const writer,
+ const uint8_t* const utf8,
+ size_t* const size,
+ SerdStatus* const st)
{
char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
const uint32_t c = parse_utf8_char(utf8, size);
@@ -395,64 +396,77 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node)
return ewrite_uri(writer, serd_node_string(node), serd_node_length(node));
}
-static bool
-lname_must_escape(const char c)
+SERD_NODISCARD static SerdStatus
+write_utf8_percent_escape(SerdWriter* const writer,
+ const char* const utf8,
+ const size_t n_bytes)
{
- /* This arbitrary list of characters, most of which have nothing to do with
- Turtle, must be handled as special cases here because the RDF and SPARQL
- WGs are apparently intent on making the once elegant Turtle a baroque
- and inconsistent mess, throwing elegance and extensibility completely
- out the window for no good reason.
+ static const char hex_chars[] = "0123456789ABCDEF";
- Note '-', '.', and '_' are also in PN_LOCAL_ESC, but are valid unescaped
- in local names, so they are not escaped here. */
+ SerdStatus st = SERD_SUCCESS;
+ char escape[4] = {'%', 0, 0, 0};
- switch (c) {
- case '\'':
- case '!':
- case '#':
- case '$':
- case '%':
- case '&':
- case '(':
- case ')':
- case '*':
- case '+':
- case ',':
- case '/':
- case ';':
- case '=':
- case '?':
- case '@':
- case '~':
- return true;
- default:
- break;
+ for (size_t i = 0U; i < n_bytes; ++i) {
+ const uint8_t byte = (uint8_t)utf8[i];
+ escape[1] = hex_chars[byte >> 4U];
+ escape[2] = hex_chars[byte & 0x0FU];
+
+ TRY(st, esink(escape, 3, writer));
}
- return false;
+
+ return st;
+}
+
+SERD_NODISCARD static SerdStatus
+write_PN_LOCAL_ESC(SerdWriter* const writer, const char c)
+{
+ const char buf[2] = {'\\', c};
+
+ return esink(buf, sizeof(buf), writer);
+}
+
+SERD_NODISCARD static SerdStatus
+write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes)
+{
+ return is_PN_LOCAL_ESC(utf8[0])
+ ? write_PN_LOCAL_ESC(writer, utf8[0])
+ : write_utf8_percent_escape(writer, utf8, n_bytes);
}
SERD_NODISCARD static SerdStatus
-write_lname(SerdWriter* writer, const char* utf8, size_t n_bytes)
+write_lname(SerdWriter* writer, const char* utf8, const size_t n_bytes)
{
SerdStatus st = SERD_SUCCESS;
- for (size_t i = 0; i < n_bytes; ++i) {
- size_t j = i; // Index of next character that must be escaped
- for (; j < n_bytes; ++j) {
- if (lname_must_escape(utf8[j])) {
- break;
- }
- }
+ if (!n_bytes) {
+ return st;
+ }
- // Bulk write all characters up to this special one
- TRY(st, esink(&utf8[i], j - i, writer));
- if ((i = j) == n_bytes) {
- break; // Reached end
+ /* Thanks to the horribly complicated Turtle grammar for prefixed names,
+ making sure we never write an invalid character is tedious. We need to
+ handle the first and last characters separately since they have different
+ sets of valid characters. */
+
+ // Write first character
+ size_t first_size = 0U;
+ const int first = (int)parse_utf8_char((const uint8_t*)utf8, &first_size);
+ if (is_PN_CHARS_U(first) || first == ':' || is_digit(first)) {
+ TRY(st, esink(utf8, first_size, writer));
+ } else {
+ TRY(st, write_lname_escape(writer, utf8, first_size));
+ }
+
+ // Write middle and last characters
+ for (size_t i = first_size; i < n_bytes;) {
+ size_t c_size = 0U;
+ const int c = (int)parse_utf8_char((const uint8_t*)utf8 + i, &c_size);
+
+ if (is_PN_CHARS(c) || c == ':' || (c == '.' && (i + 1U < n_bytes))) {
+ TRY(st, esink(&utf8[i], c_size, writer));
+ } else {
+ TRY(st, write_lname_escape(writer, &utf8[i], c_size));
}
- // Write escape
- TRY(st, esink("\\", 1, writer));
- TRY(st, esink(&utf8[i], 1, writer));
+ i += c_size;
}
return st;
@@ -780,20 +794,6 @@ write_literal(SerdWriter* const writer,
return st;
}
-// Return true iff `buf` is a valid prefixed name prefix or suffix
-static bool
-is_name(const char* buf, const size_t len)
-{
- // TODO: This is more strict than it should be
- for (size_t i = 0; i < len; ++i) {
- if (!(is_alpha(buf[i]) || is_digit(buf[i]))) {
- return false;
- }
- }
-
- return true;
-}
-
SERD_NODISCARD static SerdStatus
write_full_uri_node(SerdWriter* const writer, const SerdNode* const node)
{
@@ -837,12 +837,12 @@ write_uri_node(SerdWriter* const writer,
const SerdField field)
{
SerdStatus st = SERD_SUCCESS;
- const SerdNode* prefix = NULL;
- SerdStringView suffix = {NULL, 0};
const char* const node_str = serd_node_string(node);
const bool has_scheme = serd_uri_string_has_scheme(node_str);
if (supports_abbrev(writer)) {
+ const SerdNode* prefix_node = NULL;
+ SerdStringView suffix = {NULL, 0};
if (field == SERD_PREDICATE && !strcmp(node_str, NS_RDF "type")) {
return esink("a", 1, writer);
}
@@ -852,12 +852,11 @@ write_uri_node(SerdWriter* const writer,
}
if (has_scheme && !(writer->flags & SERD_WRITE_UNQUALIFIED) &&
- serd_env_qualify_in_place(writer->env, node, &prefix, &suffix) &&
- is_name(serd_node_string(prefix), serd_node_length(prefix)) &&
- is_name(suffix.data, suffix.length)) {
- TRY(st, write_uri_from_node(writer, prefix));
+ serd_env_qualify_in_place(writer->env, node, &prefix_node, &suffix)) {
+ const SerdStringView prefix = serd_node_string_view(prefix_node);
+ TRY(st, write_lname(writer, prefix.data, prefix.length));
TRY(st, esink(":", 1, writer));
- return ewrite_uri(writer, suffix.data, suffix.length);
+ return write_lname(writer, suffix.data, suffix.length);
}
}