about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2021-07-22 15:26:22 -0400
committer David Robillard <d@drobilla.net> 2023-12-02 18:49:08 -0500
commit 5e4538756d601e6a941c5290777af95ea8848e1a (patch)
tree 9868e188a48a528e9908fcf695147f75790c3a56 /src
parent 64024d0fa6a6dc048b2b846738846da597025f56 (diff)
download serd-5e4538756d601e6a941c5290777af95ea8848e1a.tar.gz
serd-5e4538756d601e6a941c5290777af95ea8848e1a.tar.bz2
serd-5e4538756d601e6a941c5290777af95ea8848e1a.zip
[WIP] Preserve long or short quoting from input documents
Diffstat (limited to 'src')
-rw-r--r-- src/node.c 148
-rw-r--r-- src/read_ntriples.c 22
-rw-r--r-- src/read_turtle.c 4
-rw-r--r-- src/string.c 53
-rw-r--r-- src/string_utils.h 5
-rw-r--r-- src/uri_utils.h 6
-rw-r--r-- src/writer.c 3
7 files changed, 87 insertions, 154 deletions
diff --git a/src/node.c b/src/node.c
index d34d4552..f1fbf0e6 100644
--- a/src/node.c
+++ b/src/node.c
@@ -11,7 +11,6 @@
#include "serd/buffer.h"
#include "serd/node.h"
#include "serd/status.h"
-#include "serd/string.h"
#include "serd/string_view.h"
#include "serd/uri.h"
#include "serd/write_result.h"
@@ -195,88 +194,92 @@ serd_new_token(const SerdNodeType type, const SerdStringView str)
SerdNode*
serd_new_string(const SerdStringView str)
{
- SerdNodeFlags flags = 0;
- const size_t length = serd_substrlen(str.data, str.length, &flags);
- SerdNode* node = serd_node_malloc(length, flags, SERD_LITERAL);
-
- memcpy(serd_node_buffer(node), str.data, str.length);
- node->length = length;
-
- serd_node_check_padding(node);
- return node;
-}
-
-/// Internal pre-measured implementation of serd_new_plain_literal
-static SerdNode*
-serd_new_plain_literal_i(const SerdStringView str,
- SerdNodeFlags flags,
- const SerdStringView lang)
-{
- assert(str.length);
- assert(lang.length);
-
- flags |= SERD_HAS_LANGUAGE;
-
- const size_t len = serd_node_pad_length(str.length);
- const size_t total_len = len + sizeof(SerdNode) + lang.length;
+ SerdNodeFlags flags = 0U;
+ SerdNode* node = serd_node_malloc(str.length, flags, SERD_LITERAL);
- SerdNode* node = serd_node_malloc(total_len, flags, SERD_LITERAL);
- memcpy(serd_node_buffer(node), str.data, str.length);
- node->length = str.length;
+ if (node) {
+ if (str.data && str.length) {
+ memcpy(serd_node_buffer(node), str.data, str.length);
+ }
- SerdNode* lang_node = node + 1 + (len / sizeof(SerdNode));
- lang_node->type = SERD_LITERAL;
- lang_node->length = lang.length;
- memcpy(serd_node_buffer(lang_node), lang.data, lang.length);
- serd_node_check_padding(lang_node);
+ node->length = str.length;
+ serd_node_check_padding(node);
+ }
- serd_node_check_padding(node);
return node;
}
-SerdNode*
-serd_new_plain_literal(const SerdStringView str, const SerdStringView lang)
+ZIX_PURE_FUNC static bool
+is_langtag(const SerdStringView string)
{
- if (!lang.length) {
- return serd_new_string(str);
+ // First character must be a letter
+ size_t i = 0;
+ if (!string.length || !is_alpha(string.data[i])) {
+ return false;
}
- SerdNodeFlags flags = 0;
- serd_strlen(str.data, &flags);
+ // First component must be all letters
+ while (++i < string.length && string.data[i] && string.data[i] != '-') {
+ if (!is_alpha(string.data[i])) {
+ return false;
+ }
+ }
- return serd_new_plain_literal_i(str, flags, lang);
+ // Following components can have letters and digits
+ while (i < string.length && string.data[i] == '-') {
+ while (++i < string.length && string.data[i] && string.data[i] != '-') {
+ const char c = string.data[i];
+ if (!is_alpha(c) && !is_digit(c)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
}
SerdNode*
-serd_new_typed_literal(const SerdStringView str,
- const SerdStringView datatype_uri)
+serd_new_literal(const SerdStringView string,
+ const SerdNodeFlags flags,
+ const SerdStringView meta)
{
- if (!datatype_uri.length) {
- return serd_new_string(str);
+ if (!(flags & (SERD_HAS_DATATYPE | SERD_HAS_LANGUAGE))) {
+ SerdNode* node = serd_node_malloc(string.length, flags, SERD_LITERAL);
+
+ memcpy(serd_node_buffer(node), string.data, string.length);
+ node->length = string.length;
+ serd_node_check_padding(node);
+ return node;
}
- if (!strcmp(datatype_uri.data, NS_RDF "langString")) {
+ if ((flags & SERD_HAS_DATATYPE) && (flags & SERD_HAS_LANGUAGE)) {
return NULL;
}
- SerdNodeFlags flags = 0U;
- serd_strlen(str.data, &flags);
+ if (!meta.length) {
+ return NULL;
+ }
- flags |= SERD_HAS_DATATYPE;
+ if (((flags & SERD_HAS_DATATYPE) &&
+ (!serd_uri_string_has_scheme(meta.data) ||
+ !strcmp(meta.data, NS_RDF "langString"))) ||
+ ((flags & SERD_HAS_LANGUAGE) && !is_langtag(meta))) {
+ return NULL;
+ }
- const size_t len = serd_node_pad_length(str.length);
- const size_t total_len = len + sizeof(SerdNode) + datatype_uri.length;
+ const size_t len = serd_node_pad_length(string.length);
+ const size_t meta_len = serd_node_pad_length(meta.length);
+ const size_t meta_size = sizeof(SerdNode) + meta_len;
- SerdNode* node = serd_node_malloc(total_len, flags, SERD_LITERAL);
- memcpy(serd_node_buffer(node), str.data, str.length);
- node->length = str.length;
+ SerdNode* node = serd_node_malloc(len + meta_size, flags, SERD_LITERAL);
+ memcpy(serd_node_buffer(node), string.data, string.length);
+ node->length = string.length;
- SerdNode* datatype_node = node + 1 + (len / sizeof(SerdNode));
- datatype_node->length = datatype_uri.length;
- datatype_node->type = SERD_URI;
- memcpy(
- serd_node_buffer(datatype_node), datatype_uri.data, datatype_uri.length);
- serd_node_check_padding(datatype_node);
+ SerdNode* meta_node = node + 1U + (len / sizeof(SerdNode));
+ meta_node->length = meta.length;
+ meta_node->type = (flags & SERD_HAS_DATATYPE) ? SERD_URI : SERD_LITERAL;
+ memcpy(serd_node_buffer(meta_node), meta.data, meta.length);
+ serd_node_check_padding(meta_node);
serd_node_check_padding(node);
return node;
@@ -548,13 +551,6 @@ typedef size_t (*SerdWriteLiteralFunc)(const void* user_data,
size_t buf_size,
char* buf);
-SerdNode*
-serd_new_boolean(bool b)
-{
- return serd_new_typed_literal(b ? serd_string("true") : serd_string("false"),
- serd_node_string_view(&serd_xsd_boolean.node));
-}
-
static SerdNode*
serd_new_custom_literal(const void* const user_data,
const size_t len,
@@ -589,8 +585,9 @@ serd_new_double(const double d)
const ExessResult r = exess_write_double(d, sizeof(buf), buf);
return r.status ? NULL
- : serd_new_typed_literal(serd_substring(buf, r.count),
- serd_string(EXESS_XSD_URI "double"));
+ : serd_new_literal(serd_substring(buf, r.count),
+ SERD_HAS_DATATYPE,
+ serd_string(EXESS_XSD_URI "double"));
}
SerdNode*
@@ -601,8 +598,17 @@ serd_new_float(const float f)
const ExessResult r = exess_write_float(f, sizeof(buf), buf);
return r.status ? NULL
- : serd_new_typed_literal(serd_substring(buf, r.count),
- serd_string(EXESS_XSD_URI "float"));
+ : serd_new_literal(serd_substring(buf, r.count),
+ SERD_HAS_DATATYPE,
+ serd_string(EXESS_XSD_URI "float"));
+}
+
+SerdNode*
+serd_new_boolean(bool b)
+{
+ return serd_new_literal(b ? serd_string("true") : serd_string("false"),
+ SERD_HAS_DATATYPE,
+ serd_node_string_view(&serd_xsd_boolean.node));
}
SerdNode*
diff --git a/src/read_ntriples.c b/src/read_ntriples.c
index bec59c13..6822b64f 100644
--- a/src/read_ntriples.c
+++ b/src/read_ntriples.c
@@ -190,24 +190,8 @@ read_IRI(SerdReader* const reader, SerdNode** const dest)
SerdStatus
read_character(SerdReader* const reader, SerdNode* const dest, const uint8_t c)
{
- if (!(c & 0x80)) {
- switch (c) {
- case 0xA:
- case 0xD:
- dest->flags |= SERD_HAS_NEWLINE;
- break;
- case '"':
- case '\'':
- dest->flags |= SERD_HAS_QUOTE;
- break;
- default:
- break;
- }
-
- return push_byte(reader, dest, c);
- }
-
- return read_utf8_continuation(reader, dest, c);
+ return !(c & 0x80) ? push_byte(reader, dest, c)
+ : read_utf8_continuation(reader, dest, c);
}
SerdStatus
@@ -423,10 +407,8 @@ read_ECHAR(SerdReader* const reader, SerdNode* const dest)
case 'b':
return (st = skip_byte(reader, 'b')) ? st : push_byte(reader, dest, '\b');
case 'n':
- dest->flags |= SERD_HAS_NEWLINE;
return (st = skip_byte(reader, 'n')) ? st : push_byte(reader, dest, '\n');
case 'r':
- dest->flags |= SERD_HAS_NEWLINE;
return (st = skip_byte(reader, 'r')) ? st : push_byte(reader, dest, '\r');
case 'f':
return (st = skip_byte(reader, 'f')) ? st : push_byte(reader, dest, '\f');
diff --git a/src/read_turtle.c b/src/read_turtle.c
index c3970a1e..22269741 100644
--- a/src/read_turtle.c
+++ b/src/read_turtle.c
@@ -115,7 +115,6 @@ read_STRING_LITERAL_LONG(SerdReader* const reader,
push_byte(reader, ref, c);
st = read_string_escape(reader, ref);
} else {
- ref->flags |= SERD_HAS_QUOTE;
if (!(st = push_byte(reader, ref, c))) {
st = read_character(reader, ref, (uint8_t)q2);
}
@@ -151,7 +150,10 @@ read_String(SerdReader* const reader, SerdNode* const node)
return SERD_SUCCESS;
}
+ // Long string
skip_byte(reader, q3);
+ node->flags |= SERD_IS_LONG;
+
return read_STRING_LITERAL_LONG(reader, node, (uint8_t)q1);
}
diff --git a/src/string.c b/src/string.c
index 8cc839bd..ed3149d0 100644
--- a/src/string.c
+++ b/src/string.c
@@ -1,16 +1,10 @@
// Copyright 2011-2020 David Robillard <d@drobilla.net>
// SPDX-License-Identifier: ISC
-#include "string_utils.h"
-
#include "serd/memory.h"
-#include "serd/node.h"
#include "serd/status.h"
-#include "serd/string.h"
-#include <assert.h>
#include <stdlib.h>
-#include <string.h>
void
serd_free(void* const ptr)
@@ -68,50 +62,3 @@ serd_strerror(const SerdStatus status)
return "Unknown error";
}
-
-static void
-serd_update_flags(const char c, SerdNodeFlags* const flags)
-{
- switch (c) {
- case '\r':
- case '\n':
- *flags |= SERD_HAS_NEWLINE;
- break;
- case '"':
- *flags |= SERD_HAS_QUOTE;
- break;
- default:
- break;
- }
-}
-
-size_t
-serd_substrlen(const char* const str,
- const size_t len,
- SerdNodeFlags* const flags)
-{
- assert(flags);
-
- size_t i = 0;
- *flags = 0;
- for (; i < len && str[i]; ++i) {
- serd_update_flags(str[i], flags);
- }
-
- return i;
-}
-
-size_t
-serd_strlen(const char* const str, SerdNodeFlags* const flags)
-{
- if (flags) {
- size_t i = 0;
- *flags = 0;
- for (; str[i]; ++i) {
- serd_update_flags(str[i], flags);
- }
- return i;
- }
-
- return strlen(str);
-}
diff --git a/src/string_utils.h b/src/string_utils.h
index 9de03fa0..2517b270 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -4,8 +4,6 @@
#ifndef SERD_SRC_STRING_UTILS_H
#define SERD_SRC_STRING_UTILS_H
-#include "serd/node.h"
-
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
@@ -97,9 +95,6 @@ is_windows_path(const char* path)
(path[2] == '/' || path[2] == '\\');
}
-size_t
-serd_substrlen(const char* str, size_t len, SerdNodeFlags* flags);
-
static inline uint8_t
hex_digit_value(const uint8_t c)
{
diff --git a/src/uri_utils.h b/src/uri_utils.h
index 004129d2..76060d6a 100644
--- a/src/uri_utils.h
+++ b/src/uri_utils.h
@@ -4,10 +4,12 @@
#ifndef SERD_SRC_URI_UTILS_H
#define SERD_SRC_URI_UTILS_H
-#include "serd/attributes.h"
-
#include "string_utils.h"
+#include "serd/attributes.h"
+#include "serd/string_view.h"
+#include "serd/uri.h"
+
#include <stdbool.h>
#include <string.h>
diff --git a/src/writer.c b/src/writer.c
index 94c75625..329a29ad 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -778,8 +778,7 @@ write_literal(SerdWriter* const writer,
}
}
- if (supports_abbrev(writer) &&
- (node->flags & (SERD_HAS_NEWLINE | SERD_HAS_QUOTE))) {
+ if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) {
TRY(st, esink("\"\"\"", 3, writer));
TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length));
TRY(st, esink("\"\"\"", 3, writer));