about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2011-05-11 16:03:03 +0000
committer David Robillard <d@drobilla.net> 2011-05-11 16:03:03 +0000
commit efd2a2ecb17a8c633f634c2cf38fb8be48db9af5 (patch)
tree 2513b24e985a02e10565563a2a9151c58f38d2c1 /src
parent 1dc85d2924eacef71d0cc9144c6c33e6f84248b1 (diff)
download serd-efd2a2ecb17a8c633f634c2cf38fb8be48db9af5.tar.gz
serd-efd2a2ecb17a8c633f634c2cf38fb8be48db9af5.tar.bz2
serd-efd2a2ecb17a8c633f634c2cf38fb8be48db9af5.zip
Record whether strings contain quotes or newlines, for smart selection of long strings vs. short strings in writer output.
git-svn-id: http://svn.drobilla.net/serd/trunk@167 490d8e77-9747-427b-9fa3-0b8f29cee8a0
Diffstat (limited to 'src')
-rw-r--r-- src/env.c 1
-rw-r--r-- src/node.c 9
-rw-r--r-- src/reader.c 83
-rw-r--r-- src/serd_internal.h 10
-rw-r--r-- src/writer.c 61
5 files changed, 100 insertions, 64 deletions
diff --git a/src/env.c b/src/env.c
index 6671d683..0d9bc128 100644
--- a/src/env.c
+++ b/src/env.c
@@ -206,6 +206,7 @@ serd_env_expand_node(const SerdEnv* env,
SerdNode ret = { NULL,
prefix.len + suffix.len + 1,
prefix.len + suffix.len, // FIXME: UTF-8
+ 0,
SERD_URI };
ret.buf = malloc(ret.n_bytes);
snprintf((char*)ret.buf, ret.n_bytes, "%s%s", prefix.buf, suffix.buf);
diff --git a/src/node.c b/src/node.c
index b6288ebc..edd0db86 100644
--- a/src/node.c
+++ b/src/node.c
@@ -23,9 +23,10 @@ SERD_API
SerdNode
serd_node_from_string(SerdType type, const uint8_t* buf)
{
- size_t buf_n_bytes;
- const size_t buf_n_chars = serd_strlen(buf, &buf_n_bytes);
- SerdNode ret = { buf, buf_n_bytes, buf_n_chars, type };
+ uint32_t flags;
+ size_t buf_n_bytes;
+ const size_t buf_n_chars = serd_strlen(buf, &buf_n_bytes, &flags);
+ SerdNode ret = { buf, buf_n_bytes, buf_n_chars, flags, type };
return ret;
}
@@ -118,7 +119,7 @@ serd_node_new_uri(const SerdURI* uri, const SerdURI* base, SerdURI* out)
const size_t len = serd_uri_string_length(&abs_uri);
uint8_t* buf = malloc(len + 1);
- SerdNode node = { buf, len + 1, len, SERD_URI }; // FIXME: UTF-8
+ SerdNode node = { buf, len + 1, len, 0, SERD_URI }; // FIXME: UTF-8
uint8_t* ptr = buf;
const size_t actual_len = serd_uri_serialise(&abs_uri, string_sink, &ptr);
diff --git a/src/reader.c b/src/reader.c
index e4e91f4c..b0ad02e5 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -269,9 +269,9 @@ public_node_from_ref(SerdReader* reader, SerdType type, Ref ref)
if (!ref) {
return SERD_NODE_NULL;
}
- const SerdString* str = deref(reader, ref);
- const SerdNode public = { str->buf, str->n_bytes, str->n_chars, type };
- return public;
+ const SerdString* str = deref(reader, ref);
+ const SerdNode node = { str->buf, str->n_bytes, str->n_chars, 0, type };
+ return node;
}
static inline SerdNode
@@ -287,16 +287,17 @@ public_node(SerdReader* reader, const Node* private)
static inline bool
emit_statement(SerdReader* reader,
const Node* g, const Node* s, const Node* p, const Node* o,
- const Node* d, Ref l)
+ const Node* d, Ref l, uint32_t f)
{
assert(s && p && o);
assert(s->value && p->value && o->value);
const SerdNode graph = public_node(reader, g);
const SerdNode subject = public_node(reader, s);
const SerdNode predicate = public_node(reader, p);
- const SerdNode object = public_node(reader, o);
+ SerdNode object = public_node(reader, o);
const SerdNode datatype = public_node(reader, d);
const SerdNode lang = public_node_from_ref(reader, SERD_LITERAL, l);
+ object.flags = f;
return !reader->statement_sink(reader->handle,
&graph,
&subject,
@@ -389,7 +390,7 @@ read_character_escape(SerdReader* reader, Ref dest)
}
static inline bool
-read_echaracter_escape(SerdReader* reader, Ref dest)
+read_echaracter_escape(SerdReader* reader, Ref dest, uint32_t* flags)
{
switch (peek_byte(reader)) {
case 't':
@@ -397,10 +398,12 @@ read_echaracter_escape(SerdReader* reader, Ref dest)
push_byte(reader, dest, '\t');
return true;
case 'n':
+ *flags |= SERD_HAS_NEWLINE;
eat_byte(reader, 'n');
push_byte(reader, dest, '\n');
return true;
case 'r':
+ *flags |= SERD_HAS_NEWLINE;
eat_byte(reader, 'r');
push_byte(reader, dest, '\r');
return true;
@@ -410,26 +413,28 @@ read_echaracter_escape(SerdReader* reader, Ref dest)
}
static inline bool
-read_scharacter_escape(SerdReader* reader, Ref dest)
+read_scharacter_escape(SerdReader* reader, Ref dest, uint32_t* flags)
{
switch (peek_byte(reader)) {
case '"':
+ *flags |= SERD_HAS_QUOTE;
push_byte(reader, dest, eat_byte(reader, '"'));
return true;
default:
- return read_echaracter_escape(reader, dest);
+ return read_echaracter_escape(reader, dest, flags);
}
}
static inline bool
read_ucharacter_escape(SerdReader* reader, Ref dest)
{
+ uint32_t flags = 0;
switch (peek_byte(reader)) {
case '>':
push_byte(reader, dest, eat_byte(reader, '>'));
return true;
default:
- return read_echaracter_escape(reader, dest);
+ return read_echaracter_escape(reader, dest, &flags);
}
}
@@ -477,11 +482,12 @@ read_character(SerdReader* reader, Ref dest)
static inline SerdStatus
read_echaracter(SerdReader* reader, Ref dest)
{
- uint8_t c = peek_byte(reader);
+ uint32_t flags = 0;
+ uint8_t c = peek_byte(reader);
switch (c) {
case '\\':
eat_byte(reader, '\\');
- if (read_echaracter_escape(reader, peek_byte(reader))) {
+ if (read_echaracter_escape(reader, peek_byte(reader), &flags)) {
return SERD_SUCCESS;
} else {
error(reader, "illegal escape `\\%c'\n", peek_byte(reader));
@@ -494,31 +500,34 @@ read_echaracter(SerdReader* reader, Ref dest)
// [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
static inline SerdStatus
-read_lcharacter(SerdReader* reader, Ref dest)
+read_lcharacter(SerdReader* reader, Ref dest, uint32_t* flags)
{
const uint8_t c = peek_byte(reader);
uint8_t pre[3];
switch (c) {
case '"':
- peek_string(reader, pre, 3);
- if (pre[1] == '\"' && pre[2] == '\"') {
+ peek_string(reader, pre, 4);
+ if (pre[1] == '\"' && pre[2] == '\"' && pre[3] != '\"') {
eat_byte(reader, '\"');
eat_byte(reader, '\"');
eat_byte(reader, '\"');
return SERD_FAILURE;
} else {
+ *flags |= SERD_HAS_QUOTE;
push_byte(reader, dest, eat_byte(reader, '"'));
return SERD_SUCCESS;
}
case '\\':
eat_byte(reader, '\\');
- if (read_scharacter_escape(reader, dest)) {
+ if (read_scharacter_escape(reader, dest, flags)) {
return SERD_SUCCESS;
} else {
error(reader, "illegal escape `\\%c'\n", peek_byte(reader));
return SERD_ERR_BAD_SYNTAX;
}
- case 0x9: case 0xA: case 0xD:
+ case 0xA: case 0xD:
+ *flags |= SERD_HAS_NEWLINE;
+ case 0x9:
push_byte(reader, dest, eat_byte(reader, c));
return SERD_SUCCESS;
default:
@@ -528,13 +537,13 @@ read_lcharacter(SerdReader* reader, Ref dest)
// [42] scharacter ::= ( echaracter - #x22 ) | '\"'
static inline SerdStatus
-read_scharacter(SerdReader* reader, Ref dest)
+read_scharacter(SerdReader* reader, Ref dest, uint32_t* flags)
{
uint8_t c = peek_byte(reader);
switch (c) {
case '\\':
eat_byte(reader, '\\');
- if (read_scharacter_escape(reader, dest)) {
+ if (read_scharacter_escape(reader, dest, flags)) {
return SERD_SUCCESS;
} else {
error(reader, "illegal escape `\\%c'\n", peek_byte(reader));
@@ -612,12 +621,12 @@ read_ws_plus(SerdReader* reader)
// [37] longString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
static Ref
-read_longString(SerdReader* reader)
+read_longString(SerdReader* reader, uint32_t* flags)
{
eat_string(reader, "\"\"\"", 3);
Ref str = push_string(reader, "", 1);
SerdStatus st;
- while (!(st = read_lcharacter(reader, str))) {}
+ while (!(st = read_lcharacter(reader, str, flags))) {}
if (st < SERD_ERR_UNKNOWN) {
return str;
}
@@ -627,12 +636,12 @@ read_longString(SerdReader* reader)
// [36] string ::= #x22 scharacter* #x22
static Ref
-read_string(SerdReader* reader)
+read_string(SerdReader* reader, uint32_t* flags)
{
eat_byte(reader, '\"');
Ref str = push_string(reader, "", 1);
SerdStatus st;
- while (!(st = read_scharacter(reader, str))) {}
+ while (!(st = read_scharacter(reader, str, flags))) {}
if (st < SERD_ERR_UNKNOWN) {
eat_byte(reader, '\"');
return str;
@@ -643,7 +652,7 @@ read_string(SerdReader* reader)
// [35] quotedString ::= string | longString
static Ref
-read_quotedString(SerdReader* reader)
+read_quotedString(SerdReader* reader, uint32_t* flags)
{
uint8_t pre[3];
peek_string(reader, pre, 3);
@@ -651,11 +660,11 @@ read_quotedString(SerdReader* reader)
switch (pre[1]) {
case '\"':
if (pre[2] == '\"')
- return read_longString(reader);
+ return read_longString(reader, flags);
else
- return read_string(reader);
+ return read_string(reader, flags);
default:
- return read_string(reader);
+ return read_string(reader, flags);
}
}
@@ -893,14 +902,15 @@ read_resource(SerdReader* reader, Node* dest)
// [14] literal ::= quotedString ( '@' language )? | datatypeString
// | integer | double | decimal | boolean
static bool
-read_literal(SerdReader* reader, Node* dest, Node* datatype, Ref* lang)
+read_literal(SerdReader* reader, Node* dest,
+ Node* datatype, Ref* lang, uint32_t* flags)
{
Ref str = 0;
const uint8_t c = peek_byte(reader);
if (c == '-' || c == '+' || c == '.' || is_digit(c)) {
return read_number(reader, dest, datatype);
} else if (c == '\"') {
- str = read_quotedString(reader);
+ str = read_quotedString(reader, flags);
if (!str) {
return false;
}
@@ -994,7 +1004,7 @@ read_blank(SerdReader* reader, ReadContext ctx, Node* dest)
if (ctx.subject) {
TRY_RET(emit_statement(reader,
ctx.graph, ctx.subject, ctx.predicate,
- dest, NULL, 0));
+ dest, NULL, 0, 0));
}
return true;
}
@@ -1002,7 +1012,7 @@ read_blank(SerdReader* reader, ReadContext ctx, Node* dest)
if (ctx.subject) {
TRY_RET(emit_statement(reader,
ctx.graph, ctx.subject, ctx.predicate,
- dest, NULL, 0));
+ dest, NULL, 0, 0));
dest->type = SERD_ANON;
}
ctx.subject = dest;
@@ -1019,7 +1029,7 @@ read_blank(SerdReader* reader, ReadContext ctx, Node* dest)
if (ctx.subject) {
TRY_RET(emit_statement(reader,
ctx.graph, ctx.subject, ctx.predicate,
- dest, NULL, 0));
+ dest, NULL, 0, 0));
}
return true;
}
@@ -1060,6 +1070,7 @@ read_object(SerdReader* reader, ReadContext ctx)
Node o = INTERNAL_NODE_NULL;
Node datatype = INTERNAL_NODE_NULL;
Ref lang = 0;
+ uint32_t flags = 0;
const uint8_t c = peek_byte(reader);
switch (c) {
case '\0':
@@ -1077,10 +1088,10 @@ read_object(SerdReader* reader, ReadContext ctx)
case '\"': case '+': case '-':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang));
+ TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags));
break;
case '.':
- TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang));
+ TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags));
break;
default:
/* Either a boolean literal, or a qname.
@@ -1108,7 +1119,7 @@ read_object(SerdReader* reader, ReadContext ctx)
assert(o.value);
ret = emit_statement(reader,
ctx.graph, ctx.subject, ctx.predicate,
- &o, &datatype, lang);
+ &o, &datatype, lang, flags);
}
except:
@@ -1188,14 +1199,14 @@ read_collection_rec(SerdReader* reader, ReadContext ctx)
TRY_RET(emit_statement(reader, NULL,
ctx.subject,
&reader->rdf_rest,
- &reader->rdf_nil, NULL, 0));
+ &reader->rdf_nil, NULL, 0, 0));
return false;
} else {
const Node rest = make_node(SERD_BLANK_ID, blank_id(reader));
TRY_RET(emit_statement(reader, ctx.graph,
ctx.subject,
&reader->rdf_rest,
- &rest, NULL, 0));
+ &rest, NULL, 0, 0));
ctx.subject = &rest;
ctx.predicate = &reader->rdf_first;
if (read_object(reader, ctx)) {
diff --git a/src/serd_internal.h b/src/serd_internal.h
index dd57af1e..e573a806 100644
--- a/src/serd_internal.h
+++ b/src/serd_internal.h
@@ -105,7 +105,7 @@ is_digit(const uint8_t c)
@param n_bytes (Output) Set to the size of @a str in bytes (incl. NULL).
*/
static inline size_t
-serd_strlen(const uint8_t* str, size_t* n_bytes)
+serd_strlen(const uint8_t* str, size_t* n_bytes, uint32_t* flags)
{
size_t n_chars = 0;
size_t i = 0;
@@ -113,6 +113,14 @@ serd_strlen(const uint8_t* str, size_t* n_bytes)
if ((str[i] & 0xC0) != 0x80) {
// Does not start with `10', start of a new character
++n_chars;
+ switch (str[i]) {
+ case '\r':
+ case '\n':
+ *flags |= SERD_HAS_NEWLINE;
+ break;
+ case '"':
+ *flags |= SERD_HAS_QUOTE;
+ }
}
}
if (n_bytes) {
diff --git a/src/writer.c b/src/writer.c
index 4e4ee2eb..a9cdf6ac 100644
--- a/src/writer.c
+++ b/src/writer.c
@@ -31,7 +31,7 @@ typedef struct {
} WriteContext;
static const WriteContext WRITE_CONTEXT_NULL = {
- { 0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}
+ { 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}
};
struct SerdWriterImpl {
@@ -47,9 +47,9 @@ struct SerdWriterImpl {
};
typedef enum {
- WRITE_NORMAL,
WRITE_URI,
- WRITE_STRING
+ WRITE_STRING,
+ WRITE_LONG_STRING
} TextContext;
static inline WriteContext*
@@ -67,23 +67,29 @@ write_text(SerdWriter* writer, TextContext ctx,
char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
for (size_t i = 0; i < n_bytes;) {
uint8_t in = utf8[i++];
- switch (in) {
- case '\\': writer->sink("\\\\", 2, writer->stream); continue;
- case '\n': writer->sink("\\n", 2, writer->stream); continue;
- case '\r': writer->sink("\\r", 2, writer->stream); continue;
- case '\t': writer->sink("\\t", 2, writer->stream); continue;
- case '"':
- if (terminator == '"') {
- writer->sink("\\\"", 2, writer->stream);
- continue;
- } // else fall-through
- default: break;
- }
+ if (ctx == WRITE_LONG_STRING) {
+ if (in == '\\') {
+ writer->sink("\\\\", 2, writer->stream); continue;
+ }
+ } else {
+ switch (in) {
+ case '\\': writer->sink("\\\\", 2, writer->stream); continue;
+ case '\n': writer->sink("\\n", 2, writer->stream); continue;
+ case '\r': writer->sink("\\r", 2, writer->stream); continue;
+ case '\t': writer->sink("\\t", 2, writer->stream); continue;
+ case '"':
+ if (terminator == '"') {
+ writer->sink("\\\"", 2, writer->stream);
+ continue;
+ } // else fall-through
+ default: break;
+ }
- if (in == terminator) {
- snprintf(escape, 7, "\\u%04X", terminator);
- writer->sink(escape, 6, writer->stream);
- continue;
+ if (in == terminator) {
+ snprintf(escape, 7, "\\u%04X", terminator);
+ writer->sink(escape, 6, writer->stream);
+ continue;
+ }
}
uint32_t c = 0;
@@ -109,7 +115,8 @@ write_text(SerdWriter* writer, TextContext ctx,
return false;
}
- if (ctx == WRITE_STRING && !(writer->style & SERD_STYLE_ASCII)) {
+ if ((ctx == WRITE_STRING || ctx == WRITE_LONG_STRING)
+ && !(writer->style & SERD_STYLE_ASCII)) {
// Write UTF-8 character directly to UTF-8 output
// TODO: Scan to next escape and write entire range at once
writer->sink(utf8 + i - 1, size, writer->stream);
@@ -228,9 +235,17 @@ write_node(SerdWriter* writer,
break;
}
}
- writer->sink("\"", 1, writer->stream);
- write_text(writer, WRITE_STRING, node->buf, node->n_bytes - 1, '"');
- writer->sink("\"", 1, writer->stream);
+ if (writer->syntax != SERD_NTRIPLES
+ && ((node->flags & SERD_HAS_NEWLINE)
+ || (node->flags & SERD_HAS_QUOTE))) {
+ writer->sink("\"\"\"", 3, writer->stream);
+ write_text(writer, WRITE_LONG_STRING, node->buf, node->n_bytes - 1, '\0');
+ writer->sink("\"\"\"", 3, writer->stream);
+ } else {
+ writer->sink("\"", 1, writer->stream);
+ write_text(writer, WRITE_STRING, node->buf, node->n_bytes - 1, '"');
+ writer->sink("\"", 1, writer->stream);
+ }
if (lang && lang->buf) {
writer->sink("@", 1, writer->stream);
writer->sink(lang->buf, lang->n_bytes - 1, writer->stream);