From efd2a2ecb17a8c633f634c2cf38fb8be48db9af5 Mon Sep 17 00:00:00 2001 From: David Robillard Date: Wed, 11 May 2011 16:03:03 +0000 Subject: Record whether strings contain quotes or newlines for smart selection of long strings vs short strings in writer output. git-svn-id: http://svn.drobilla.net/serd/trunk@167 490d8e77-9747-427b-9fa3-0b8f29cee8a0 --- serd/serd.h | 43 +++++++++++++++------------ src/env.c | 1 + src/node.c | 9 +++--- src/reader.c | 83 ++++++++++++++++++++++++++++++----------------------- src/serd_internal.h | 10 ++++++- src/writer.c | 61 ++++++++++++++++++++++++--------------- 6 files changed, 125 insertions(+), 82 deletions(-) diff --git a/serd/serd.h b/serd/serd.h index 351b8b8b..1e7da13c 100644 --- a/serd/serd.h +++ b/serd/serd.h @@ -113,28 +113,26 @@ typedef enum { This is more precise than the type of an abstract RDF node. An abstract node is either a resource, literal, or blank. In syntax there are two ways - to refer to both a resource (by URI or CURIE) and a blank (by ID or - anonymously). + to refer to a resource (by URI or CURIE) and two ways to refer to a blank + (by ID or anonymously). - Serd represents all nodes as an unquoted UTF-8 string "value" associated - with a @ref SerdType, which is precise enough to preserve the syntactic - information required for streaming abbreviation. A non-abbreviating sink - may simply consider @ref SERD_ANON_BEGIN and @ref SERD_ANON equivalent to - @ref SERD_BLANK_ID. + Serd represents a node as a string "value" associated with a @ref SerdType, + which is precise enough to support streaming abbreviation. If abbreviation + is not applicable, @ref SERD_ANON_BEGIN and @ref SERD_ANON may simply be + considered equivalent to @ref SERD_BLANK_ID. */ typedef enum { /** The type of a nonexistent node. - This type is occasionally useful, but is never emitted by the reader. + This type is useful as a sentinel, but is never emitted by the reader. */ SERD_NOTHING = 0, /** Literal value. 
- A literal optionally has either an associated language, or an associated - datatype (not both). + A literal optionally has either a language, or a datatype (not both). */ SERD_LITERAL = 1, @@ -142,8 +140,8 @@ typedef enum { URI (absolute or relative). Value is an unquoted URI string, which is either a relative reference - with respect to the current base URI, or an absolute URI. A URI is an - ID with universal scope. + with respect to the current base URI (e.g. "foo/bar"), or an absolute + URI (e.g. "http://example.org/foo"). @see RFC3986. */ SERD_URI = 2, @@ -160,8 +158,8 @@ typedef enum { /** A blank node ID. - Value is a blank node ID, e.g. "id3", which is valid only within this - serialisation. + Value is a blank node ID, e.g. "id3", which is meaningful only within + this serialisation. @see Turtle nodeID */ @@ -184,13 +182,22 @@ typedef enum { SERD_ANON = 6 } SerdType; +/** + Flags indicating certain string properties relevant to serialisation. +*/ +typedef enum { + SERD_HAS_NEWLINE = 1, /**< Contains line breaks ('\\n' or '\\r') */ + SERD_HAS_QUOTE = 1 << 1 /**< Contains quotes ('"') */ +} SerdNodeFlag; + /** A syntactic RDF node. */ typedef struct { - const uint8_t* buf; /**< Buffer */ + const uint8_t* buf; /**< Value string */ size_t n_bytes; /**< Size in bytes (including null) */ size_t n_chars; /**< Length in characters */ + uint32_t flags; /**< Bitwise OR of SerdNodeFlag values */ SerdType type; /**< Node type */ } SerdNode; @@ -223,8 +230,8 @@ typedef struct { The style of the writer output can be controlled by ORing together values from this enumeration. Note that some options are only supported - for some syntaxes (e.g. NTriples does not support any options except - @ref SERD_STYLE_ASCII, which is required). + for some syntaxes (e.g. NTriples does not support abbreviation and is + always ASCII). */ typedef enum { SERD_STYLE_ABBREVIATED = 1, /**< Abbreviate triples when possible. 
*/ @@ -279,7 +286,7 @@ serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream); @{ */ -static const SerdNode SERD_NODE_NULL = { 0, 0, 0, SERD_NOTHING }; +static const SerdNode SERD_NODE_NULL = { 0, 0, 0, 0, SERD_NOTHING }; /** Make a (shallow) node from @a str. diff --git a/src/env.c b/src/env.c index 6671d683..0d9bc128 100644 --- a/src/env.c +++ b/src/env.c @@ -206,6 +206,7 @@ serd_env_expand_node(const SerdEnv* env, SerdNode ret = { NULL, prefix.len + suffix.len + 1, prefix.len + suffix.len, // FIXME: UTF-8 + 0, SERD_URI }; ret.buf = malloc(ret.n_bytes); snprintf((char*)ret.buf, ret.n_bytes, "%s%s", prefix.buf, suffix.buf); diff --git a/src/node.c b/src/node.c index b6288ebc..edd0db86 100644 --- a/src/node.c +++ b/src/node.c @@ -23,9 +23,10 @@ SERD_API SerdNode serd_node_from_string(SerdType type, const uint8_t* buf) { - size_t buf_n_bytes; - const size_t buf_n_chars = serd_strlen(buf, &buf_n_bytes); - SerdNode ret = { buf, buf_n_bytes, buf_n_chars, type }; + uint32_t flags; + size_t buf_n_bytes; + const size_t buf_n_chars = serd_strlen(buf, &buf_n_bytes, &flags); + SerdNode ret = { buf, buf_n_bytes, buf_n_chars, flags, type }; return ret; } @@ -118,7 +119,7 @@ serd_node_new_uri(const SerdURI* uri, const SerdURI* base, SerdURI* out) const size_t len = serd_uri_string_length(&abs_uri); uint8_t* buf = malloc(len + 1); - SerdNode node = { buf, len + 1, len, SERD_URI }; // FIXME: UTF-8 + SerdNode node = { buf, len + 1, len, 0, SERD_URI }; // FIXME: UTF-8 uint8_t* ptr = buf; const size_t actual_len = serd_uri_serialise(&abs_uri, string_sink, &ptr); diff --git a/src/reader.c b/src/reader.c index e4e91f4c..b0ad02e5 100644 --- a/src/reader.c +++ b/src/reader.c @@ -269,9 +269,9 @@ public_node_from_ref(SerdReader* reader, SerdType type, Ref ref) if (!ref) { return SERD_NODE_NULL; } - const SerdString* str = deref(reader, ref); - const SerdNode public = { str->buf, str->n_bytes, str->n_chars, type }; - return public; + const SerdString* str = deref(reader, ref); 
+ const SerdNode node = { str->buf, str->n_bytes, str->n_chars, 0, type }; + return node; } static inline SerdNode @@ -287,16 +287,17 @@ public_node(SerdReader* reader, const Node* private) static inline bool emit_statement(SerdReader* reader, const Node* g, const Node* s, const Node* p, const Node* o, - const Node* d, Ref l) + const Node* d, Ref l, uint32_t f) { assert(s && p && o); assert(s->value && p->value && o->value); const SerdNode graph = public_node(reader, g); const SerdNode subject = public_node(reader, s); const SerdNode predicate = public_node(reader, p); - const SerdNode object = public_node(reader, o); + SerdNode object = public_node(reader, o); const SerdNode datatype = public_node(reader, d); const SerdNode lang = public_node_from_ref(reader, SERD_LITERAL, l); + object.flags = f; return !reader->statement_sink(reader->handle, &graph, &subject, @@ -389,7 +390,7 @@ read_character_escape(SerdReader* reader, Ref dest) } static inline bool -read_echaracter_escape(SerdReader* reader, Ref dest) +read_echaracter_escape(SerdReader* reader, Ref dest, uint32_t* flags) { switch (peek_byte(reader)) { case 't': @@ -397,10 +398,12 @@ read_echaracter_escape(SerdReader* reader, Ref dest) push_byte(reader, dest, '\t'); return true; case 'n': + *flags |= SERD_HAS_NEWLINE; eat_byte(reader, 'n'); push_byte(reader, dest, '\n'); return true; case 'r': + *flags |= SERD_HAS_NEWLINE; eat_byte(reader, 'r'); push_byte(reader, dest, '\r'); return true; @@ -410,26 +413,28 @@ read_echaracter_escape(SerdReader* reader, Ref dest) } static inline bool -read_scharacter_escape(SerdReader* reader, Ref dest) +read_scharacter_escape(SerdReader* reader, Ref dest, uint32_t* flags) { switch (peek_byte(reader)) { case '"': + *flags |= SERD_HAS_QUOTE; push_byte(reader, dest, eat_byte(reader, '"')); return true; default: - return read_echaracter_escape(reader, dest); + return read_echaracter_escape(reader, dest, flags); } } static inline bool read_ucharacter_escape(SerdReader* reader, Ref 
dest) { + uint32_t flags = 0; switch (peek_byte(reader)) { case '>': push_byte(reader, dest, eat_byte(reader, '>')); return true; default: - return read_echaracter_escape(reader, dest); + return read_echaracter_escape(reader, dest, &flags); } } @@ -477,11 +482,12 @@ read_character(SerdReader* reader, Ref dest) static inline SerdStatus read_echaracter(SerdReader* reader, Ref dest) { - uint8_t c = peek_byte(reader); + uint32_t flags = 0; + uint8_t c = peek_byte(reader); switch (c) { case '\\': eat_byte(reader, '\\'); - if (read_echaracter_escape(reader, peek_byte(reader))) { + if (read_echaracter_escape(reader, peek_byte(reader), &flags)) { return SERD_SUCCESS; } else { error(reader, "illegal escape `\\%c'\n", peek_byte(reader)); @@ -494,31 +500,34 @@ read_echaracter(SerdReader* reader, Ref dest) // [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD static inline SerdStatus -read_lcharacter(SerdReader* reader, Ref dest) +read_lcharacter(SerdReader* reader, Ref dest, uint32_t* flags) { const uint8_t c = peek_byte(reader); uint8_t pre[3]; switch (c) { case '"': - peek_string(reader, pre, 3); - if (pre[1] == '\"' && pre[2] == '\"') { + peek_string(reader, pre, 4); + if (pre[1] == '\"' && pre[2] == '\"' && pre[3] != '\"') { eat_byte(reader, '\"'); eat_byte(reader, '\"'); eat_byte(reader, '\"'); return SERD_FAILURE; } else { + *flags |= SERD_HAS_QUOTE; push_byte(reader, dest, eat_byte(reader, '"')); return SERD_SUCCESS; } case '\\': eat_byte(reader, '\\'); - if (read_scharacter_escape(reader, dest)) { + if (read_scharacter_escape(reader, dest, flags)) { return SERD_SUCCESS; } else { error(reader, "illegal escape `\\%c'\n", peek_byte(reader)); return SERD_ERR_BAD_SYNTAX; } - case 0x9: case 0xA: case 0xD: + case 0xA: case 0xD: + *flags |= SERD_HAS_NEWLINE; + case 0x9: push_byte(reader, dest, eat_byte(reader, c)); return SERD_SUCCESS; default: @@ -528,13 +537,13 @@ read_lcharacter(SerdReader* reader, Ref dest) // [42] scharacter ::= ( echaracter - #x22 ) | '\"' static 
inline SerdStatus -read_scharacter(SerdReader* reader, Ref dest) +read_scharacter(SerdReader* reader, Ref dest, uint32_t* flags) { uint8_t c = peek_byte(reader); switch (c) { case '\\': eat_byte(reader, '\\'); - if (read_scharacter_escape(reader, dest)) { + if (read_scharacter_escape(reader, dest, flags)) { return SERD_SUCCESS; } else { error(reader, "illegal escape `\\%c'\n", peek_byte(reader)); @@ -612,12 +621,12 @@ read_ws_plus(SerdReader* reader) // [37] longString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22 static Ref -read_longString(SerdReader* reader) +read_longString(SerdReader* reader, uint32_t* flags) { eat_string(reader, "\"\"\"", 3); Ref str = push_string(reader, "", 1); SerdStatus st; - while (!(st = read_lcharacter(reader, str))) {} + while (!(st = read_lcharacter(reader, str, flags))) {} if (st < SERD_ERR_UNKNOWN) { return str; } @@ -627,12 +636,12 @@ read_longString(SerdReader* reader) // [36] string ::= #x22 scharacter* #x22 static Ref -read_string(SerdReader* reader) +read_string(SerdReader* reader, uint32_t* flags) { eat_byte(reader, '\"'); Ref str = push_string(reader, "", 1); SerdStatus st; - while (!(st = read_scharacter(reader, str))) {} + while (!(st = read_scharacter(reader, str, flags))) {} if (st < SERD_ERR_UNKNOWN) { eat_byte(reader, '\"'); return str; @@ -643,7 +652,7 @@ read_string(SerdReader* reader) // [35] quotedString ::= string | longString static Ref -read_quotedString(SerdReader* reader) +read_quotedString(SerdReader* reader, uint32_t* flags) { uint8_t pre[3]; peek_string(reader, pre, 3); @@ -651,11 +660,11 @@ read_quotedString(SerdReader* reader) switch (pre[1]) { case '\"': if (pre[2] == '\"') - return read_longString(reader); + return read_longString(reader, flags); else - return read_string(reader); + return read_string(reader, flags); default: - return read_string(reader); + return read_string(reader, flags); } } @@ -893,14 +902,15 @@ read_resource(SerdReader* reader, Node* dest) // [14] literal ::= quotedString ( '@' 
language )? | datatypeString // | integer | double | decimal | boolean static bool -read_literal(SerdReader* reader, Node* dest, Node* datatype, Ref* lang) +read_literal(SerdReader* reader, Node* dest, + Node* datatype, Ref* lang, uint32_t* flags) { Ref str = 0; const uint8_t c = peek_byte(reader); if (c == '-' || c == '+' || c == '.' || is_digit(c)) { return read_number(reader, dest, datatype); } else if (c == '\"') { - str = read_quotedString(reader); + str = read_quotedString(reader, flags); if (!str) { return false; } @@ -994,7 +1004,7 @@ read_blank(SerdReader* reader, ReadContext ctx, Node* dest) if (ctx.subject) { TRY_RET(emit_statement(reader, ctx.graph, ctx.subject, ctx.predicate, - dest, NULL, 0)); + dest, NULL, 0, 0)); } return true; } @@ -1002,7 +1012,7 @@ read_blank(SerdReader* reader, ReadContext ctx, Node* dest) if (ctx.subject) { TRY_RET(emit_statement(reader, ctx.graph, ctx.subject, ctx.predicate, - dest, NULL, 0)); + dest, NULL, 0, 0)); dest->type = SERD_ANON; } ctx.subject = dest; @@ -1019,7 +1029,7 @@ read_blank(SerdReader* reader, ReadContext ctx, Node* dest) if (ctx.subject) { TRY_RET(emit_statement(reader, ctx.graph, ctx.subject, ctx.predicate, - dest, NULL, 0)); + dest, NULL, 0, 0)); } return true; } @@ -1060,6 +1070,7 @@ read_object(SerdReader* reader, ReadContext ctx) Node o = INTERNAL_NODE_NULL; Node datatype = INTERNAL_NODE_NULL; Ref lang = 0; + uint32_t flags = 0; const uint8_t c = peek_byte(reader); switch (c) { case '\0': @@ -1077,10 +1088,10 @@ read_object(SerdReader* reader, ReadContext ctx) case '\"': case '+': case '-': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang)); + TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags)); break; case '.': - TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang)); + TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags)); break; default: /* Either a 
boolean literal, or a qname. @@ -1108,7 +1119,7 @@ read_object(SerdReader* reader, ReadContext ctx) assert(o.value); ret = emit_statement(reader, ctx.graph, ctx.subject, ctx.predicate, - &o, &datatype, lang); + &o, &datatype, lang, flags); } except: @@ -1188,14 +1199,14 @@ read_collection_rec(SerdReader* reader, ReadContext ctx) TRY_RET(emit_statement(reader, NULL, ctx.subject, &reader->rdf_rest, - &reader->rdf_nil, NULL, 0)); + &reader->rdf_nil, NULL, 0, 0)); return false; } else { const Node rest = make_node(SERD_BLANK_ID, blank_id(reader)); TRY_RET(emit_statement(reader, ctx.graph, ctx.subject, &reader->rdf_rest, - &rest, NULL, 0)); + &rest, NULL, 0, 0)); ctx.subject = &rest; ctx.predicate = &reader->rdf_first; if (read_object(reader, ctx)) { diff --git a/src/serd_internal.h b/src/serd_internal.h index dd57af1e..e573a806 100644 --- a/src/serd_internal.h +++ b/src/serd_internal.h @@ -105,7 +105,7 @@ is_digit(const uint8_t c) @param n_bytes (Output) Set to the size of @a str in bytes (incl. NULL). 
*/ static inline size_t -serd_strlen(const uint8_t* str, size_t* n_bytes) +serd_strlen(const uint8_t* str, size_t* n_bytes, uint32_t* flags) { size_t n_chars = 0; size_t i = 0; @@ -113,6 +113,14 @@ serd_strlen(const uint8_t* str, size_t* n_bytes) if ((str[i] & 0xC0) != 0x80) { // Does not start with `10', start of a new character ++n_chars; + switch (str[i]) { + case '\r': + case '\n': + *flags |= SERD_HAS_NEWLINE; + break; + case '"': + *flags |= SERD_HAS_QUOTE; + } } } if (n_bytes) { diff --git a/src/writer.c b/src/writer.c index 4e4ee2eb..a9cdf6ac 100644 --- a/src/writer.c +++ b/src/writer.c @@ -31,7 +31,7 @@ typedef struct { } WriteContext; static const WriteContext WRITE_CONTEXT_NULL = { - { 0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} + { 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }; struct SerdWriterImpl { @@ -47,9 +47,9 @@ struct SerdWriterImpl { }; typedef enum { - WRITE_NORMAL, WRITE_URI, - WRITE_STRING + WRITE_STRING, + WRITE_LONG_STRING } TextContext; static inline WriteContext* @@ -67,23 +67,29 @@ write_text(SerdWriter* writer, TextContext ctx, char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (size_t i = 0; i < n_bytes;) { uint8_t in = utf8[i++]; - switch (in) { - case '\\': writer->sink("\\\\", 2, writer->stream); continue; - case '\n': writer->sink("\\n", 2, writer->stream); continue; - case '\r': writer->sink("\\r", 2, writer->stream); continue; - case '\t': writer->sink("\\t", 2, writer->stream); continue; - case '"': - if (terminator == '"') { - writer->sink("\\\"", 2, writer->stream); - continue; - } // else fall-through - default: break; - } + if (ctx == WRITE_LONG_STRING) { + if (in == '\\') { + writer->sink("\\\\", 2, writer->stream); continue; + } + } else { + switch (in) { + case '\\': writer->sink("\\\\", 2, writer->stream); continue; + case '\n': writer->sink("\\n", 2, writer->stream); continue; + case '\r': writer->sink("\\r", 2, writer->stream); continue; + case '\t': writer->sink("\\t", 2, writer->stream); continue; + case 
'"': + if (terminator == '"') { + writer->sink("\\\"", 2, writer->stream); + continue; + } // else fall-through + default: break; + } - if (in == terminator) { - snprintf(escape, 7, "\\u%04X", terminator); - writer->sink(escape, 6, writer->stream); - continue; + if (in == terminator) { + snprintf(escape, 7, "\\u%04X", terminator); + writer->sink(escape, 6, writer->stream); + continue; + } } uint32_t c = 0; @@ -109,7 +115,8 @@ write_text(SerdWriter* writer, TextContext ctx, return false; } - if (ctx == WRITE_STRING && !(writer->style & SERD_STYLE_ASCII)) { + if ((ctx == WRITE_STRING || ctx == WRITE_LONG_STRING) + && !(writer->style & SERD_STYLE_ASCII)) { // Write UTF-8 character directly to UTF-8 output // TODO: Scan to next escape and write entire range at once writer->sink(utf8 + i - 1, size, writer->stream); @@ -228,9 +235,17 @@ write_node(SerdWriter* writer, break; } } - writer->sink("\"", 1, writer->stream); - write_text(writer, WRITE_STRING, node->buf, node->n_bytes - 1, '"'); - writer->sink("\"", 1, writer->stream); + if (writer->syntax != SERD_NTRIPLES + && ((node->flags & SERD_HAS_NEWLINE) + || (node->flags & SERD_HAS_QUOTE))) { + writer->sink("\"\"\"", 3, writer->stream); + write_text(writer, WRITE_LONG_STRING, node->buf, node->n_bytes - 1, '\0'); + writer->sink("\"\"\"", 3, writer->stream); + } else { + writer->sink("\"", 1, writer->stream); + write_text(writer, WRITE_STRING, node->buf, node->n_bytes - 1, '"'); + writer->sink("\"", 1, writer->stream); + } if (lang && lang->buf) { writer->sink("@", 1, writer->stream); writer->sink(lang->buf, lang->n_bytes - 1, writer->stream); -- cgit v1.2.1