aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2011-01-20 07:31:58 +0000
committerDavid Robillard <d@drobilla.net>2011-01-20 07:31:58 +0000
commit0a62fc5f6aafd3e3f67d861634014d7e894c7bfd (patch)
tree570127d89143a64009bb753b2b6550507ae09cfa
parentfc2fe593097a523919ee71742081cbc6f3fc4c2c (diff)
downloadserd-0a62fc5f6aafd3e3f67d861634014d7e894c7bfd.tar.gz
serd-0a62fc5f6aafd3e3f67d861634014d7e894c7bfd.tar.bz2
serd-0a62fc5f6aafd3e3f67d861634014d7e894c7bfd.zip
Rework character reading functions to support reading multi-byte characters (take a string dest parameter instead of returning uchar).
Escape ntriples output. Pass all good read tests with output verification. git-svn-id: http://svn.drobilla.net/serd/trunk@8 490d8e77-9747-427b-9fa3-0b8f29cee8a0
-rw-r--r--doc/reference.doxygen.in8
-rw-r--r--serd/serd.h105
-rw-r--r--src/namespaces.c46
-rw-r--r--src/reader.c287
-rw-r--r--src/serdi.c84
-rw-r--r--src/string.c65
-rw-r--r--src/uri.c49
-rw-r--r--src/write.c170
-rw-r--r--wscript8
9 files changed, 472 insertions, 350 deletions
diff --git a/doc/reference.doxygen.in b/doc/reference.doxygen.in
index 66fda410..db972e34 100644
--- a/doc/reference.doxygen.in
+++ b/doc/reference.doxygen.in
@@ -270,7 +270,7 @@ SUBGROUPING = YES
# be useful for C code in case the coding convention dictates that all compound
# types are typedef'ed and only the typedef is referenced, never the tag name.
-TYPEDEF_HIDES_STRUCT = NO
+TYPEDEF_HIDES_STRUCT = YES
# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
# determine which symbols to keep in memory and which to flush to disk.
@@ -297,7 +297,7 @@ SYMBOL_CACHE_SIZE = 0
# Private class members and static file members will be hidden unless
# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
-EXTRACT_ALL = NO
+EXTRACT_ALL = YES
# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
# will be included in the documentation.
@@ -480,14 +480,14 @@ SHOW_DIRECTORIES = NO
# This will remove the Files entry from the Quick Index and from the
# Folder Tree View (if specified). The default is YES.
-SHOW_FILES = YES
+SHOW_FILES = NO
# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
# Namespaces page.
# This will remove the Namespaces entry from the Quick Index
# and from the Folder Tree View (if specified). The default is YES.
-SHOW_NAMESPACES = YES
+SHOW_NAMESPACES = NO
# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from
diff --git a/serd/serd.h b/serd/serd.h
index d3d2b03d..dd4b25b7 100644
--- a/serd/serd.h
+++ b/serd/serd.h
@@ -49,6 +49,10 @@
* @{
*/
+typedef struct SerdNamespacesImpl* SerdNamespaces;
+typedef struct SerdReaderImpl* SerdReader;
+
+
/** RDF syntax */
typedef enum {
SERD_TURTLE = 1,
@@ -57,56 +61,31 @@ typedef enum {
/** Type of RDF node. */
typedef enum {
- BLANK = 1,
- URI = 2,
- QNAME = 3,
- LITERAL = 4
+ BLANK = 1, ///< Blank node (resource with no URI)
+ URI = 2, ///< URI (universal identifier)
+ QNAME = 3, ///< CURIE/QName (URI shortened with a namespace)
+ LITERAL = 4 ///< Literal string (with optional lang or datatype)
} SerdNodeType;
-
-/** @name String
- * @{
- */
-
-/** Measured UTF-8 string. */
-typedef struct {
- size_t n_bytes;
- size_t n_chars;
- uint8_t buf[];
-} SerdString;
-
-/** Create a new UTF-8 string from @a utf8. */
-SERD_API
-SerdString*
-serd_string_new(const uint8_t* utf8);
-
-/** Copy @a string. */
-SERD_API
-SerdString*
-serd_string_copy(const SerdString* string);
-
-/** @} */
-
-
/** @name URIs
* @{
*/
-/** Range of memory. */
+/* Range of memory. */
typedef struct {
const uint8_t* buf;
size_t len;
} SerdRange;
-/** Parsed URI. */
+/* Parsed URI. */
typedef struct {
- SerdRange scheme;
- SerdRange authority;
- SerdRange path_base;
- SerdRange path;
- SerdRange query;
- SerdRange fragment;
- bool base_uri_has_authority;
+ SerdRange scheme; ///< Scheme
+ SerdRange authority; ///< Authority
+ SerdRange path_base; ///< Path prefix if relative
+ SerdRange path; ///< Path suffix
+ SerdRange query; ///< Query
+ SerdRange fragment; ///< Fragment
+ bool base_uri_has_authority; ///< True iff base URI has authority
} SerdURI;
/** Return true iff @a utf8 is a relative URI string. */
@@ -129,11 +108,52 @@ SERD_API
bool
serd_uri_write(const SerdURI* uri, FILE* file);
+/** Sink function for raw string output. */
+typedef size_t (*SerdSink)(const uint8_t* buf, size_t len, void* stream);
+
+/** Serialise @a uri with a series of calls to @a sink. */
+SERD_API
+size_t
+serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream);
+
+/** @} */
+
+/** @name String
+ * @{
+ */
+
+/** Measured UTF-8 string. */
+typedef struct {
+ size_t n_bytes; ///< Size in bytes including trailing null byte
+ size_t n_chars; ///< Length in characters
+ uint8_t buf[]; ///< Buffer
+} SerdString;
+
+/** Create a new UTF-8 string from @a utf8. */
+SERD_API
+SerdString*
+serd_string_new(const uint8_t* utf8);
+
+/** Copy @a string. */
+SERD_API
+SerdString*
+serd_string_copy(const SerdString* string);
+
/** Serialise @a uri to a string. */
SERD_API
SerdString*
-serd_uri_serialise(const SerdURI* uri,
- SerdURI* out);
+serd_string_new_from_uri(const SerdURI* uri,
+ SerdURI* out);
+
+SERD_API
+bool
+serd_write_node(FILE* file,
+ const SerdURI* base_uri,
+ SerdNamespaces ns,
+ SerdNodeType type,
+ const SerdString* str,
+ const SerdString* datatype,
+ const SerdString* lang);
/** @} */
@@ -142,9 +162,6 @@ serd_uri_serialise(const SerdURI* uri,
* @{
*/
-/** Reader. */
-typedef struct SerdReaderImpl* SerdReader;
-
/** Handler for base URI changes. */
typedef bool (*SerdBaseHandler)(void* handle,
const SerdString* uri);
@@ -194,8 +211,6 @@ serd_reader_free(SerdReader reader);
* @{
*/
-typedef struct SerdNamespacesImpl* SerdNamespaces;
-
/** Create a new namespaces dictionary. */
SERD_API
SerdNamespaces
diff --git a/src/namespaces.c b/src/namespaces.c
index fab53ea3..f18ebcff 100644
--- a/src/namespaces.c
+++ b/src/namespaces.c
@@ -32,48 +32,6 @@ struct SerdNamespacesImpl {
size_t n_namespaces;
};
-static inline size_t
-utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes)
-{
- size_t n_chars = 0;
- size_t i = 0;
- for (; utf8[i]; ++i) {
- if ((utf8[i] & 0xC0) != 0x80) {
- // Does not start with `10', start of a new character
- ++n_chars;
- }
- }
- if (out_n_bytes) {
- *out_n_bytes = i + 1;
- }
- return n_chars;
-}
-
-SERD_API
-SerdString*
-serd_string_new(const uint8_t* utf8)
-{
- size_t n_bytes;
- size_t n_chars = utf8_strlen(utf8, &n_bytes);
- SerdString* const str = malloc(sizeof(SerdString) + n_bytes);
- str->n_bytes = n_bytes;
- str->n_chars = n_chars;
- memcpy(str->buf, utf8, str->n_bytes);
- return str;
-}
-
-SERD_API
-SerdString*
-serd_string_copy(const SerdString* s)
-{
- if (s) {
- SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes);
- memcpy(copy, s, sizeof(SerdString) + s->n_bytes);
- return copy;
- }
- return NULL;
-}
-
SERD_API
SerdNamespaces
serd_namespaces_new()
@@ -148,9 +106,9 @@ serd_namespaces_expand(SerdNamespaces ns,
SerdNamespace* const record = serd_namespaces_find(ns, qname->buf, colon - qname->buf);
if (record) {
uri_prefix->buf = record->uri->buf;
- uri_prefix->len = record->uri->n_bytes;
+ uri_prefix->len = record->uri->n_bytes - 1;
uri_suffix->buf = colon + 1;
- uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 1;
+ uri_suffix->len = qname->n_bytes - (colon - qname->buf) - 2;
return true;
}
return false;
diff --git a/src/reader.c b/src/reader.c
index bf0eea11..52999180 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -140,22 +140,6 @@ readahead(SerdReader parser, uint8_t* pre, int n)
return true;
}
-static inline unsigned
-utf8_char_len(const uint8_t b0)
-{
- if ((b0 & 0x80) == 0) { // Starts with `0'
- return 1;
- } else if ((b0 & 0xE0) == 0xC0) { // Starts with `110'
- return 2;
- } else if ((b0 & 0xF0) == 0xE0) { // Starts with `1110'
- return 3;
- } else if ((b0 & 0xF8) == 0xF0) { // Starts with `11110'
- return 4;
- } else {
- return 0;
- }
-}
-
static inline uchar
peek_utf8_char(SerdReader parser, unsigned* n_bytes)
{
@@ -334,52 +318,82 @@ read_hex(SerdReader parser)
}
}
-static inline uchar
-read_hex_escape(SerdReader parser, unsigned length)
+static inline bool
+read_hex_escape(SerdReader parser, unsigned length, Ref dest)
{
- uchar ret = 0;
- uint8_t chars[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
- uint8_t code[4] = { 0, 0, 0, 0 };
+ uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
for (unsigned i = 0; i < length; ++i) {
- chars[i] = read_hex(parser);
- }
-
- sscanf((const char*)chars, "%X", (uint32_t*)code);
- const uint32_t code_num = *(uint32_t*)code;
- if (code_num < 0x80) {
- fprintf(stderr, "1 byte UTF-8 escape\n");
- return code[0];
- } else if (code_num < 0x800) {
- fprintf(stderr, "2 byte UTF-8 escape\n");
- fprintf(stderr, "B0 %X\n", code[0]);
- fprintf(stderr, "B1 %X\n", code[1]);
- fprintf(stderr, "B2 %X\n", code[2]);
- fprintf(stderr, "B3 %X\n", code[3]);
- ret = ((0xC0 + ((code[3] & 0x1F) << 2) + ((code[4] & 0xC0) >> 6)) << 8)
- + (code[4] & 0x3F);
- fprintf(stderr, "RET %X\n", ret);
- } else if (code_num < 0x10000) {
- fprintf(stderr, "3 byte UTF-8 escape\n");
+ buf[i] = read_hex(parser);
+ }
+
+ uint32_t c;
+ sscanf((const char*)buf, "%X", &c);
+
+ unsigned size = 0;
+ if (c < 0x00000080) {
+ size = 1;
+ } else if (c < 0x00000800) {
+ size = 2;
+ } else if (c < 0x00010000) {
+ size = 3;
+ } else if (c < 0x00200000) {
+ size = 4;
+ } else if (c < 0x04000000) {
+ size = 5;
+ } else if (c < 0x80000000) {
+ size = 6;
} else {
- fprintf(stderr, "4 byte UTF-8 escape\n");
+ return false;
}
- return ret;
+
+ // Build output in buf
+ // (Note # of bytes = # of leading 1 bits in first byte)
+ switch (size) {
+ case 6:
+ buf[5] = 0x80 | (uint8_t)(c & 0x3F);
+ c >>= 6;
+ c |= (4 << 24); // set bit 2
+ case 5:
+ buf[4] = 0x80 | (uint8_t)(c & 0x3F);
+ c >>= 6;
+ c |= (8 << 18); // set bit 3
+ case 4:
+ buf[3] = 0x80 | (uint8_t)(c & 0x3F);
+ c >>= 6;
+ c |= (16 << 12); // set bit 4
+ case 3:
+ buf[2] = 0x80 | (uint8_t)(c & 0x3F);
+ c >>= 6;
+ c |= (32 << 6); // set bit 5
+ case 2:
+ buf[1] = 0x80 | (uint8_t)(c & 0x3F);
+ c >>= 6;
+ c |= 0xC0; // set bits 6 and 7
+ case 1:
+ buf[0] = (uint8_t)c;
+ }
+
+ for (unsigned i = 0; i < size; ++i) {
+ push_char(parser, dest, buf[i]);
+ }
+ return true;
}
-static inline uchar
-character_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_character_escape(SerdReader parser, Ref dest)
{
- switch (esc) {
+ switch (peek_char(parser)) {
case '\\':
- return eat_char(parser, '\\');
+ push_char(parser, dest, eat_char(parser, '\\'));
+ return true;
case 'u':
- eat_char(parser, esc);
- return read_hex_escape(parser, 4);
+ eat_char(parser, 'u');
+ return read_hex_escape(parser, 4, dest);
case 'U':
- eat_char(parser, esc);
- return read_hex_escape(parser, 8);
+ eat_char(parser, 'U');
+ return read_hex_escape(parser, 8, dest);
default:
- return 0;
+ return false;
}
}
@@ -387,117 +401,124 @@ character_escape(SerdReader parser, const uchar esc)
// | '\U' hex hex hex hex hex hex hex hex
// | '\\'
// | [#x20-#x5B] | [#x5D-#x10FFFF]
-static inline uchar
-read_character(SerdReader parser)
+static inline bool
+read_character(SerdReader parser, Ref dest)
{
const uchar c = peek_char(parser);
- uchar esc;
switch (c) {
case '\\':
eat_char(parser, '\\');
- esc = character_escape(parser, peek_char(parser));
- if (esc) {
- return esc;
+ if (read_character_escape(parser, dest)) {
+ return true;
} else {
- return error(parser, "illegal escape `\\%c'\n", esc);
+ return error(parser, "invalid escape `\\%c'\n", peek_char(parser));
}
default:
if (in_range(c, 0x20, 0x5B) || in_range(c, 0x5D, 0x10FFF)) {
- return eat_char(parser, c);
+ push_char(parser, dest, eat_char(parser, c));
+ return true;
} else {
- return error(parser, "illegal character `%c'\n", c);
+ return error(parser, "invalid character `%c'\n", c);
}
}
}
-static inline uchar
-echaracter_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_echaracter_escape(SerdReader parser, Ref dest)
{
- const uchar ret = character_escape(parser, esc);
- if (ret) {
- return ret;
+ if (read_character_escape(parser, dest)) {
+ return true;
}
- switch (esc) {
+ switch (peek_char(parser)) {
case 't':
eat_char(parser, 't');
- return '\t';
+ push_char(parser, dest, '\t');
+ return true;
case 'n':
eat_char(parser, 'n');
- return '\n';
+ push_char(parser, dest, '\n');
+ return true;
case 'r':
eat_char(parser, 'r');
- return '\r';
+ push_char(parser, dest, '\r');
+ return true;
default:
- return 0;
+ return false;
}
}
// [39] echaracter ::= character | '\t' | '\n' | '\r'
-static inline uchar
-read_echaracter(SerdReader parser)
+static inline bool
+read_echaracter(SerdReader parser, Ref dest)
{
uchar c = peek_char(parser);
- uchar esc;
switch (c) {
case '\\':
eat_char(parser, '\\');
- esc = echaracter_escape(parser, peek_char(parser));
- if (esc) {
- return esc;
+ if (read_echaracter_escape(parser, dest)) { // second argument is the output Ref, not the peeked char
+ return true;
} else {
- return error(parser, "illegal escape `\\%c'\n", esc);
+ return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
}
default:
- return read_character(parser);
+ return read_character(parser, dest);
}
}
-static inline uchar
-scharacter_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_scharacter_escape(SerdReader parser, Ref dest)
{
- const uchar ret = echaracter_escape(parser, esc);
- if (ret) {
- return ret;
- } else if (esc == '"') {
- return eat_char(parser, '"');
+ if (read_echaracter_escape(parser, dest)) {
+ return true;
+ } else if (peek_char(parser) == '"') {
+ push_char(parser, dest, eat_char(parser, '"'));
+ return true;
}
- return 0;
+ return false;
}
-static inline uchar
-ucharacter_escape(SerdReader parser, const uchar esc)
+static inline bool
+read_ucharacter_escape(SerdReader parser, Ref dest)
{
- const uchar ret = echaracter_escape(parser, esc);
- if (ret) {
- return ret;
- } else if (esc == '>') {
- return eat_char(parser, '>');
+ if (read_echaracter_escape(parser, dest)) {
+ return true;
+ } else if (peek_char(parser) == '>') {
+ push_char(parser, dest, eat_char(parser, '>'));
+ return true;
}
- return 0;
+ return false;
}
// [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
-static inline uchar
-read_lcharacter(SerdReader parser, bool* is_escape)
+static inline bool
+read_lcharacter(SerdReader parser, Ref dest)
{
- *is_escape = false;
- const uchar c = peek_char(parser);
- uchar esc;
+ uchar c = peek_char(parser);
+ uint8_t pre[3] = { 0, 0, 0 }; // zeroed: the readahead result below is not checked
switch (c) {
+ case '"':
+ readahead(parser, pre, 3);
+ if (pre[1] == '\"' && pre[2] == '\"') {
+ eat_char(parser, '\"');
+ eat_char(parser, '\"');
+ eat_char(parser, '\"');
+ return false;
+ } else {
+ push_char(parser, dest, eat_char(parser, '"'));
+ return true;
+ }
case '\\':
eat_char(parser, '\\');
- esc = scharacter_escape(parser, peek_char(parser));
- if (esc) {
- *is_escape = true;
- return esc;
+ if (read_scharacter_escape(parser, dest)) {
+ return true;
} else {
- return error(parser, "illegal escape `\\%c'\n", esc);
+ return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
}
case 0x9: case 0xA: case 0xD:
- eat_char(parser, c);
- return c;
+ push_char(parser, dest, eat_char(parser, c));
+ return true; // function now returns bool; the character itself went to dest
default:
- return read_echaracter(parser);
+ return read_echaracter(parser, dest);
}
}
@@ -506,48 +527,39 @@ static inline bool
read_scharacter(SerdReader parser, Ref dest)
{
uchar c = peek_char(parser);
- uchar esc;
switch (c) {
case '\\':
eat_char(parser, '\\');
- esc = scharacter_escape(parser, peek_char(parser));
- if (esc) {
- push_char(parser, dest, esc);
+ if (read_scharacter_escape(parser, dest)) {
return true;
} else {
- return error(parser, "illegal escape `\\%c'\n", esc);
+ return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
}
case '\"':
return false;
default:
- c = read_character(parser);
- if (c) {
- push_char(parser, dest, c);
- }
- return c;
+ return read_character(parser, dest);
}
}
// Spec: [41] ucharacter ::= ( character - #x3E ) | '\>'
// Actual: [41] ucharacter ::= ( echaracter - #x3E ) | '\>'
-static inline uchar
-read_ucharacter(SerdReader parser)
+static inline bool
+read_ucharacter(SerdReader parser, Ref dest)
{
- const uchar c = peek_char(parser);
- uchar esc;
+ uchar c = peek_char(parser);
switch (c) {
case '\\':
eat_char(parser, '\\');
- esc = ucharacter_escape(parser, peek_char(parser));
- if (esc) {
- return esc;
+ if (read_ucharacter_escape(parser, dest)) {
+ return true;
} else {
- return error(parser, "illegal escape `\\%c'\n", esc);
+ return error(parser, "illegal escape `\\%c'\n", peek_char(parser));
}
case '>':
- return 0;
+ return false;
default:
- return read_character(parser);
+ return read_character(parser, dest);
}
}
@@ -607,21 +619,7 @@ read_longString(SerdReader parser)
{
eat_string(parser, "\"\"\"", 3);
Ref str = push_string(parser, "", 1);
- uchar c;
- bool is_escape = false;
- while ((c = read_lcharacter(parser, &is_escape)) != 0) {
- if (c == '\"' && !is_escape) {
- uint8_t pre[2];
- readahead(parser, pre, 2);
- if (pre[0] == '\"' && pre[1] == '\"') {
- eat_char(parser, '\"');
- eat_char(parser, '\"');
- return str;
- }
- }
- push_char(parser, str, c);
- }
- eat_string(parser, "\"\"\"", 3);
+ while (read_lcharacter(parser, str)) {}
return str;
}
@@ -658,11 +656,8 @@ read_quotedString(SerdReader parser)
static inline Ref
read_relativeURI(SerdReader parser)
{
- uchar c;
- Ref str = push_string(parser, "", 1);
- while ((c = read_ucharacter(parser)) != 0) {
- push_char(parser, str, c);
- }
+ Ref str = push_string(parser, "", 1);
+ while (read_ucharacter(parser, str)) {}
return str;
}
diff --git a/src/serdi.c b/src/serdi.c
index f5be93c3..3f91e040 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -49,7 +49,7 @@ event_base(void* handle,
assert(false);
return false;
}
- base_uri_str = serd_uri_serialise(&abs_base_uri, &base_uri);
+ base_uri_str = serd_string_new_from_uri(&abs_base_uri, &base_uri);
// FIXME: double parse
serd_uri_parse(base_uri_str->buf, &base_uri);
} else {
@@ -83,7 +83,7 @@ event_prefix(void* handle,
return false;
}
SerdURI new_abs_uri;
- SerdString* abs_uri_string = serd_uri_serialise(&abs_uri, &new_abs_uri);
+ SerdString* abs_uri_string = serd_string_new_from_uri(&abs_uri, &new_abs_uri);
serd_namespaces_add(state->ns, name, abs_uri_string);
} else {
serd_namespaces_add(state->ns, name, uri_string);
@@ -91,77 +91,6 @@ event_prefix(void* handle,
return true;
}
-static inline bool
-write_node(State* state,
- const SerdString* str,
- SerdNodeType type,
- const SerdString* datatype,
- const SerdString* lang)
-{
- SerdRange uri_prefix;
- SerdRange uri_suffix;
- switch (type) {
- case BLANK:
- fwrite("_:", 1, 2, state->out_fd);
- fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd);
- break;
- case QNAME:
- if (!serd_namespaces_expand(state->ns, str, &uri_prefix, &uri_suffix)) {
- fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf);
- return false;
- }
- fwrite("<", 1, 1, state->out_fd);
- fwrite(uri_prefix.buf, 1, uri_prefix.len - 1, state->out_fd);
- fwrite(uri_suffix.buf, 1, uri_suffix.len - 1, state->out_fd);
- fwrite(">", 1, 1, state->out_fd);
- break;
- case URI:
- if (serd_uri_string_is_relative(str->buf)) {
- SerdURI uri;
- if (serd_uri_parse(str->buf, &uri)) {
- SerdURI abs_uri;
- if (serd_uri_resolve(&uri, &state->base_uri, &abs_uri)) {
- fwrite("<", 1, 1, state->out_fd);
- serd_uri_write(&abs_uri, state->out_fd);
- fwrite(">", 1, 1, state->out_fd);
- return true;
- }
- }
- } else {
- fwrite("<", 1, 1, state->out_fd);
- fwrite(str->buf, 1, str->n_bytes - 1, state->out_fd);
- fwrite(">", 1, 1, state->out_fd);
- return true;
- }
- return false;
- case LITERAL:
- fwrite("\"", 1, 1, state->out_fd);
- for (size_t i = 0; i < str->n_bytes - 1; ++i) {
- const char c = str->buf[i];
- switch (c) {
- case '\\': fwrite("\\\\", 1, 2, state->out_fd); break;
- case '\n': fwrite("\\n", 1, 2, state->out_fd); break;
- case '\r': fwrite("\\r", 1, 2, state->out_fd); break;
- case '\t': fwrite("\\t", 1, 2, state->out_fd); break;
- case '"': fwrite("\\\"", 1, 2, state->out_fd); break;
- default:
- fwrite(&c, 1, 1, state->out_fd);
- }
- }
- fwrite("\"", 1, 1, state->out_fd);
- if (lang) {
- fwrite("@\"", 1, 2, state->out_fd);
- fwrite(lang->buf, 1, lang->n_bytes - 1, state->out_fd);
- fwrite("\"", 1, 1, state->out_fd);
- } else if (datatype) {
- fwrite("^^", 1, 2, state->out_fd);
- write_node(state, datatype, URI, NULL, NULL);
- }
- break;
- }
- return true;
-}
-
static bool
event_statement(void* handle,
const SerdString* graph,
@@ -176,11 +105,14 @@ event_statement(void* handle,
{
State* const state = (State*)handle;
FILE* const fd = state->out_fd;
- write_node(state, subject, subject_type, NULL, NULL);
+ serd_write_node(fd, &state->base_uri, state->ns,
+ subject_type, subject, NULL, NULL);
fwrite(" ", 1, 1, fd);
- write_node(state, predicate, predicate_type, NULL, NULL);
+ serd_write_node(fd, &state->base_uri, state->ns,
+ predicate_type, predicate, NULL, NULL);
fwrite(" ", 1, 1, fd);
- write_node(state, object, object_type, object_datatype, object_lang);
+ serd_write_node(fd, &state->base_uri, state->ns,
+ object_type, object, object_datatype, object_lang);
fwrite(" .\n", 1, 3, fd);
return true;
}
diff --git a/src/string.c b/src/string.c
new file mode 100644
index 00000000..301a98cc
--- /dev/null
+++ b/src/string.c
@@ -0,0 +1,65 @@
+/* Serd, an RDF serialisation library.
+ * Copyright 2011 David Robillard <d@drobilla.net>
+ *
+ * Serd is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Serd is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "serd/serd.h"
+
+static inline size_t
+utf8_strlen(const uint8_t* utf8, size_t* out_n_bytes)
+{
+ size_t n_chars = 0;
+ size_t i = 0;
+ for (; utf8[i]; ++i) {
+ if ((utf8[i] & 0xC0) != 0x80) {
+ // Does not start with `10', start of a new character
+ ++n_chars;
+ }
+ }
+ if (out_n_bytes) {
+ *out_n_bytes = i + 1;
+ }
+ return n_chars;
+}
+
+SERD_API
+SerdString*
+serd_string_new(const uint8_t* utf8)
+{
+ size_t n_bytes;
+ size_t n_chars = utf8_strlen(utf8, &n_bytes);
+ SerdString* const str = malloc(sizeof(SerdString) + n_bytes);
+ str->n_bytes = n_bytes;
+ str->n_chars = n_chars;
+ memcpy(str->buf, utf8, str->n_bytes);
+ return str;
+}
+
+SERD_API
+SerdString*
+serd_string_copy(const SerdString* s)
+{
+ if (s) {
+ SerdString* const copy = malloc(sizeof(SerdString) + s->n_bytes);
+ memcpy(copy, s, sizeof(SerdString) + s->n_bytes);
+ return copy;
+ }
+ return NULL;
+}
diff --git a/src/uri.c b/src/uri.c
index d98f07ff..1ff7a6d9 100644
--- a/src/uri.c
+++ b/src/uri.c
@@ -260,10 +260,9 @@ serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
return true;
}
-typedef size_t (*Sink)(const void* data, size_t size, size_t nmemb, void* stream);
-
-static size_t
-serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream)
+SERD_API
+size_t
+serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
{
/* See http://tools.ietf.org/html/rfc3986#section-5.3 */
@@ -271,16 +270,16 @@ serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream)
#define WRITE(buf, len) \
write_size += len; \
if (len) { \
- sink(buf, 1, len, stream); \
+ sink((const uint8_t*)buf, len, stream); \
}
#define WRITE_CHAR(c) WRITE(&(c), 1)
#define WRITE_COMPONENT(prefix, field, suffix) \
if ((field).len) { \
- for (const char* c = prefix; *c != '\0'; ++c) { \
+ for (const uint8_t* c = (const uint8_t*)prefix; *c != '\0'; ++c) { \
WRITE(c, 1); \
} \
WRITE((field).buf, (field).len); \
- for (const char* c = suffix; *c != '\0'; ++c) { \
+ for (const uint8_t* c = (const uint8_t*)suffix; *c != '\0'; ++c) { \
WRITE(c, 1); \
} \
}
@@ -354,26 +353,9 @@ serd_uri_serialise_internal(const SerdURI* uri, Sink sink, void* stream)
// Note uri->fragment.buf includes the leading `#'
WRITE_COMPONENT("", uri->fragment, "");
}
- WRITE("\0", 1);
return write_size;
}
-SERD_API
-bool
-serd_uri_write(const SerdURI* uri, FILE* file)
-{
- //#if 0
- SerdURI flat_uri;
- SerdString* const flat_uri_str = serd_uri_serialise(uri, &flat_uri);
- if (flat_uri_str) {
- fwrite(flat_uri_str->buf, 1, flat_uri_str->n_bytes - 1, file);
- free(flat_uri_str);
- return true;
- }
- return false;
- //#endif
- //return (serd_uri_serialise_internal(uri, (Sink)fwrite, file) > 0);
-}
static size_t
serd_uri_string_length(const SerdURI* uri)
@@ -393,18 +375,17 @@ serd_uri_string_length(const SerdURI* uri)
}
static size_t
-string_write(const void* data, size_t size, size_t nmemb, void* stream)
+string_sink(const uint8_t* buf, size_t len, void* stream)
{
- uint8_t** ptr = (uint8_t**)stream;
- const size_t write_size = (size * nmemb);
- memcpy(*ptr, data, write_size);
- *ptr += write_size;
- return nmemb;
+ uint8_t** ptr = (uint8_t**)stream;
+ memcpy(*ptr, buf, len);
+ *ptr += len;
+ return len;
}
SERD_API
SerdString*
-serd_uri_serialise(const SerdURI* uri, SerdURI* out)
+serd_string_new_from_uri(const SerdURI* uri, SerdURI* out)
{
const size_t len = serd_uri_string_length(uri);
SerdString* str = malloc(sizeof(SerdString) + len + 1);
@@ -412,10 +393,10 @@ serd_uri_serialise(const SerdURI* uri, SerdURI* out)
str->n_chars = len; // FIXME: UTF-8
uint8_t* ptr = str->buf;
- const size_t actual_len = serd_uri_serialise_internal(uri, string_write, &ptr);
+ const size_t actual_len = serd_uri_serialise(uri, string_sink, &ptr);
- str->buf[actual_len] = '\0';
- str->n_bytes = actual_len;
+ str->buf[actual_len] = '\0'; // serd_uri_serialise no longer writes the terminator itself
+ str->n_bytes = actual_len + 1;
str->n_chars = str->n_bytes - 1;
#ifdef URI_DEBUG
diff --git a/src/write.c b/src/write.c
new file mode 100644
index 00000000..c2b92d78
--- /dev/null
+++ b/src/write.c
@@ -0,0 +1,170 @@
+/* Serd, an RDF serialisation library.
+ * Copyright 2011 David Robillard <d@drobilla.net>
+ *
+ * Serd is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Serd is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "serd/serd.h"
+
+static size_t
+file_sink(const uint8_t* buf, size_t len, void* stream)
+{
+ FILE* file = (FILE*)stream;
+ return fwrite(buf, 1, len, file);
+}
+
+static inline bool
+serd_write_uri(FILE* file, const SerdURI* uri)
+{
+ return serd_uri_serialise(uri, file_sink, file);
+}
+
+static bool
+serd_write_ascii(const uint8_t* utf8, size_t n_bytes, FILE* out_fd, const uint8_t esc)
+{ // Writes utf8 as ASCII: printable bytes verbatim, everything else as an escape
+ for (size_t i = 0; i < n_bytes;) {
+ uint8_t in = utf8[i++];
+ switch (in) {
+ case '\\': fwrite("\\\\", 1, 2, out_fd); continue;
+ case '\n': fwrite("\\n", 1, 2, out_fd); continue;
+ case '\r': fwrite("\\r", 1, 2, out_fd); continue;
+ case '\t': fwrite("\\t", 1, 2, out_fd); continue;
+ case '"': if (esc == '"') { fwrite("\\\"", 1, 2, out_fd); continue; } // else fall through
+ default: break;
+ }
+
+ if (in == esc) {
+ fprintf(out_fd, "\\u%04X", esc);
+ continue;
+ }
+
+ uint32_t c = 0;
+ size_t size = 0;
+ if ((in & 0x80) == 0) { // Starts with `0'
+ size = 1;
+ c = in & 0x7F;
+ if ((in >= 0x20) && (in <= 0x7E)) { // Printable ASCII
+ fwrite(&in, 1, 1, out_fd);
+ continue;
+ }
+ } else if ((in & 0xE0) == 0xC0) { // Starts with `110'
+ size = 2;
+ c = in & 0x1F;
+ } else if ((in & 0xF0) == 0xE0) { // Starts with `1110'
+ size = 3;
+ c = in & 0x0F;
+ } else if ((in & 0xF8) == 0xF0) { // Starts with `11110'
+ size = 4;
+ c = in & 0x07;
+ } else if ((in & 0xFC) == 0xF8) { // Starts with `111110'
+ size = 5;
+ c = in & 0x03;
+ } else if ((in & 0xFE) == 0xFC) { // Starts with `1111110'
+ size = 6;
+ c = in & 0x01;
+ } else {
+ fprintf(stderr, "invalid UTF-8 at offset %zu: %X\n", i - 1, in);
+ return false;
+ }
+
+#define READ_BYTE() do { \
+ if (i >= n_bytes) { fprintf(stderr, "truncated UTF-8\n"); return false; } \
+ in = utf8[i++] & 0x3f; \
+ c <<= 6; \
+ c |= in; \
+ } while (0)
+
+ switch (size) {
+ case 6: READ_BYTE();
+ case 5: READ_BYTE();
+ case 4: READ_BYTE();
+ case 3: READ_BYTE();
+ case 2: READ_BYTE();
+ }
+
+ if (c <= 0xFFFF) { // U+FFFF still fits the 4-digit form
+ fprintf(out_fd, "\\u%04X", c);
+ } else {
+ fprintf(out_fd, "\\U%08X", c);
+ }
+ }
+ return true;
+}
+
+SERD_API
+bool
+serd_write_node(FILE* fd,
+ const SerdURI* base_uri,
+ SerdNamespaces ns,
+ SerdNodeType type,
+ const SerdString* str,
+ const SerdString* datatype,
+ const SerdString* lang)
+{
+ SerdRange uri_prefix;
+ SerdRange uri_suffix;
+ switch (type) {
+ case BLANK:
+ fwrite("_:", 1, 2, fd);
+ fwrite(str->buf, 1, str->n_bytes - 1, fd);
+ break;
+ case QNAME:
+ if (!serd_namespaces_expand(ns, str, &uri_prefix, &uri_suffix)) {
+ fprintf(stderr, "error: undefined namespace prefix `%s'\n", str->buf);
+ return false;
+ }
+ fwrite("<", 1, 1, fd);
+ serd_write_ascii(uri_prefix.buf, uri_prefix.len, fd, '>');
+ serd_write_ascii(uri_suffix.buf, uri_suffix.len, fd, '>');
+ fwrite(">", 1, 1, fd);
+ break;
+ case URI:
+ if (serd_uri_string_is_relative(str->buf)) {
+ SerdURI uri;
+ if (serd_uri_parse(str->buf, &uri)) {
+ SerdURI abs_uri;
+ if (serd_uri_resolve(&uri, base_uri, &abs_uri)) {
+ fwrite("<", 1, 1, fd);
+ serd_write_uri(fd, &abs_uri);
+ fwrite(">", 1, 1, fd);
+ return true;
+ }
+ }
+ } else {
+ fwrite("<", 1, 1, fd);
+ serd_write_ascii(str->buf, str->n_bytes - 1, fd, '>');
+ fwrite(">", 1, 1, fd);
+ return true;
+ }
+ return false;
+ case LITERAL:
+ fwrite("\"", 1, 1, fd);
+ serd_write_ascii(str->buf, str->n_bytes - 1, fd, '"');
+ fwrite("\"", 1, 1, fd);
+ if (lang) {
+ fwrite("@\"", 1, 2, fd);
+ fwrite(lang->buf, 1, lang->n_bytes - 1, fd);
+ fwrite("\"", 1, 1, fd);
+ } else if (datatype) {
+ fwrite("^^", 1, 2, fd);
+ serd_write_node(fd, base_uri, ns, URI, datatype, NULL, NULL);
+ }
+ break;
+ }
+ return true;
+}
+
diff --git a/wscript b/wscript
index 3367977e..b701c98c 100644
--- a/wscript
+++ b/wscript
@@ -54,7 +54,13 @@ def build(bld):
# Pkgconfig file
autowaf.build_pc(bld, 'SERD', SERD_VERSION, ['REDLAND'])
- lib_source = 'src/reader.c src/namespaces.c src/uri.c'
+ lib_source = '''
+ src/namespaces.c
+ src/reader.c
+ src/string.c
+ src/uri.c
+ src/write.c
+ '''
# Library
obj = bld(features = 'c cshlib')