diff options
author | David Robillard <d@drobilla.net> | 2023-02-05 18:39:49 -0500 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 343124df71010055c2c1e6cdcadd13d23b2c013a (patch) | |
tree | 7c2de6a72021adaac89e9c4fa97e7cc5503e0657 | |
parent | 530edb265fbbed20e6d3a6fd7a36461ff83d9b46 (diff) | |
download | serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.gz serd-343124df71010055c2c1e6cdcadd13d23b2c013a.tar.bz2 serd-343124df71010055c2c1e6cdcadd13d23b2c013a.zip |
[WIP] Add support for URI hex escape decoding
46 files changed, 555 insertions, 149 deletions
diff --git a/.clang-tidy b/.clang-tidy index ba762f8a..e19963f0 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -8,11 +8,12 @@ Checks: > -clang-diagnostic-unused-macros, -llvmlibc-*, -modernize-macro-to-enum, - -readability-function-cognitive-complexity, -readability-identifier-length, CheckOptions: - key: hicpp-uppercase-literal-suffix.NewSuffixes value: 'L;U;UL;ULL' + - key: readability-function-cognitive-complexity.IgnoreMacros + value: 'true' - key: readability-uppercase-literal-suffix.NewSuffixes value: 'L;U;UL;ULL' FormatStyle: file diff --git a/doc/man/serd-pipe.1 b/doc/man/serd-pipe.1 index 54e3a3b1..9ee3ee54 100644 --- a/doc/man/serd-pipe.1 +++ b/doc/man/serd-pipe.1 @@ -152,6 +152,11 @@ Note that this may corrupt the output by merging distinct blank nodes. Generate blank node labels with suffixes left-padded with zeros. This generates IDs like "_:b0000000123" that sort in numerical order, which can be useful to preserve statement ordering. +.It Cm decoded +Read URIs with percent-encoded UTF-8 characters decoded. +Normally, percent-encoded octets in URIs are preserved as plain text. +This flag enables interpreting them as UTF-8, +decoding escapes like "%7E" to characters like "~" where possible. .El .It Fl O Ar syntax Set an output syntax or option. @@ -168,13 +173,32 @@ or an option: .Bl -tag -width 3n .It Cm ascii Escape all non-ASCII characters. +Normally, text is written in UTF-8. +This flag will escape additional non-printable-ASCII characters in string literals like +.Li \eu00B7 +or +.Li \eU0001F600 , +and in URIs like +.Li %C2%B7 +or +.Li %F0%9F%98%80 . +.It Cm escapes +Escape all non-ASCII characters with +.Dq U +escapes. +This works like +.Cm ascii , +except percent-encoding will not be used in URIs +(matching the format used in the Turtle test suite). .It Cm contextual Suppress writing directives that describe the context. -Normally when writing Turtle or Trig, -a document will have a header that defines all the prefixes used in the input. 
-This flag will disable writing those directives, -so the output is document fragment with an implicit context. -This can be useful for writing output intended for humans. +This can be used to suppress the header of +.Li prefix +and +.Li base +directives, +making the output depend on an implied context. +Note that this option may produce incomprehensible output if prefixes change while writing! .It Cm expanded Write expanded URIs instead of prefixed names. .It Cm lax diff --git a/include/serd/reader.h b/include/serd/reader.h index 78b51d00..57c8b2c3 100644 --- a/include/serd/reader.h +++ b/include/serd/reader.h @@ -101,6 +101,15 @@ typedef enum { from documents when the statements are sorted, such as in a model. */ SERD_READ_ORDERED = 1U << 5U, + + /** + Read URIs with percent-encoded UTF-8 characters decoded. + + Normally, percent-encoded octets in URIs are treated as plain text and + preserved. This flag enables UTF-8 decoding of URIs, so octet escapes + like "%7E" in URIs will be decoded to UTF-8 characters like "~". + */ + SERD_READ_DECODED = 1U << 6U, } SerdReaderFlag; /// Bitwise OR of SerdReaderFlag values diff --git a/include/serd/writer.h b/include/serd/writer.h index bf54c46c..e3915a86 100644 --- a/include/serd/writer.h +++ b/include/serd/writer.h @@ -42,6 +42,10 @@ typedef enum { Although all the supported syntaxes are UTF-8 by definition, this can be used to escape all non-ASCII characters so that data will survive transmission through ASCII-only channels. + + Non-printable-ASCII characters will be written as "U" escapes like + "\u007F" in string literals, and as hex-encoded UTF-8 bytes like "%7F" in + URIs. */ SERD_WRITE_ASCII = 1U << 0U, @@ -96,6 +100,18 @@ typedef enum { implicit context, so it will only be readable in a suitable enviromnent. */ SERD_WRITE_CONTEXTUAL = 1U << 6U, + + /** + Escape additional characters in RDF Test Cases format. 
+ + This writes "extended" characters as printable ASCII, using "U" escapes in + URIs instead of hex-encoding (escapes like "\u007F" instead of "%7F"). + This is the format used by the outputs in the Turtle test suite (which + predates RDF 1.1 NTriples). This style makes NTriples output + non-canonical, so it generally shouldn't be used except for compatibility + purposes. See <https://www.w3.org/TR/rdf-testcases/>. + */ + SERD_WRITE_ESCAPES = 1U << 7U, } SerdWriterFlag; /// Bitwise OR of #SerdWriterFlag values diff --git a/src/read_ntriples.c b/src/read_ntriples.c index dd5c28fc..5c02abfe 100644 --- a/src/read_ntriples.c +++ b/src/read_ntriples.c @@ -108,6 +108,67 @@ read_IRI_scheme(SerdReader* const reader, SerdNode* const dest) return st ? st : SERD_BAD_SYNTAX; } +static SerdStatus +read_hex_byte(SerdReader* const reader, uint8_t digits[const 2]) +{ + for (unsigned i = 0U; i < 2U; ++i) { + if (!(digits[i] = read_HEX(reader))) { + return SERD_BAD_SYNTAX; + } + } + + return SERD_SUCCESS; +} + +static uint8_t +hex_byte_value(const uint8_t c0, const uint8_t c1) +{ + return (uint8_t)((hex_digit_value(c0) << 4U) | hex_digit_value(c1)); +} + +/// RFC3986 S2.1: pct-encoded = "%" HEXDIG HEXDIG +static SerdStatus +read_pct_encoded(SerdReader* const reader, SerdNode* const node) +{ + SerdStatus st = SERD_SUCCESS; + uint8_t hex[9] = {0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U}; + + // Read first percent-encoded byte + TRY(st, read_hex_byte(reader, hex)); + + // Parse the leading byte and get the encoded size from it + uint8_t byte = hex_byte_value(hex[0], hex[1]); + const uint32_t size = utf8_num_bytes(byte); + if (!size) { + return SERD_BAD_TEXT; + } + + // Avoid decoding '%' itself + if (byte == '%') { + return push_bytes(reader, node, (const uint8_t*)"%25", 3); + } + + // Push the leading byte to the node + TRY(st, push_byte(reader, node, byte)); + + // Read remaining hex-encoded bytes + for (unsigned i = 1; i < size; ++i) { + const unsigned offset = 2U * i; + uint8_t* const 
digits = hex + offset; + TRY(st, eat_byte_check(reader, '%')); + TRY(st, read_hex_byte(reader, digits)); + + byte = hex_byte_value(digits[0], digits[1]); + if (!is_utf8_continuation(byte)) { + return SERD_BAD_TEXT; + } + + TRY(st, push_byte(reader, node, byte)); + } + + return st; +} + SerdStatus read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) { @@ -131,6 +192,11 @@ read_IRIREF_suffix(SerdReader* const reader, SerdNode* const node) case '>': return SERD_SUCCESS; + case '%': + st = (reader->flags & SERD_READ_DECODED) ? read_pct_encoded(reader, node) + : push_byte(reader, node, c); + break; + case '\\': if (!(st = read_UCHAR(reader, node, &code)) && (code == ' ' || code == '<' || code == '>')) { diff --git a/src/writer.c b/src/writer.c index fd52a123..ccb24e5f 100644 --- a/src/writer.c +++ b/src/writer.c @@ -53,6 +53,13 @@ typedef struct { bool comma_indented; } WriteContext; +/// A status for an operation that reads/writes variable numbers of bytes +typedef struct { + SerdStatus status; + size_t read_count; + size_t write_count; +} VariableResult; + static const WriteContext WRITE_CONTEXT_NULL = {CTX_NAMED, 0U, NULL, NULL, NULL, 0U, 0U}; @@ -280,47 +287,127 @@ esink(const void* buf, size_t len, SerdWriter* writer) return sink(buf, len, writer) == len ? 
SERD_SUCCESS : SERD_BAD_WRITE; } -// Write a single character as a Unicode escape -// (Caller prints any single byte characters that don't need escaping) -static size_t -write_character(SerdWriter* const writer, - const uint8_t* const utf8, - uint8_t* const size, - SerdStatus* const st) +static VariableResult +write_UCHAR(SerdWriter* const writer, const uint8_t* const utf8) { + VariableResult result = {SERD_SUCCESS, 0U, 0U}; char escape[11] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const uint32_t c = parse_utf8_char(utf8, size); - switch (*size) { - case 0: - *st = w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]); - return 0; - case 1: - snprintf(escape, sizeof(escape), "\\u%04X", utf8[0]); - return sink(escape, 6, writer); - default: - break; + uint8_t c_size = 0U; + const uint32_t c = parse_utf8_char(utf8, &c_size); + + result.read_count = c_size; + if (result.read_count == 0U) { + result.status = + w_err(writer, SERD_BAD_TEXT, "invalid UTF-8 start: %X", utf8[0]); + } else if (c <= 0xFFFF) { + // Write short (4 digit) escape + snprintf(escape, sizeof(escape), "\\u%04X", c); + result.write_count = sink(escape, 6, writer); + } else { + // Write long (6 digit) escape + snprintf(escape, sizeof(escape), "\\U%08X", c); + result.write_count = sink(escape, 10, writer); } - if (!(writer->flags & SERD_WRITE_ASCII)) { - // Write UTF-8 character directly to UTF-8 output - return sink(utf8, *size, writer); + return result; +} + +SERD_NODISCARD static VariableResult +write_percent_encoded_bytes(SerdWriter* const writer, + const size_t size, + const uint8_t* const data) +{ + static const char hex_chars[] = "0123456789ABCDEF"; + + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + char escape[4] = {'%', 0, 0, 0}; + + for (size_t i = 0U; !result.status && i < size; ++i) { + const uint8_t byte = data[i]; + escape[1] = hex_chars[byte >> 4U]; + escape[2] = hex_chars[byte & 0x0FU]; + + const size_t n_written = sink(escape, 3U, writer); + result.write_count += n_written; + 
if (n_written != 3U) { + result.status = SERD_BAD_WRITE; + } + + ++result.read_count; } - if (c <= 0xFFFF) { - snprintf(escape, sizeof(escape), "\\u%04X", c); - return sink(escape, 6, writer); + return result; +} + +static VariableResult +write_text_character(SerdWriter* const writer, const uint8_t* const utf8) +{ + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + const uint8_t c = utf8[0]; + + if ((writer->flags & (SERD_WRITE_ASCII | SERD_WRITE_ESCAPES)) || c < 0x20U || + c == 0x7FU) { + // Write ASCII-compatible UCHAR escape like "\u1234" + return write_UCHAR(writer, utf8); + } + + // Parse the leading byte to get the UTF-8 encoding size + if (!(result.read_count = utf8_num_bytes(c))) { + result.status = SERD_BAD_TEXT; + return result; + } + + // Write the UTF-8 encoding directly to the output + result.write_count = sink(utf8, result.read_count, writer); + if (result.write_count != result.read_count) { + result.status = SERD_BAD_WRITE; + } + + return result; +} + +static VariableResult +write_uri_character(SerdWriter* const writer, const uint8_t* const utf8) +{ + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + const uint8_t c = utf8[0]; + + if ((writer->flags & SERD_WRITE_ESCAPES)) { + return write_UCHAR(writer, utf8); + } + + if (c == '%') { + // Avoid encoding '%' itself + result.read_count = 1; + result.write_count = sink("%25", 3, writer); + return result; + } + + if ((c & 0x80U) && !(writer->flags & SERD_WRITE_ASCII)) { + // Parse the leading byte to get the UTF-8 encoding size + if (!(result.read_count = utf8_num_bytes(c))) { + result.status = SERD_BAD_TEXT; + } else { + // Write the UTF-8 encoding directly to the output + result.write_count = sink(utf8, result.read_count, writer); + if (result.write_count != result.read_count) { + result.status = SERD_BAD_WRITE; + } + } + + return result; } - snprintf(escape, sizeof(escape), "\\U%08X", c); - return sink(escape, 10, writer); + return write_percent_encoded_bytes(writer, 1U, utf8); } static bool 
-uri_must_escape(const int c) +uri_must_escape(const uint8_t c) { switch (c) { case ' ': case '"': + // case '%': case '<': case '>': case '\\': @@ -336,58 +423,60 @@ uri_must_escape(const int c) } static size_t -write_uri(SerdWriter* writer, const char* utf8, size_t n_bytes, SerdStatus* st) +next_text_index(const char* utf8, + const size_t begin, + const size_t end, + bool (*const predicate)(uint8_t)) { - size_t len = 0; - for (size_t i = 0; i < n_bytes;) { - size_t j = i; // Index of next character that must be escaped - for (; j < n_bytes; ++j) { - if (uri_must_escape(utf8[j])) { - break; - } - } - - // Bulk write all characters up to this special one - const size_t n_bulk = sink(&utf8[i], j - i, writer); - len += n_bulk; - if (n_bulk != j - i) { - *st = SERD_BAD_WRITE; - return len; - } + size_t i = begin; + while (i < end && !predicate((uint8_t)utf8[i])) { + ++i; + } + return i; +} +static VariableResult +write_uri(SerdWriter* writer, const char* utf8, const size_t n_bytes) +{ + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + for (size_t i = 0; i < n_bytes;) { + // Write leading chunk as a single fast bulk write + const size_t j = next_text_index(utf8, i, n_bytes, uri_must_escape); + result.status = esink(&utf8[i], j - i, writer); if ((i = j) == n_bytes) { break; // Reached end } - // Write UTF-8 character - uint8_t size = 0; - len += write_character(writer, (const uint8_t*)utf8 + i, &size, st); - i += size; - if (*st && !(writer->flags & SERD_WRITE_LAX)) { + // Write character (escape or UTF-8) + const VariableResult r = + write_uri_character(writer, (const uint8_t*)utf8 + i); + i += r.read_count; + result.write_count += r.write_count; + if (r.status && !(writer->flags & SERD_WRITE_LAX)) { + result.status = r.status; break; } - if (size == 0) { + if (r.read_count == 0) { // Corrupt input, write percent-encoded bytes and scan to next start char escape[4] = {0, 0, 0, 0}; for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) { snprintf(escape, 
sizeof(escape), "%%%02X", (uint8_t)utf8[i]); - len += sink(escape, 3, writer); + result.write_count += sink(escape, 3, writer); } } } - return len; + return result; } SERD_NODISCARD static SerdStatus ewrite_uri(SerdWriter* writer, const char* utf8, size_t n_bytes) { - SerdStatus st = SERD_SUCCESS; - write_uri(writer, utf8, n_bytes, &st); + const VariableResult r = write_uri(writer, utf8, n_bytes); - return (st == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX)) - ? st + return (r.status == SERD_BAD_WRITE || !(writer->flags & SERD_WRITE_LAX)) + ? r.status : SERD_SUCCESS; } @@ -398,27 +487,6 @@ write_uri_from_node(SerdWriter* writer, const SerdNode* node) } SERD_NODISCARD static SerdStatus -write_utf8_percent_escape(SerdWriter* const writer, - const char* const utf8, - const size_t n_bytes) -{ - static const char hex_chars[] = "0123456789ABCDEF"; - - SerdStatus st = SERD_SUCCESS; - char escape[4] = {'%', 0, 0, 0}; - - for (size_t i = 0U; i < n_bytes; ++i) { - const uint8_t byte = (uint8_t)utf8[i]; - escape[1] = hex_chars[byte >> 4U]; - escape[2] = hex_chars[byte & 0x0FU]; - - TRY(st, esink(escape, 3, writer)); - } - - return st; -} - -SERD_NODISCARD static SerdStatus write_PN_LOCAL_ESC(SerdWriter* const writer, const char c) { const char buf[2] = {'\\', c}; @@ -431,7 +499,8 @@ write_lname_escape(SerdWriter* writer, const char* const utf8, size_t n_bytes) { return is_PN_LOCAL_ESC(utf8[0]) ? write_PN_LOCAL_ESC(writer, utf8[0]) - : write_utf8_percent_escape(writer, utf8, n_bytes); + : write_percent_encoded_bytes(writer, n_bytes, (const uint8_t*)utf8) + .status; } SERD_NODISCARD static SerdStatus @@ -518,14 +587,16 @@ write_short_string_escape(SerdWriter* const writer, const char c) case '\r': return sink("\\r", 2, writer); case '\t': - return sink("\\t", 2, writer); + return (writer->flags & SERD_WRITE_ESCAPES) ? 
sink("\\t", 2, writer) + : sink("\t", 1, writer); case '"': return sink("\\\"", 2, writer); default: break; } - if (writer->syntax == SERD_TURTLE) { + if (!(writer->flags & SERD_WRITE_ESCAPES)) { + // These are written with UCHAR in pre-NTriples test cases format switch (c) { case '\b': return sink("\\b", 2, writer); @@ -539,63 +610,84 @@ write_short_string_escape(SerdWriter* const writer, const char c) return 0; } -static bool -text_must_escape(const char c) +SERD_NODISCARD static bool +text_must_escape(const uint8_t c) { return c == '\\' || c == '"' || !in_range(c, 0x20, 0x7E); } SERD_NODISCARD static SerdStatus -write_text(SerdWriter* writer, - TextContext ctx, - const char* utf8, - size_t n_bytes) +write_short_text(SerdWriter* writer, const char* utf8, size_t n_bytes) { - size_t n_consecutive_quotes = 0; - SerdStatus st = SERD_SUCCESS; - for (size_t i = 0; !st && i < n_bytes;) { - if (utf8[i] != '"') { - n_consecutive_quotes = 0; + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + for (size_t i = 0; !result.status && i < n_bytes;) { + // Write leading chunk as a single fast bulk write + const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape); + result.status = esink(&utf8[i], j - i, writer); + if ((i = j) == n_bytes) { + break; // Reached end + } + + // Try to write character as a special short escape (newline and friends) + const char in = utf8[i]; + const size_t escape_len = write_short_string_escape(writer, in); + + if (!escape_len) { + // No special escape for this character, write full Unicode escape + result = write_text_character(writer, (const uint8_t*)utf8 + i); + i += result.read_count; + + if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) { + // Corrupt input, write replacement character and scan to the next start + result.status = + esink(replacement_char, sizeof(replacement_char), writer); + i += next_text_index(utf8, i, n_bytes, is_utf8_leading); + } + } else { + ++i; } + } + + return result.status; +} - // Scan for the 
longest chunk of characters that can be written directly - size_t j = i; - for (; j < n_bytes && !text_must_escape(utf8[j]); ++j) { +SERD_NODISCARD static SerdStatus +write_long_text(SerdWriter* writer, const char* utf8, size_t n_bytes) +{ + size_t n_quotes = 0; + VariableResult result = {SERD_SUCCESS, 0U, 0U}; + for (size_t i = 0; !result.status && i < n_bytes;) { + if (utf8[i] != '"') { + n_quotes = 0; } - // Write chunk as a single fast bulk write - st = esink(&utf8[i], j - i, writer); + // Write leading chunk as a single fast bulk write + const size_t j = next_text_index(utf8, i, n_bytes, text_must_escape); + result.status = esink(&utf8[i], j - i, writer); if ((i = j) == n_bytes) { break; // Reached end } - // Try to write character as a special short escape (newline and friends) - const char in = utf8[i++]; - size_t escape_len = 0; - if (ctx == WRITE_LONG_STRING) { - n_consecutive_quotes = (in == '\"') ? (n_consecutive_quotes + 1) : 0; - escape_len = write_long_string_escape( - writer, n_consecutive_quotes, i == n_bytes, in); - } else { - escape_len = write_short_string_escape(writer, in); - } + // Try to write character as a special long escape (newline and friends) + const char in = utf8[i]; + n_quotes = (in == '\"') ? 
(n_quotes + 1U) : 0; + const size_t escape_len = + write_long_string_escape(writer, n_quotes, i + 1U == n_bytes, in); - if (escape_len == 0) { + if (!escape_len) { // No special escape for this character, write full Unicode escape - uint8_t size = 0; - write_character(writer, (const uint8_t*)utf8 + i - 1, &size, &st); - if (st && !(writer->flags & SERD_WRITE_LAX)) { - return st; - } - - if (size == 0) { - // Corrupt input, write replacement character and scan to the next start - st = esink(replacement_char, sizeof(replacement_char), writer); - for (; i < n_bytes && !is_utf8_leading((uint8_t)utf8[i]); ++i) { - } - } else { - i += size - 1U; + result = write_UCHAR(writer, (const uint8_t*)utf8 + i); + i += result.read_count; + + if (!result.read_count && (writer->flags & SERD_WRITE_LAX)) { + // Corrupt input, write replacement character and scan to the next + // start + result.status = + esink(replacement_char, sizeof(replacement_char), writer); + i += next_text_index(utf8, i, n_bytes, is_utf8_leading); } + } else { + ++i; } } @@ -615,8 +707,10 @@ uri_sink(const void* buf, size_t size, size_t nmemb, void* stream) UriSinkContext* const context = (UriSinkContext*)stream; SerdWriter* const writer = context->writer; + const VariableResult r = write_uri(writer, (const char*)buf, nmemb); - return write_uri(writer, (const char*)buf, nmemb, &context->status); + context->status = r.status; + return r.write_count; } SERD_NODISCARD static SerdStatus @@ -774,11 +868,11 @@ write_literal(SerdWriter* const writer, if (supports_abbrev(writer) && (node->flags & SERD_IS_LONG)) { TRY(st, esink("\"\"\"", 3, writer)); - TRY(st, write_text(writer, WRITE_LONG_STRING, node_str, node->length)); + TRY(st, write_long_text(writer, node_str, node->length)); TRY(st, esink("\"\"\"", 3, writer)); } else { TRY(st, esink("\"", 1, writer)); - TRY(st, write_text(writer, WRITE_STRING, node_str, node->length)); + TRY(st, write_short_text(writer, node_str, node->length)); TRY(st, esink("\"", 1, writer)); } 
if (lang && serd_node_string(lang)) { @@ -1390,11 +1484,14 @@ serd_writer_set_base_uri(SerdWriter* writer, const SerdNode* uri) if (uri && (writer->syntax == SERD_TURTLE || writer->syntax == SERD_TRIG)) { TRY(st, terminate_context(writer)); - TRY(st, esink("@base <", 7, writer)); - TRY(st, esink(uri_string.data, uri_string.length, writer)); - TRY(st, esink(">", 1, writer)); - writer->last_sep = SEP_NODE; - TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT)); + + if (!(writer->flags & SERD_WRITE_CONTEXTUAL)) { + TRY(st, esink("@base <", 7, writer)); + TRY(st, esink(uri_string.data, uri_string.length, writer)); + TRY(st, esink(">", 1, writer)); + writer->last_sep = SEP_NODE; + TRY(st, write_sep(writer, writer->context.flags, SEP_END_DIRECT)); + } } return reset_context(writer, RESET_GRAPH | RESET_INDENT); diff --git a/test/extra/ascii/UTF-8.nt b/test/extra/ascii/UTF-8.nt new file mode 100644 index 00000000..cb47d4d9 --- /dev/null +++ b/test/extra/ascii/UTF-8.nt @@ -0,0 +1,2 @@ +<http://example.org/text> <http://www.w3.org/2000/01/rdf-schema#comment> "\nUTF-8 encoded sample plain-text file\n\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\u203E\n\nMarkus Kuhn [\u02C8ma\u02B3k\u028As ku\u02D0n] <http://www.cl.cam.ac.uk/~mgk25/> \u2014 2002-07-25\n\n\nThe ASCII compatible UTF-8 encoding used in this plain-text file\nis defined in Unicode, ISO 10646-1, and RFC 2279.\n\n\nUsing Unicode/UTF-8, you can write in emails and source code things such as\n\nMathematics and sciences:\n\n \u222E E\u22C5da = Q, n \u2192 \u221E, \u2211 f(i) = \u220F g(i), \u23A7\u23A1\u239B\u250C\u2500\u2500\u2500\u2500\u2500\u2510\u239E\u23A4\u23AB\n \u23AA\u23A2\u239C\u2502a\u00B2+b\u00B3 \u239F\u23A5\u23AA\n \u2200x\u2208\u211D: \u2308x\u2309 = \u2212\u230A\u2212x\u230B, \u03B1 \u2227 \u00AC\u03B2 = \u00AC(\u00AC\u03B1 
\u2228 \u03B2), \u23AA\u23A2\u239C\u2502\u2500\u2500\u2500\u2500\u2500 \u239F\u23A5\u23AA\n \u23AA\u23A2\u239C\u23B7 c\u2088 \u239F\u23A5\u23AA\n \u2115 \u2286 \u2115\u2080 \u2282 \u2124 \u2282 \u211A \u2282 \u211D \u2282 \u2102, \u23A8\u23A2\u239C \u239F\u23A5\u23AC\n \u23AA\u23A2\u239C \u221E \u239F\u23A5\u23AA\n \u22A5 < a \u2260 b \u2261 c \u2264 d \u226A \u22A4 \u21D2 (\u27E6A\u27E7 \u21D4 \u27EAB\u27EB), \u23AA\u23A2\u239C \u23B2 \u239F\u23A5\u23AA\n \u23AA\u23A2\u239C \u23B3a\u2071-b\u2071\u239F\u23A5\u23AA\n 2H\u2082 + O\u2082 \u21CC 2H\u2082O, R = 4.7 k\u03A9, \u2300 200 mm \u23A9\u23A3\u239Di=1 \u23A0\u23A6\u23AD\n\nLinguistics and dictionaries:\n\n \u00F0i \u0131nt\u0259\u02C8n\u00E6\u0283\u0259n\u0259l f\u0259\u02C8n\u025Bt\u0131k \u0259so\u028Asi\u02C8e\u0131\u0283n\n Y [\u02C8\u028Fpsil\u0254n], Yen [j\u025Bn], Yoga [\u02C8jo\u02D0g\u0251]\n\nAPL:\n\n ((V\u2373V)=\u2373\u2374V)/V\u2190,V \u2337\u2190\u2373\u2192\u2374\u2206\u2207\u2283\u203E\u234E\u2355\u2308\n\nNicer typography in plain text files:\n\n \u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557\n \u2551 \u2551\n \u2551 \u2022 \u2018single\u2019 and \u201Cdouble\u201D quotes \u2551\n \u2551 \u2551\n \u2551 \u2022 Curly apostrophes: \u201CWe\u2019ve been here\u201D \u2551\n \u2551 \u2551\n \u2551 \u2022 Latin-1 apostrophe and accents: '\u00B4` \u2551\n \u2551 \u2551\n \u2551 \u2022 \u201Adeutsche\u2018 \u201EAnf\u00FChrungszeichen\u201C \u2551\n \u2551 \u2551\n \u2551 \u2022 \u2020, \u2021, \u2030, \u2022, 3\u20134, \u2014, \u22125/+5, \u2122, \u2026 \u2551\n \u2551 \u2551\n \u2551 \u2022 ASCII safety test: 1lI|, 0OD, 8B \u2551\n \u2551 \u256D\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256E \u2551\n \u2551 \u2022 the euro symbol: \u2502 14.95 \u20AC \u2502 \u2551\n \u2551 
\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256F \u2551\n \u255A\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255D\n\nCombining characters:\n\n STARG\u039B\u030ATE SG-1, a = v\u0307 = r\u0308, a\u20D1 \u22A5 b\u20D1\n\nGreek (in Polytonic):\n\n The Greek anthem:\n\n \u03A3\u1F72 \u03B3\u03BD\u03C9\u03C1\u1F77\u03B6\u03C9 \u1F00\u03C0\u1F78 \u03C4\u1F74\u03BD \u03BA\u1F79\u03C8\u03B7\n \u03C4\u03BF\u1FE6 \u03C3\u03C0\u03B1\u03B8\u03B9\u03BF\u1FE6 \u03C4\u1F74\u03BD \u03C4\u03C1\u03BF\u03BC\u03B5\u03C1\u1F75,\n \u03C3\u1F72 \u03B3\u03BD\u03C9\u03C1\u1F77\u03B6\u03C9 \u1F00\u03C0\u1F78 \u03C4\u1F74\u03BD \u1F44\u03C8\u03B7\n \u03C0\u03BF\u1F7A \u03BC\u1F72 \u03B2\u1F77\u03B1 \u03BC\u03B5\u03C4\u03C1\u1F71\u03B5\u03B9 \u03C4\u1F74 \u03B3\u1FC6.\n\n \u1FBF\u0391\u03C0\u1FBF \u03C4\u1F70 \u03BA\u1F79\u03BA\u03BA\u03B1\u03BB\u03B1 \u03B2\u03B3\u03B1\u03BB\u03BC\u1F73\u03BD\u03B7\n \u03C4\u1FF6\u03BD \u1FFE\u0395\u03BB\u03BB\u1F75\u03BD\u03C9\u03BD \u03C4\u1F70 \u1F31\u03B5\u03C1\u1F71\n \u03BA\u03B1\u1F76 \u03C3\u1F70\u03BD \u03C0\u03C1\u1FF6\u03C4\u03B1 \u1F00\u03BD\u03B4\u03C1\u03B5\u03B9\u03C9\u03BC\u1F73\u03BD\u03B7\n \u03C7\u03B1\u1FD6\u03C1\u03B5, \u1F66 \u03C7\u03B1\u1FD6\u03C1\u03B5, \u1FBF\u0395\u03BB\u03B5\u03C5\u03B8\u03B5\u03C1\u03B9\u1F71!\n\n From a speech of Demosthenes in the 4th century BC:\n\n \u039F\u1F50\u03C7\u1F76 \u03C4\u03B1\u1F50\u03C4\u1F70 \u03C0\u03B1\u03C1\u1F77\u03C3\u03C4\u03B1\u03C4\u03B1\u1F77 \u03BC\u03BF\u03B9 \u03B3\u03B9\u03B3\u03BD\u1F7D\u03C3\u03BA\u03B5\u03B9\u03BD, \u1F66 \u1F04\u03BD\u03B4\u03C1\u03B5\u03C2 \u1FBF\u0391\u03B8\u03B7\u03BD\u03B1\u1FD6\u03BF\u03B9,\n \u1F45\u03C4\u03B1\u03BD \u03C4\u1FBF \u03B5\u1F30\u03C2 \u03C4\u1F70 \u03C0\u03C1\u1F71\u03B3\u03BC\u03B1\u03C4\u03B1 
\u1F00\u03C0\u03BF\u03B2\u03BB\u1F73\u03C8\u03C9 \u03BA\u03B1\u1F76 \u1F45\u03C4\u03B1\u03BD \u03C0\u03C1\u1F78\u03C2 \u03C4\u03BF\u1F7A\u03C2\n \u03BB\u1F79\u03B3\u03BF\u03C5\u03C2 \u03BF\u1F53\u03C2 \u1F00\u03BA\u03BF\u1F7B\u03C9\u0387 \u03C4\u03BF\u1F7A\u03C2 \u03BC\u1F72\u03BD \u03B3\u1F70\u03C1 \u03BB\u1F79\u03B3\u03BF\u03C5\u03C2 \u03C0\u03B5\u03C1\u1F76 \u03C4\u03BF\u1FE6\n \u03C4\u03B9\u03BC\u03C9\u03C1\u1F75\u03C3\u03B1\u03C3\u03B8\u03B1\u03B9 \u03A6\u1F77\u03BB\u03B9\u03C0\u03C0\u03BF\u03BD \u1F41\u03C1\u1FF6 \u03B3\u03B9\u03B3\u03BD\u03BF\u03BC\u1F73\u03BD\u03BF\u03C5\u03C2, \u03C4\u1F70 \u03B4\u1F72 \u03C0\u03C1\u1F71\u03B3\u03BC\u03B1\u03C4\u1FBF\n \u03B5\u1F30\u03C2 \u03C4\u03BF\u1FE6\u03C4\u03BF \u03C0\u03C1\u03BF\u1F75\u03BA\u03BF\u03BD\u03C4\u03B1, \u1F65\u03C3\u03B8\u1FBF \u1F45\u03C0\u03C9\u03C2 \u03BC\u1F74 \u03C0\u03B5\u03B9\u03C3\u1F79\u03BC\u03B5\u03B8\u1FBF \u03B1\u1F50\u03C4\u03BF\u1F76\n \u03C0\u03C1\u1F79\u03C4\u03B5\u03C1\u03BF\u03BD \u03BA\u03B1\u03BA\u1FF6\u03C2 \u03C3\u03BA\u1F73\u03C8\u03B1\u03C3\u03B8\u03B1\u03B9 \u03B4\u1F73\u03BF\u03BD. \u03BF\u1F50\u03B4\u1F73\u03BD \u03BF\u1F56\u03BD \u1F04\u03BB\u03BB\u03BF \u03BC\u03BF\u03B9 \u03B4\u03BF\u03BA\u03BF\u1FE6\u03C3\u03B9\u03BD\n \u03BF\u1F31 \u03C4\u1F70 \u03C4\u03BF\u03B9\u03B1\u1FE6\u03C4\u03B1 \u03BB\u1F73\u03B3\u03BF\u03BD\u03C4\u03B5\u03C2 \u1F22 \u03C4\u1F74\u03BD \u1F51\u03C0\u1F79\u03B8\u03B5\u03C3\u03B9\u03BD, \u03C0\u03B5\u03C1\u1F76 \u1F27\u03C2 \u03B2\u03BF\u03C5\u03BB\u03B5\u1F7B\u03B5\u03C3\u03B8\u03B1\u03B9,\n \u03BF\u1F50\u03C7\u1F76 \u03C4\u1F74\u03BD \u03BF\u1F56\u03C3\u03B1\u03BD \u03C0\u03B1\u03C1\u03B9\u03C3\u03C4\u1F71\u03BD\u03C4\u03B5\u03C2 \u1F51\u03BC\u1FD6\u03BD \u1F01\u03BC\u03B1\u03C1\u03C4\u1F71\u03BD\u03B5\u03B9\u03BD. 
\u1F10\u03B3\u1F7C \u03B4\u1F73, \u1F45\u03C4\u03B9 \u03BC\u1F73\u03BD\n \u03C0\u03BF\u03C4\u1FBF \u1F10\u03BE\u1FC6\u03BD \u03C4\u1FC7 \u03C0\u1F79\u03BB\u03B5\u03B9 \u03BA\u03B1\u1F76 \u03C4\u1F70 \u03B1\u1F51\u03C4\u1FC6\u03C2 \u1F14\u03C7\u03B5\u03B9\u03BD \u1F00\u03C3\u03C6\u03B1\u03BB\u1FF6\u03C2 \u03BA\u03B1\u1F76 \u03A6\u1F77\u03BB\u03B9\u03C0\u03C0\u03BF\u03BD\n \u03C4\u03B9\u03BC\u03C9\u03C1\u1F75\u03C3\u03B1\u03C3\u03B8\u03B1\u03B9, \u03BA\u03B1\u1F76 \u03BC\u1F71\u03BB\u1FBF \u1F00\u03BA\u03C1\u03B9\u03B2\u1FF6\u03C2 \u03BF\u1F36\u03B4\u03B1\u0387 \u1F10\u03C0\u1FBF \u1F10\u03BC\u03BF\u1FE6 \u03B3\u1F71\u03C1, \u03BF\u1F50 \u03C0\u1F71\u03BB\u03B1\u03B9\n \u03B3\u1F73\u03B3\u03BF\u03BD\u03B5\u03BD \u03C4\u03B1\u1FE6\u03C4\u1FBF \u1F00\u03BC\u03C6\u1F79\u03C4\u03B5\u03C1\u03B1\u0387 \u03BD\u1FE6\u03BD \u03BC\u1F73\u03BD\u03C4\u03BF\u03B9 \u03C0\u1F73\u03C0\u03B5\u03B9\u03C3\u03BC\u03B1\u03B9 \u03C4\u03BF\u1FE6\u03B8\u1FBF \u1F31\u03BA\u03B1\u03BD\u1F78\u03BD\n \u03C0\u03C1\u03BF\u03BB\u03B1\u03B2\u03B5\u1FD6\u03BD \u1F21\u03BC\u1FD6\u03BD \u03B5\u1F36\u03BD\u03B1\u03B9 \u03C4\u1F74\u03BD \u03C0\u03C1\u1F7D\u03C4\u03B7\u03BD, \u1F45\u03C0\u03C9\u03C2 \u03C4\u03BF\u1F7A\u03C2 \u03C3\u03C5\u03BC\u03BC\u1F71\u03C7\u03BF\u03C5\u03C2\n \u03C3\u1F7D\u03C3\u03BF\u03BC\u03B5\u03BD. 
\u1F10\u1F70\u03BD \u03B3\u1F70\u03C1 \u03C4\u03BF\u1FE6\u03C4\u03BF \u03B2\u03B5\u03B2\u03B1\u1F77\u03C9\u03C2 \u1F51\u03C0\u1F71\u03C1\u03BE\u1FC3, \u03C4\u1F79\u03C4\u03B5 \u03BA\u03B1\u1F76 \u03C0\u03B5\u03C1\u1F76 \u03C4\u03BF\u1FE6\n \u03C4\u1F77\u03BD\u03B1 \u03C4\u03B9\u03BC\u03C9\u03C1\u1F75\u03C3\u03B5\u03C4\u03B1\u1F77 \u03C4\u03B9\u03C2 \u03BA\u03B1\u1F76 \u1F43\u03BD \u03C4\u03C1\u1F79\u03C0\u03BF\u03BD \u1F10\u03BE\u1F73\u03C3\u03C4\u03B1\u03B9 \u03C3\u03BA\u03BF\u03C0\u03B5\u1FD6\u03BD\u0387 \u03C0\u03C1\u1F76\u03BD \u03B4\u1F72\n \u03C4\u1F74\u03BD \u1F00\u03C1\u03C7\u1F74\u03BD \u1F40\u03C1\u03B8\u1FF6\u03C2 \u1F51\u03C0\u03BF\u03B8\u1F73\u03C3\u03B8\u03B1\u03B9, \u03BC\u1F71\u03C4\u03B1\u03B9\u03BF\u03BD \u1F21\u03B3\u03BF\u1FE6\u03BC\u03B1\u03B9 \u03C0\u03B5\u03C1\u1F76 \u03C4\u1FC6\u03C2\n \u03C4\u03B5\u03BB\u03B5\u03C5\u03C4\u1FC6\u03C2 \u1F41\u03BD\u03C4\u03B9\u03BD\u03BF\u1FE6\u03BD \u03C0\u03BF\u03B9\u03B5\u1FD6\u03C3\u03B8\u03B1\u03B9 \u03BB\u1F79\u03B3\u03BF\u03BD.\n\n \u0394\u03B7\u03BC\u03BF\u03C3\u03B8\u1F73\u03BD\u03BF\u03C5\u03C2, \u0393\u1FFD \u1FBF\u039F\u03BB\u03C5\u03BD\u03B8\u03B9\u03B1\u03BA\u1F78\u03C2\n\nGeorgian:\n\n From a Unicode conference invitation:\n\n \u10D2\u10D7\u10EE\u10DD\u10D5\u10D7 \u10D0\u10EE\u10DA\u10D0\u10D5\u10D4 \u10D2\u10D0\u10D8\u10D0\u10E0\u10DD\u10D7 \u10E0\u10D4\u10D2\u10D8\u10E1\u10E2\u10E0\u10D0\u10EA\u10D8\u10D0 Unicode-\u10D8\u10E1 \u10DB\u10D4\u10D0\u10D7\u10D4 \u10E1\u10D0\u10D4\u10E0\u10D7\u10D0\u10E8\u10DD\u10E0\u10D8\u10E1\u10DD\n \u10D9\u10DD\u10DC\u10E4\u10D4\u10E0\u10D4\u10DC\u10EA\u10D8\u10D0\u10D6\u10D4 \u10D3\u10D0\u10E1\u10D0\u10E1\u10EC\u10E0\u10D4\u10D1\u10D0\u10D3, \u10E0\u10DD\u10DB\u10D4\u10DA\u10D8\u10EA \u10D2\u10D0\u10D8\u10DB\u10D0\u10E0\u10D7\u10D4\u10D1\u10D0 10-12 \u10DB\u10D0\u10E0\u10E2\u10E1,\n \u10E5. \u10DB\u10D0\u10D8\u10DC\u10EA\u10E8\u10D8, \u10D2\u10D4\u10E0\u10DB\u10D0\u10DC\u10D8\u10D0\u10E8\u10D8. 
\u10D9\u10DD\u10DC\u10E4\u10D4\u10E0\u10D4\u10DC\u10EA\u10D8\u10D0 \u10E8\u10D4\u10F0\u10D9\u10E0\u10D4\u10D1\u10E1 \u10D4\u10E0\u10D7\u10D0\u10D3 \u10DB\u10E1\u10DD\u10E4\u10DA\u10D8\u10DD\u10E1\n \u10D4\u10E5\u10E1\u10DE\u10D4\u10E0\u10E2\u10D4\u10D1\u10E1 \u10D8\u10E1\u10D4\u10D7 \u10D3\u10D0\u10E0\u10D2\u10D4\u10D1\u10E8\u10D8 \u10E0\u10DD\u10D2\u10DD\u10E0\u10D8\u10EA\u10D0\u10D0 \u10D8\u10DC\u10E2\u10D4\u10E0\u10DC\u10D4\u10E2\u10D8 \u10D3\u10D0 Unicode-\u10D8,\n \u10D8\u10DC\u10E2\u10D4\u10E0\u10DC\u10D0\u10EA\u10D8\u10DD\u10DC\u10D0\u10DA\u10D8\u10D6\u10D0\u10EA\u10D8\u10D0 \u10D3\u10D0 \u10DA\u10DD\u10D9\u10D0\u10DA\u10D8\u10D6\u10D0\u10EA\u10D8\u10D0, Unicode-\u10D8\u10E1 \u10D2\u10D0\u10DB\u10DD\u10E7\u10D4\u10DC\u10D4\u10D1\u10D0\n \u10DD\u10DE\u10D4\u10E0\u10D0\u10EA\u10D8\u10E3\u10DA \u10E1\u10D8\u10E1\u10E2\u10D4\u10DB\u10D4\u10D1\u10E1\u10D0, \u10D3\u10D0 \u10D2\u10D0\u10DB\u10DD\u10E7\u10D4\u10DC\u10D4\u10D1\u10D8\u10D7 \u10DE\u10E0\u10DD\u10D2\u10E0\u10D0\u10DB\u10D4\u10D1\u10E8\u10D8, \u10E8\u10E0\u10D8\u10E4\u10E2\u10D4\u10D1\u10E8\u10D8,\n \u10E2\u10D4\u10E5\u10E1\u10E2\u10D4\u10D1\u10D8\u10E1 \u10D3\u10D0\u10DB\u10E3\u10E8\u10D0\u10D5\u10D4\u10D1\u10D0\u10E1\u10D0 \u10D3\u10D0 \u10DB\u10E0\u10D0\u10D5\u10D0\u10DA\u10D4\u10DC\u10DD\u10D5\u10D0\u10DC \u10D9\u10DD\u10DB\u10DE\u10D8\u10E3\u10E2\u10D4\u10E0\u10E3\u10DA \u10E1\u10D8\u10E1\u10E2\u10D4\u10DB\u10D4\u10D1\u10E8\u10D8.\n\nRussian:\n\n From a Unicode conference invitation:\n\n \u0417\u0430\u0440\u0435\u0433\u0438\u0441\u0442\u0440\u0438\u0440\u0443\u0439\u0442\u0435\u0441\u044C \u0441\u0435\u0439\u0447\u0430\u0441 \u043D\u0430 \u0414\u0435\u0441\u044F\u0442\u0443\u044E \u041C\u0435\u0436\u0434\u0443\u043D\u0430\u0440\u043E\u0434\u043D\u0443\u044E \u041A\u043E\u043D\u0444\u0435\u0440\u0435\u043D\u0446\u0438\u044E \u043F\u043E\n Unicode, \u043A\u043E\u0442\u043E\u0440\u0430\u044F \u0441\u043E\u0441\u0442\u043E\u0438\u0442\u0441\u044F 10-12 \u043C\u0430\u0440\u0442\u0430 1997 
\u0433\u043E\u0434\u0430 \u0432 \u041C\u0430\u0439\u043D\u0446\u0435 \u0432 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438.\n \u041A\u043E\u043D\u0444\u0435\u0440\u0435\u043D\u0446\u0438\u044F \u0441\u043E\u0431\u0435\u0440\u0435\u0442 \u0448\u0438\u0440\u043E\u043A\u0438\u0439 \u043A\u0440\u0443\u0433 \u044D\u043A\u0441\u043F\u0435\u0440\u0442\u043E\u0432 \u043F\u043E \u0432\u043E\u043F\u0440\u043E\u0441\u0430\u043C \u0433\u043B\u043E\u0431\u0430\u043B\u044C\u043D\u043E\u0433\u043E\n \u0418\u043D\u0442\u0435\u0440\u043D\u0435\u0442\u0430 \u0438 Unicode, \u043B\u043E\u043A\u0430\u043B\u0438\u0437\u0430\u0446\u0438\u0438 \u0438 \u0438\u043D\u0442\u0435\u0440\u043D\u0430\u0446\u0438\u043E\u043D\u0430\u043B\u0438\u0437\u0430\u0446\u0438\u0438, \u0432\u043E\u043F\u043B\u043E\u0449\u0435\u043D\u0438\u044E \u0438\n \u043F\u0440\u0438\u043C\u0435\u043D\u0435\u043D\u0438\u044E Unicode \u0432 \u0440\u0430\u0437\u043B\u0438\u0447\u043D\u044B\u0445 \u043E\u043F\u0435\u0440\u0430\u0446\u0438\u043E\u043D\u043D\u044B\u0445 \u0441\u0438\u0441\u0442\u0435\u043C\u0430\u0445 \u0438 \u043F\u0440\u043E\u0433\u0440\u0430\u043C\u043C\u043D\u044B\u0445\n \u043F\u0440\u0438\u043B\u043E\u0436\u0435\u043D\u0438\u044F\u0445, \u0448\u0440\u0438\u0444\u0442\u0430\u0445, \u0432\u0435\u0440\u0441\u0442\u043A\u0435 \u0438 \u043C\u043D\u043E\u0433\u043E\u044F\u0437\u044B\u0447\u043D\u044B\u0445 \u043A\u043E\u043C\u043F\u044C\u044E\u0442\u0435\u0440\u043D\u044B\u0445 \u0441\u0438\u0441\u0442\u0435\u043C\u0430\u0445.\n\nThai (UCS Level 2):\n\n Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese\n classic 'San Gua'):\n\n [----------------------------|------------------------]\n \u0E4F \u0E41\u0E1C\u0E48\u0E19\u0E14\u0E34\u0E19\u0E2E\u0E31\u0E48\u0E19\u0E40\u0E2A\u0E37\u0E48\u0E2D\u0E21\u0E42\u0E17\u0E23\u0E21\u0E41\u0E2A\u0E19\u0E2A\u0E31\u0E07\u0E40\u0E27\u0E0A 
\u0E1E\u0E23\u0E30\u0E1B\u0E01\u0E40\u0E01\u0E28\u0E01\u0E2D\u0E07\u0E1A\u0E39\u0E4A\u0E01\u0E39\u0E49\u0E02\u0E36\u0E49\u0E19\u0E43\u0E2B\u0E21\u0E48\n \u0E2A\u0E34\u0E1A\u0E2A\u0E2D\u0E07\u0E01\u0E29\u0E31\u0E15\u0E23\u0E34\u0E22\u0E4C\u0E01\u0E48\u0E2D\u0E19\u0E2B\u0E19\u0E49\u0E32\u0E41\u0E25\u0E16\u0E31\u0E14\u0E44\u0E1B \u0E2A\u0E2D\u0E07\u0E2D\u0E07\u0E04\u0E4C\u0E44\u0E0B\u0E23\u0E49\u0E42\u0E07\u0E48\u0E40\u0E02\u0E25\u0E32\u0E40\u0E1A\u0E32\u0E1B\u0E31\u0E0D\u0E0D\u0E32\n \u0E17\u0E23\u0E07\u0E19\u0E31\u0E1A\u0E16\u0E37\u0E2D\u0E02\u0E31\u0E19\u0E17\u0E35\u0E40\u0E1B\u0E47\u0E19\u0E17\u0E35\u0E48\u0E1E\u0E36\u0E48\u0E07 \u0E1A\u0E49\u0E32\u0E19\u0E40\u0E21\u0E37\u0E2D\u0E07\u0E08\u0E36\u0E07\u0E27\u0E34\u0E1B\u0E23\u0E34\u0E15\u0E40\u0E1B\u0E47\u0E19\u0E19\u0E31\u0E01\u0E2B\u0E19\u0E32\n \u0E42\u0E2E\u0E08\u0E34\u0E4B\u0E19\u0E40\u0E23\u0E35\u0E22\u0E01\u0E17\u0E31\u0E1E\u0E17\u0E31\u0E48\u0E27\u0E2B\u0E31\u0E27\u0E40\u0E21\u0E37\u0E2D\u0E07\u0E21\u0E32 \u0E2B\u0E21\u0E32\u0E22\u0E08\u0E30\u0E06\u0E48\u0E32\u0E21\u0E14\u0E0A\u0E31\u0E48\u0E27\u0E15\u0E31\u0E27\u0E2A\u0E33\u0E04\u0E31\u0E0D\n \u0E40\u0E2B\u0E21\u0E37\u0E2D\u0E19\u0E02\u0E31\u0E1A\u0E44\u0E2A\u0E44\u0E25\u0E48\u0E40\u0E2A\u0E37\u0E2D\u0E08\u0E32\u0E01\u0E40\u0E04\u0E2B\u0E32 \u0E23\u0E31\u0E1A\u0E2B\u0E21\u0E32\u0E1B\u0E48\u0E32\u0E40\u0E02\u0E49\u0E32\u0E21\u0E32\u0E40\u0E25\u0E22\u0E2D\u0E32\u0E2A\u0E31\u0E0D\n \u0E1D\u0E48\u0E32\u0E22\u0E2D\u0E49\u0E2D\u0E07\u0E2D\u0E38\u0E49\u0E19\u0E22\u0E38\u0E41\u0E22\u0E01\u0E43\u0E2B\u0E49\u0E41\u0E15\u0E01\u0E01\u0E31\u0E19 \u0E43\u0E0A\u0E49\u0E2A\u0E32\u0E27\u0E19\u0E31\u0E49\u0E19\u0E40\u0E1B\u0E47\u0E19\u0E0A\u0E19\u0E27\u0E19\u0E0A\u0E37\u0E48\u0E19\u0E0A\u0E27\u0E19\u0E43\u0E08\n \u0E1E\u0E25\u0E31\u0E19\u0E25\u0E34\u0E09\u0E38\u0E22\u0E01\u0E38\u0E22\u0E01\u0E35\u0E01\u0E25\u0E31\u0E1A\u0E01\u0E48\u0E2D\u0E40\u0E2B\u0E15\u0E38 
\u0E0A\u0E48\u0E32\u0E07\u0E2D\u0E32\u0E40\u0E1E\u0E28\u0E08\u0E23\u0E34\u0E07\u0E2B\u0E19\u0E32\u0E1F\u0E49\u0E32\u0E23\u0E49\u0E2D\u0E07\u0E44\u0E2B\u0E49\n \u0E15\u0E49\u0E2D\u0E07\u0E23\u0E1A\u0E23\u0E32\u0E06\u0E48\u0E32\u0E1F\u0E31\u0E19\u0E08\u0E19\u0E1A\u0E23\u0E23\u0E25\u0E31\u0E22 \u0E24\u0E45\u0E2B\u0E32\u0E43\u0E04\u0E23\u0E04\u0E49\u0E33\u0E0A\u0E39\u0E01\u0E39\u0E49\u0E1A\u0E23\u0E23\u0E25\u0E31\u0E07\u0E01\u0E4C \u0E2F\n\n (The above is a two-column text. If combining characters are handled\n correctly, the lines of the second column should be aligned with the\n | character above.)\n\nEthiopian:\n\n Proverbs in the Amharic language:\n\n \u1230\u121B\u12ED \u12A0\u12ED\u1273\u1228\u1235 \u1295\u1309\u1225 \u12A0\u12ED\u12A8\u1230\u1235\u1362\n \u1265\u120B \u12AB\u1208\u129D \u12A5\u1295\u12F0\u12A0\u1263\u1274 \u1260\u1246\u1218\u1320\u129D\u1362\n \u130C\u1325 \u12EB\u1208\u1264\u1271 \u1241\u121D\u1325\u1293 \u1290\u12CD\u1362\n \u12F0\u1200 \u1260\u1215\u120D\u1219 \u1245\u1264 \u1263\u12ED\u1320\u1323 \u1295\u1323\u1275 \u1260\u1308\u12F0\u1208\u12CD\u1362\n \u12E8\u12A0\u134D \u12C8\u1208\u121D\u1273 \u1260\u1245\u1264 \u12A0\u12ED\u1273\u123D\u121D\u1362\n \u12A0\u12ED\u1325 \u1260\u1260\u120B \u12F3\u12CB \u1270\u1218\u1273\u1362\n \u1232\u1270\u1228\u1309\u1219 \u12ED\u12F0\u1228\u130D\u1219\u1362\n \u1240\u1235 \u1260\u1240\u1235\u1365 \u12D5\u1295\u1241\u120B\u120D \u1260\u12A5\u130D\u1229 \u12ED\u1204\u12F3\u120D\u1362\n \u12F5\u122D \u1262\u12EB\u1265\u122D \u12A0\u1295\u1260\u1233 \u12EB\u1235\u122D\u1362\n \u1230\u12CD \u12A5\u1295\u12F0\u1264\u1271 \u12A5\u1295\u1305 \u12A5\u1295\u12F0 \u1309\u1228\u1264\u1271 \u12A0\u12ED\u1270\u12F3\u12F0\u122D\u121D\u1362\n \u12A5\u130D\u12DC\u122D \u12E8\u12A8\u1348\u1270\u12CD\u1295 \u1309\u122E\u122E \u1233\u12ED\u12D8\u130B\u12CD \u12A0\u12ED\u12F5\u122D\u121D\u1362\n \u12E8\u130E\u1228\u1264\u1275 \u120C\u1263\u1365 \u1262\u12EB\u12E9\u1275 \u12ED\u1235\u1245 \u1263\u12EB\u12E9\u1275 
\u12EB\u1320\u120D\u1245\u1362\n \u1225\u122B \u12A8\u1218\u134D\u1273\u1275 \u120D\u1304\u1295 \u120B\u134B\u1273\u1275\u1362\n \u12D3\u1263\u12ED \u121B\u12F0\u122A\u12EB \u12E8\u1208\u12CD\u1365 \u130D\u1295\u12F5 \u12ED\u12DE \u12ED\u12DE\u122B\u120D\u1362\n \u12E8\u12A5\u1235\u120B\u121D \u12A0\u1308\u1229 \u1218\u12AB \u12E8\u12A0\u121E\u122B \u12A0\u1308\u1229 \u12CB\u122D\u12AB\u1362\n \u1270\u1295\u130B\u120E \u1262\u1270\u1349 \u1270\u1218\u120D\u1236 \u1263\u1349\u1362\n \u12C8\u12F3\u1305\u1205 \u121B\u122D \u1262\u1206\u1295 \u1328\u122D\u1235\u1205 \u12A0\u1275\u120B\u1230\u12CD\u1362\n \u12A5\u130D\u122D\u1205\u1295 \u1260\u134D\u122B\u123D\u1205 \u120D\u12AD \u12D8\u122D\u130B\u1362\n\nRunes:\n\n \u16BB\u16D6 \u16B3\u16B9\u16AB\u16A6 \u16A6\u16AB\u16CF \u16BB\u16D6 \u16D2\u16A2\u16DE\u16D6 \u16A9\u16BE \u16A6\u16AB\u16D7 \u16DA\u16AA\u16BE\u16DE\u16D6 \u16BE\u16A9\u16B1\u16A6\u16B9\u16D6\u16AA\u16B1\u16DE\u16A2\u16D7 \u16B9\u16C1\u16A6 \u16A6\u16AA \u16B9\u16D6\u16E5\u16AB\n\n (Old English, which transcribed into Latin reads 'He cwaeth that he\n bude thaem lande northweardum with tha Westsae.' 
and means 'He said\n that he lived in the northern land near the Western Sea.')\n\nBraille:\n\n \u284C\u2801\u2827\u2811 \u283C\u2801\u2812 \u284D\u281C\u2807\u2811\u2839\u2830\u280E \u2863\u2815\u280C\n\n \u284D\u281C\u2807\u2811\u2839 \u283A\u2801\u280E \u2819\u2811\u2801\u2819\u2812 \u281E\u2815 \u2803\u2811\u281B\u2814 \u283A\u280A\u2839\u2832 \u2879\u283B\u2811 \u280A\u280E \u281D\u2815 \u2819\u2833\u2803\u281E\n \u2831\u2801\u281E\u2811\u2827\u283B \u2801\u2803\u2833\u281E \u2839\u2801\u281E\u2832 \u2879\u2811 \u2817\u2811\u281B\u280A\u280C\u283B \u2815\u280B \u2819\u280A\u280E \u2803\u2825\u2817\u280A\u2801\u2807 \u283A\u2801\u280E\n \u280E\u280A\u281B\u281D\u282B \u2803\u2839 \u2839\u2811 \u280A\u2807\u283B\u281B\u2839\u280D\u2801\u281D\u2802 \u2839\u2811 \u280A\u2807\u283B\u2805\u2802 \u2839\u2811 \u2825\u281D\u2819\u283B\u281E\u2801\u2805\u283B\u2802\n \u2801\u281D\u2819 \u2839\u2811 \u2821\u280A\u2811\u280B \u280D\u2833\u2817\u281D\u283B\u2832 \u284E\u280A\u2817\u2815\u2815\u281B\u2811 \u280E\u280A\u281B\u281D\u282B \u280A\u281E\u2832 \u2841\u281D\u2819\n \u284E\u280A\u2817\u2815\u2815\u281B\u2811\u2830\u280E \u281D\u2801\u280D\u2811 \u283A\u2801\u280E \u281B\u2815\u2815\u2819 \u2825\u280F\u2815\u281D \u2830\u2861\u2801\u281D\u281B\u2811\u2802 \u280B\u2815\u2817 \u2801\u281D\u2839\u2839\u2814\u281B \u2819\u2811\n \u2821\u2815\u280E\u2811 \u281E\u2815 \u280F\u2825\u281E \u2819\u280A\u280E \u2819\u2801\u281D\u2819 \u281E\u2815\u2832\n\n \u2855\u2807\u2819 \u284D\u281C\u2807\u2811\u2839 \u283A\u2801\u280E \u2801\u280E \u2819\u2811\u2801\u2819 \u2801\u280E \u2801 \u2819\u2815\u2815\u2817\u2824\u281D\u2801\u280A\u2807\u2832\n\n \u284D\u2814\u2819\u2816 \u284A \u2819\u2815\u281D\u2830\u281E \u280D\u2811\u2801\u281D \u281E\u2815 \u280E\u2801\u2839 \u2839\u2801\u281E \u284A \u2805\u281D\u282A\u2802 \u2815\u280B \u280D\u2839\n \u282A\u281D \u2805\u281D\u282A\u2807\u282B\u281B\u2811\u2802 \u2831\u2801\u281E \u2839\u283B\u2811 \u280A\u280E 
\u280F\u281C\u281E\u280A\u280A\u2825\u2807\u281C\u2807\u2839 \u2819\u2811\u2801\u2819 \u2801\u2803\u2833\u281E\n \u2801 \u2819\u2815\u2815\u2817\u2824\u281D\u2801\u280A\u2807\u2832 \u284A \u280D\u280A\u2823\u281E \u2819\u2801\u2827\u2811 \u2803\u2811\u2832 \u2814\u280A\u2807\u2814\u282B\u2802 \u280D\u2839\u280E\u2811\u2807\u280B\u2802 \u281E\u2815\n \u2817\u2811\u281B\u281C\u2819 \u2801 \u280A\u2815\u280B\u280B\u2814\u2824\u281D\u2801\u280A\u2807 \u2801\u280E \u2839\u2811 \u2819\u2811\u2801\u2819\u2811\u280C \u280F\u280A\u2811\u280A\u2811 \u2815\u280B \u280A\u2817\u2815\u281D\u280D\u2815\u281D\u281B\u283B\u2839\n \u2814 \u2839\u2811 \u281E\u2817\u2801\u2819\u2811\u2832 \u2843\u2825\u281E \u2839\u2811 \u283A\u280A\u280E\u2819\u2815\u280D \u2815\u280B \u2833\u2817 \u2801\u281D\u280A\u2811\u280C\u2815\u2817\u280E\n \u280A\u280E \u2814 \u2839\u2811 \u280E\u280A\u280D\u280A\u2807\u2811\u2806 \u2801\u281D\u2819 \u280D\u2839 \u2825\u281D\u2819\u2801\u2807\u2807\u282A\u282B \u2819\u2801\u281D\u2819\u280E\n \u2829\u2801\u2807\u2807 \u281D\u2815\u281E \u2819\u280A\u280C\u2825\u2817\u2803 \u280A\u281E\u2802 \u2815\u2817 \u2839\u2811 \u284A\u2833\u281D\u281E\u2817\u2839\u2830\u280E \u2819\u2815\u281D\u2811 \u280B\u2815\u2817\u2832 \u2879\u2833\n \u283A\u280A\u2807\u2807 \u2839\u283B\u2811\u280B\u2815\u2817\u2811 \u280F\u283B\u280D\u280A\u281E \u280D\u2811 \u281E\u2815 \u2817\u2811\u280F\u2811\u2801\u281E\u2802 \u2811\u280D\u280F\u2819\u2801\u281E\u280A\u280A\u2801\u2807\u2807\u2839\u2802 \u2839\u2801\u281E\n \u284D\u281C\u2807\u2811\u2839 \u283A\u2801\u280E \u2801\u280E \u2819\u2811\u2801\u2819 \u2801\u280E \u2801 \u2819\u2815\u2815\u2817\u2824\u281D\u2801\u280A\u2807\u2832\n\n (The first couple of paragraphs of \"A Christmas Carol\" by Dickens)\n\nCompact font selection example text:\n\n ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789\n abcdefghijklmnopqrstuvwxyz \u00A3\u00A9\u00B5\u00C0\u00C6\u00D6\u00DE\u00DF\u00E9\u00F6\u00FF\n 
\u2013\u2014\u2018\u201C\u201D\u201E\u2020\u2022\u2026\u2030\u2122\u0153\u0160\u0178\u017E\u20AC \u0391\u0392\u0393\u0394\u03A9\u03B1\u03B2\u03B3\u03B4\u03C9 \u0410\u0411\u0412\u0413\u0414\u0430\u0431\u0432\u0433\u0434\n \u2200\u2202\u2208\u211D\u2227\u222A\u2261\u221E \u2191\u2197\u21A8\u21BB\u21E3 \u2510\u253C\u2554\u2558\u2591\u25BA\u263A\u2640 \uFB01\uFFFD\u2440\u2082\u1F20\u1E02\u04E5\u1E84\u0250\u02D0\u234E\u05D0\u0531\u10D0\n\nGreetings in various languages:\n\n Hello world, \u039A\u03B1\u03BB\u03B7\u03BC\u1F73\u03C1\u03B1 \u03BA\u1F79\u03C3\u03BC\u03B5, \u30B3\u30F3\u30CB\u30C1\u30CF\n\nBox drawing alignment tests: \u2588\n \u2589\n \u2554\u2550\u2550\u2566\u2550\u2550\u2557 \u250C\u2500\u2500\u252C\u2500\u2500\u2510 \u256D\u2500\u2500\u252C\u2500\u2500\u256E \u256D\u2500\u2500\u252C\u2500\u2500\u256E \u250F\u2501\u2501\u2533\u2501\u2501\u2513 \u250E\u2512\u250F\u2511 \u2577 \u257B \u250F\u252F\u2513 \u250C\u2530\u2510 \u258A \u2571\u2572\u2571\u2572\u2573\u2573\u2573\n \u2551\u250C\u2500\u2568\u2500\u2510\u2551 \u2502\u2554\u2550\u2567\u2550\u2557\u2502 \u2502\u2552\u2550\u256A\u2550\u2555\u2502 \u2502\u2553\u2500\u2541\u2500\u2556\u2502 \u2503\u250C\u2500\u2542\u2500\u2510\u2503 \u2517\u2543\u2544\u2519 \u2576\u253C\u2574\u257A\u254B\u2578\u2520\u253C\u2528 \u251D\u254B\u2525 \u258B \u2572\u2571\u2572\u2571\u2573\u2573\u2573\n \u2551\u2502\u2572 \u2571\u2502\u2551 \u2502\u2551 \u2551\u2502 \u2502\u2502 \u2502 \u2502\u2502 \u2502\u2551 \u2503 \u2551\u2502 \u2503\u2502 \u257F \u2502\u2503 \u250D\u2545\u2546\u2513 \u2575 \u2579 \u2517\u2537\u251B \u2514\u2538\u2518 \u258C \u2571\u2572\u2571\u2572\u2573\u2573\u2573\n \u2560\u2561 \u2573 \u255E\u2563 \u251C\u2562 \u255F\u2524 \u251C\u253C\u2500\u253C\u2500\u253C\u2524 \u251C\u256B\u2500\u2542\u2500\u256B\u2524 \u2523\u253F\u257E\u253C\u257C\u253F\u252B \u2515\u251B\u2516\u251A \u250C\u2504\u2504\u2510 \u254E \u250F\u2505\u2505\u2513 \u250B \u258D \u2572\u2571\u2572\u2571\u2573\u2573\u2573\n \u2551\u2502\u2571 
\u2572\u2502\u2551 \u2502\u2551 \u2551\u2502 \u2502\u2502 \u2502 \u2502\u2502 \u2502\u2551 \u2503 \u2551\u2502 \u2503\u2502 \u257D \u2502\u2503 \u2591\u2591\u2592\u2592\u2593\u2593\u2588\u2588 \u250A \u2506 \u254E \u254F \u2507 \u250B \u258E\n \u2551\u2514\u2500\u2565\u2500\u2518\u2551 \u2502\u255A\u2550\u2564\u2550\u255D\u2502 \u2502\u2558\u2550\u256A\u2550\u255B\u2502 \u2502\u2559\u2500\u2540\u2500\u255C\u2502 \u2503\u2514\u2500\u2542\u2500\u2518\u2503 \u2591\u2591\u2592\u2592\u2593\u2593\u2588\u2588 \u250A \u2506 \u254E \u254F \u2507 \u250B \u258F\n \u255A\u2550\u2550\u2569\u2550\u2550\u255D \u2514\u2500\u2500\u2534\u2500\u2500\u2518 \u2570\u2500\u2500\u2534\u2500\u2500\u256F \u2570\u2500\u2500\u2534\u2500\u2500\u256F \u2517\u2501\u2501\u253B\u2501\u2501\u251B \u2597\u2584\u2596\u259B\u2580\u259C \u2514\u254C\u254C\u2518 \u254E \u2517\u254D\u254D\u251B \u250B \u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588\n \u259D\u2580\u2598\u2599\u2584\u259F\n" . +<http://drobilla.net/sw/serd/test/ascii/UTF-8.ttl> <http://www.w3.org/2000/01/rdf-schema#comment> "\n Two byte Unicode escape: \u00E0\n Largest Unicode escape in Turtle: \U0010FFFF\n" . diff --git a/test/extra/ascii/manifest.ttl b/test/extra/ascii/manifest.ttl new file mode 100644 index 00000000..625a8a4c --- /dev/null +++ b/test/extra/ascii/manifest.ttl @@ -0,0 +1,65 @@ +@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdft: <http://www.w3.org/ns/rdftest#> . + +<> + a mf:Manifest ; + rdfs:comment "Serd ASCII writing test cases" ; + mf:entries ( + <#test-backspace> + <#test-delete> + <#test-escapes> + <#test-form-feed> + <#test-uri-escape> + <#test-utf8-literal> + <#test-utf8-uri> + <#test-whitespace> + ) . + +<#test-backspace> + a rdft:TestTurtleEval ; + mf:action <test-backspace.ttl> ; + mf:name "test-backspace" ; + mf:result <test-backspace.nt> . 
+ +<#test-delete> + a rdft:TestTurtleEval ; + mf:action <test-delete.ttl> ; + mf:name "test-delete" ; + mf:result <test-delete.nt> . + +<#test-escapes> + a rdft:TestTurtleEval ; + mf:action <test-escapes.ttl> ; + mf:name "test-escapes" ; + mf:result <test-escapes.nt> . + +<#test-form-feed> + a rdft:TestTurtleEval ; + mf:action <test-form-feed.ttl> ; + mf:name "test-form-feed" ; + mf:result <test-form-feed.nt> . + +<#test-uri-escape> + a rdft:TestTurtleEval ; + mf:action <test-uri-escape.ttl> ; + mf:name "test-uri-escape" ; + mf:result <test-uri-escape.nt> . + +<#test-utf8-literal> + a rdft:TestTurtleEval ; + mf:action <test-utf8-literal.ttl> ; + mf:name "test-utf8-literal" ; + mf:result <test-utf8-literal.nt> . + +<#test-utf8-uri> + a rdft:TestTurtleEval ; + mf:action <test-utf8-uri.ttl> ; + mf:name "test-utf8-uri" ; + mf:result <test-utf8-uri.nt> . + +<#test-whitespace> + a rdft:TestTurtleEval ; + mf:action <test-whitespace.ttl> ; + mf:name "test-whitespace" ; + mf:result <test-whitespace.nt> . diff --git a/test/extra/ascii/test-backspace.nt b/test/extra/ascii/test-backspace.nt new file mode 100644 index 00000000..000d2f8a --- /dev/null +++ b/test/extra/ascii/test-backspace.nt @@ -0,0 +1,3 @@ +<http://example.org/s> <http://example.org/p> "\b" . +<http://example.org/s> <http://example.org/p> "\b" . +<http://example.org/s> <http://example.org/p> "\n\b\n" . diff --git a/test/extra/ascii/test-backspace.ttl b/test/extra/ascii/test-backspace.ttl new file mode 100644 index 00000000..a290cb01 --- /dev/null +++ b/test/extra/ascii/test-backspace.ttl @@ -0,0 +1,5 @@ +<http://example.org/s> <http://example.org/p> "\u0008" . +<http://example.org/s> <http://example.org/p> "" . +<http://example.org/s> <http://example.org/p> """ + +""" . 
diff --git a/test/extra/ascii/test-delete.nt b/test/extra/ascii/test-delete.nt new file mode 100644 index 00000000..1f3abc71 --- /dev/null +++ b/test/extra/ascii/test-delete.nt @@ -0,0 +1,2 @@ +<http://example.org/s> <http://example.org/p> "\u007F" . +<http://example.org/s> <http://example.org/p> "\u007F" . diff --git a/test/extra/ascii/test-delete.ttl b/test/extra/ascii/test-delete.ttl new file mode 100644 index 00000000..f9a86040 --- /dev/null +++ b/test/extra/ascii/test-delete.ttl @@ -0,0 +1,2 @@ +<http://example.org/s> <http://example.org/p> "\u007F" . +<http://example.org/s> <http://example.org/p> "" . diff --git a/test/extra/ascii/test-escapes.nt b/test/extra/ascii/test-escapes.nt new file mode 100644 index 00000000..81e4e110 --- /dev/null +++ b/test/extra/ascii/test-escapes.nt @@ -0,0 +1,2 @@ +<http://example.org/s> <http://example.org/p> "\\\r\n " . +<http://example.org/s> <http://example.org/p> <http://example.org/%5C> . diff --git a/test/extra/ascii/test-escapes.ttl b/test/extra/ascii/test-escapes.ttl new file mode 100644 index 00000000..cb459c3c --- /dev/null +++ b/test/extra/ascii/test-escapes.ttl @@ -0,0 +1,2 @@ +<http://example.org/s> <http://example.org/p> "\\\r\n\t" . +<http://example.org/s> <http://example.org/p> <http://example.org/\u005C> . diff --git a/test/extra/ascii/test-form-feed.nt b/test/extra/ascii/test-form-feed.nt new file mode 100644 index 00000000..7e62b690 --- /dev/null +++ b/test/extra/ascii/test-form-feed.nt @@ -0,0 +1,2 @@ +<http://example.org/s> <http://example.org/p> "\f" . +<http://example.org/s> <http://example.org/p> "\f" . diff --git a/test/extra/ascii/test-form-feed.ttl b/test/extra/ascii/test-form-feed.ttl new file mode 100644 index 00000000..7355d4cc --- /dev/null +++ b/test/extra/ascii/test-form-feed.ttl @@ -0,0 +1,2 @@ +<http://example.org/s> <http://example.org/p> "\u000C" . +<http://example.org/s> <http://example.org/p> "" . 
diff --git a/test/extra/ascii/test-uri-escape.nt b/test/extra/ascii/test-uri-escape.nt new file mode 100644 index 00000000..e5c04478 --- /dev/null +++ b/test/extra/ascii/test-uri-escape.nt @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> <scheme:%01%02%03%04%05%06%07%08%0B%0C%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . diff --git a/test/extra/ascii/test-uri-escape.ttl b/test/extra/ascii/test-uri-escape.ttl new file mode 100644 index 00000000..d9e8d3cf --- /dev/null +++ b/test/extra/ascii/test-uri-escape.ttl @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> <scheme:\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F%20!\u0022#$%&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . diff --git a/test/extra/ascii/test-utf8-literal.nt b/test/extra/ascii/test-utf8-literal.nt new file mode 100644 index 00000000..b8633737 --- /dev/null +++ b/test/extra/ascii/test-utf8-literal.nt @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> "\u2200x\u2208\u211D: \u0E41\u0E1C\u0E48\u0E19\u0E14\u0E34\u0E19\u0E2E\u0E31\u0E48\u0E19\u0E40\u0E2A\u0E37\u0E48\u0E2D\u0E21\u0E42\u0E17\u0E23\u0E21\u0E41\u0E2A\u0E19\u0E2A\u0E31\u0E07\u0E40\u0E27\u0E0A" . diff --git a/test/extra/ascii/test-utf8-literal.ttl b/test/extra/ascii/test-utf8-literal.ttl new file mode 100644 index 00000000..38160383 --- /dev/null +++ b/test/extra/ascii/test-utf8-literal.ttl @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> "∀x∈ℝ: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช" . 
diff --git a/test/extra/ascii/test-utf8-uri.nt b/test/extra/ascii/test-utf8-uri.nt new file mode 100644 index 00000000..e2d573c3 --- /dev/null +++ b/test/extra/ascii/test-utf8-uri.nt @@ -0,0 +1 @@ +<http://example.org/math/%E2%88%80x%E2%88%88%E2%84%9D> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Thing> . diff --git a/test/extra/ascii/test-utf8-uri.ttl b/test/extra/ascii/test-utf8-uri.ttl new file mode 100644 index 00000000..feb556ad --- /dev/null +++ b/test/extra/ascii/test-utf8-uri.ttl @@ -0,0 +1 @@ +<http://example.org/math/∀x∈ℝ> a <http://example.org/Thing> . diff --git a/test/extra/ascii/test-whitespace.nt b/test/extra/ascii/test-whitespace.nt new file mode 100644 index 00000000..3eee355e --- /dev/null +++ b/test/extra/ascii/test-whitespace.nt @@ -0,0 +1,2 @@ +<http://example.org/foo#a> <http://example.org/foo#b> "\nthis \ris a \U00015678long \nliteral\uABCD\n" . +<http://example.org/foo#d> <http://example.org/foo#e> " This \uABCDis\r \U00015678another\n\none\n" . diff --git a/test/extra/ascii/test-whitespace.ttl b/test/extra/ascii/test-whitespace.ttl new file mode 100644 index 00000000..92c8892a --- /dev/null +++ b/test/extra/ascii/test-whitespace.ttl @@ -0,0 +1,9 @@ +@prefix : <http://example.org/foo#> . + +:a :b """\nthis \ris a \U00015678long +literal\uABCD +""" . + +:d :e """\tThis \uABCDis\r \U00015678another\n +one +""" . diff --git a/test/extra/decode/extended-iri-chars-escaped.nt b/test/extra/decode/extended-iri-chars-escaped.nt new file mode 100644 index 00000000..4b6336bd --- /dev/null +++ b/test/extra/decode/extended-iri-chars-escaped.nt @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> <http://example.org/verschl%C3%BCsselt%20%E2%9C%89/%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . 
diff --git a/test/extra/decode/extended-iri-chars.nt b/test/extra/decode/extended-iri-chars.nt new file mode 100644 index 00000000..f153d6b6 --- /dev/null +++ b/test/extra/decode/extended-iri-chars.nt @@ -0,0 +1 @@ +<http://example.org/s> <http://example.org/p> <http://example.org/verschlüsselt%20✉/%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:;%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . diff --git a/test/extra/decode/manifest.ttl b/test/extra/decode/manifest.ttl new file mode 100644 index 00000000..1bc8b9a8 --- /dev/null +++ b/test/extra/decode/manifest.ttl @@ -0,0 +1,17 @@ +@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdft: <http://www.w3.org/ns/rdftest#> . + +<> + a mf:Manifest ; + rdfs:comment "Serd URI decoding test cases" ; + mf:entries ( + <#extended-iri-chars> + <#wide-characters> + ) . + +<#extended-iri-chars> + a rdft:TestTurtleEval ; + mf:action <extended-iri-chars-escaped.nt> ; + mf:name "extended-iri-chars" ; + mf:result <extended-iri-chars.nt> . diff --git a/test/extra/encode/manifest.ttl b/test/extra/encode/manifest.ttl new file mode 100644 index 00000000..0ad05437 --- /dev/null +++ b/test/extra/encode/manifest.ttl @@ -0,0 +1,24 @@ +@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdft: <http://www.w3.org/ns/rdftest#> . + +<> + a mf:Manifest ; + rdfs:comment "Serd URI encoding test cases" ; + mf:entries ( + <#uri-ascii> + <#uri-utf8> + ) . + +<#uri-ascii> + a rdft:TestTurtleEval ; + mf:action <uri-ascii.ttl> ; + mf:name "uri-ascii" ; + mf:result <uri-ascii.nt> . + +<#uri-utf8> + a rdft:TestTurtleEval ; + mf:action <uri-utf8.ttl> ; + mf:name "uri-utf8" ; + mf:result <uri-utf8.nt> . 
diff --git a/test/extra/encode/uri-ascii.nt b/test/extra/encode/uri-ascii.nt new file mode 100644 index 00000000..0abb79bd --- /dev/null +++ b/test/extra/encode/uri-ascii.nt @@ -0,0 +1 @@ +<http://example.org/eg#s> <http://example.org/eg#p> <http://example.org/scheme:%01%02%03%04%05%06%07%08%0B%0C%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . diff --git a/test/extra/encode/uri-ascii.ttl b/test/extra/encode/uri-ascii.ttl new file mode 100644 index 00000000..df0a1182 --- /dev/null +++ b/test/extra/encode/uri-ascii.ttl @@ -0,0 +1,2 @@ +<http://example.org/eg#s> + <http://example.org/eg#p> <http://example.org/scheme:\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F%20!\u0022#$%25&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . diff --git a/test/extra/encode/uri-utf8.nt b/test/extra/encode/uri-utf8.nt new file mode 100644 index 00000000..29f05fec --- /dev/null +++ b/test/extra/encode/uri-utf8.nt @@ -0,0 +1 @@ +<http://example.org/eg#s> <http://example.org/eg#p> <https://en.wiktionary.org/wiki/Ῥόδος> . diff --git a/test/extra/encode/uri-utf8.ttl b/test/extra/encode/uri-utf8.ttl new file mode 100644 index 00000000..e5454177 --- /dev/null +++ b/test/extra/encode/uri-utf8.ttl @@ -0,0 +1,2 @@ +<http://example.org/eg#s> + <http://example.org/eg#p> <https://en.wiktionary.org/wiki/\u1FEC\u03CC\u03B4\u03BF\u03C2> . diff --git a/test/extra/good/manifest.ttl b/test/extra/good/manifest.ttl index 11a7ec5e..07147578 100644 --- a/test/extra/good/manifest.ttl +++ b/test/extra/good/manifest.ttl @@ -43,6 +43,7 @@ <#test-several-eaten-dots> <#test-string-escapes> <#test-uri> + <#test-uri-uchar> ) . 
<#test-a-without-whitespace> @@ -266,3 +267,9 @@ mf:action <test-uri.ttl> ; mf:name "test-uri" ; mf:result <test-uri.nt> . + +<#test-uri-uchar> + a rdft:TestTurtleEval ; + mf:action <test-uri-uchar.ttl> ; + mf:name "test-uri-uchar" ; + mf:result <test-uri-uchar.nt> . diff --git a/test/extra/good/test-uri-uchar.nt b/test/extra/good/test-uri-uchar.nt new file mode 100644 index 00000000..e09cc214 --- /dev/null +++ b/test/extra/good/test-uri-uchar.nt @@ -0,0 +1 @@ +<http://example.org/eg#s> <http://example.org/eg#p> <scheme:\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F%20!\u0022#$%25&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . diff --git a/test/extra/good/test-uri-uchar.ttl b/test/extra/good/test-uri-uchar.ttl new file mode 100644 index 00000000..99b1d4e0 --- /dev/null +++ b/test/extra/good/test-uri-uchar.ttl @@ -0,0 +1,2 @@ +<http://example.org/eg#s> + <http://example.org/eg#p> <scheme:\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F%20!\u0022#$%25&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . diff --git a/test/extra/perfect/test-backspace-escape.nt b/test/extra/perfect/test-backspace-escape.nt index f0b894a2..68883511 100644 --- a/test/extra/perfect/test-backspace-escape.nt +++ b/test/extra/perfect/test-backspace-escape.nt @@ -1,2 +1,2 @@ -<http://example.org/eg#s> <http://example.org/eg#p> "\u0008 first" . -<http://example.org/eg#s> <http://example.org/eg#p> "last \u0008" . +<http://example.org/eg#s> <http://example.org/eg#p> "\b first" . +<http://example.org/eg#s> <http://example.org/eg#p> "last \b" . 
diff --git a/test/extra/perfect/test-form-feed-escape.nt b/test/extra/perfect/test-form-feed-escape.nt index 6606fb07..0cecc352 100644 --- a/test/extra/perfect/test-form-feed-escape.nt +++ b/test/extra/perfect/test-form-feed-escape.nt @@ -1,2 +1,2 @@ -<http://example.org/eg#s> <http://example.org/eg#p> "\u000C first" . -<http://example.org/eg#s> <http://example.org/eg#p> "last \u000C" . +<http://example.org/eg#s> <http://example.org/eg#p> "\f first" . +<http://example.org/eg#s> <http://example.org/eg#p> "last \f" . diff --git a/test/extra/perfect/test-uri-escape.nt b/test/extra/perfect/test-uri-escape.nt index 320e7c33..d5daddd4 100644 --- a/test/extra/perfect/test-uri-escape.nt +++ b/test/extra/perfect/test-uri-escape.nt @@ -1 +1 @@ -<http://example.org/node> <http://example.org/prop> <scheme:\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F%20!\u0022#$%&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . +<http://example.org/eg#s> <http://example.org/eg#p> <scheme:%01%02%03%04%05%06%07%08%0B%0C%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . diff --git a/test/extra/perfect/test-uri-escape.ttl b/test/extra/perfect/test-uri-escape.ttl index 77d523fd..73cb3e77 100644 --- a/test/extra/perfect/test-uri-escape.ttl +++ b/test/extra/perfect/test-uri-escape.ttl @@ -1,2 +1,2 @@ -<http://example.org/node> - <http://example.org/prop> <scheme:\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u000B\u000C\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F%20!\u0022#$%&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . 
+<http://example.org/eg#s> + <http://example.org/eg#p> <scheme:%01%02%03%04%05%06%07%08%0B%0C%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20!%22#$%25&'()*+,-./0123456789:/%3C=%3E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . diff --git a/test/extra/pretty/short-string-escapes.ttl b/test/extra/pretty/short-string-escapes.ttl index 0665e814..fad74284 100644 --- a/test/extra/pretty/short-string-escapes.ttl +++ b/test/extra/pretty/short-string-escapes.ttl @@ -1,4 +1,4 @@ @prefix eg: <http://example.org/> . eg:s - eg:p "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b\t\u000B\f\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\\\u007F" . + eg:p "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\b \u000B\f\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\\\u007F" . diff --git a/test/extra/pretty/uri-escapes.ttl b/test/extra/pretty/uri-escapes.ttl index 09ced38a..9d12a798 100644 --- a/test/extra/pretty/uri-escapes.ttl +++ b/test/extra/pretty/uri-escapes.ttl @@ -1,2 +1,2 @@ <http://example.org/s> - <http://example.org/p> <http://example.org/\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F!\u0022#$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\u005C]\u005E_\u0060abcdefghijklmnopqrstuvwxyz\u007B\u007C\u007D~\u007F> . + <http://example.org/p> <http://example.org/%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F!%22#$%25&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[%5C]%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F> . 
diff --git a/test/meson.build b/test/meson.build index e86f34e9..0f8d4685 100644 --- a/test/meson.build +++ b/test/meson.build @@ -579,20 +579,26 @@ test_suites = { ns_w3 + 'TriGTests/', '--', ['-I', 'TriG'], - ['-O', 'ascii'], + ['-O', 'escapes'], ], 'turtle': [ files('w3c/turtle/manifest.ttl'), ns_w3 + 'TurtleTests/', '--', ['-I', 'Turtle'], - ['-O', 'ascii'], + ['-O', 'escapes'], ], 'abbreviate': [ files('extra/abbreviate/manifest.ttl'), ns_serdtest + 'abbreviate/', ], + 'ascii': [ + files('extra/ascii/manifest.ttl'), + ns_serdtest + 'ascii/', + '--', + ['-O', 'ascii'], + ], 'bad': [ files('extra/bad/manifest.ttl'), ns_serdtest + 'bad/', @@ -611,16 +617,33 @@ test_suites = { files('extra/good/manifest.ttl'), ns_serdtest + 'good/', '--', + ['-O', 'escapes'], ['-b', '1'], - ['-O', 'ascii'], ], 'canon': [ files('extra/canon/manifest.ttl'), ns_serdtest + 'canon/', '--', '-C', + ['-O', 'escapes'], + ], + 'decode_forward': [ + files('extra/decode/manifest.ttl'), + ns_serdtest + 'decode/', + '--', + ['-I', 'decoded'], + ], + 'decode_reverse': [ + '--reverse', + files('extra/decode/manifest.ttl'), + ns_serdtest + 'decode/', + '--', ['-O', 'ascii'], ], + 'encode': [ + files('extra/encode/manifest.ttl'), + ns_serdtest + 'encode/', + ], 'fast': [ files('extra/perfect/manifest.ttl'), ns_serdtest + 'perfect/', @@ -639,7 +662,7 @@ test_suites = { files('extra/good/manifest.ttl'), ns_serdtest + 'good/', '--', - ['-O', 'ascii'], + ['-O', 'escapes'], ], 'lax_lax': [ '--lax', @@ -647,7 +670,7 @@ test_suites = { ns_serdtest + 'lax/', '--', ['-I', 'lax'], - ['-O', 'ascii'], + ['-O', 'escapes'], ], 'lax_strict': [ files('extra/lax/manifest.ttl'), diff --git a/test/run_suite.py b/test/run_suite.py index aa95b90f..ba83fb0d 100755 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -41,6 +41,7 @@ def run_eval_test(command, in_path, good_path, out_path, getlines): """Run a positive eval test and return whether the output matches.""" command = command + ["-o", out_path, in_path] + 
print(shlex.join(command)) subprocess.check_call(command, encoding="utf-8") with open(good_path, "r", encoding="utf-8") as good: diff --git a/test/test_reader_writer.c b/test/test_reader_writer.c index 0237cb6c..76ba10e8 100644 --- a/test/test_reader_writer.c +++ b/test/test_reader_writer.c @@ -202,10 +202,14 @@ test_writer(const char* const path) // Write statements with bad UTF-8 (should be replaced) const SerdNode* bad_lit = serd_nodes_get(nodes, serd_a_string(bad_lit_str)); + const SerdNode* bad_long_lit = serd_nodes_get( + nodes, + serd_a_literal(zix_string(bad_lit_str), SERD_IS_LONG, zix_empty_string())); const SerdNode* bad_uri = serd_nodes_get(nodes, serd_a_uri_string(bad_uri_str)); assert(!serd_sink_write(iface, 0, s, p, bad_lit, 0)); + assert(!serd_sink_write(iface, 0, s, p, bad_long_lit, 0)); assert(!serd_sink_write(iface, 0, s, p, bad_uri, 0)); // Write 1 valid statement @@ -267,7 +271,7 @@ test_reader(const char* path) SerdInputStream in = serd_open_input_file(path); assert(!serd_reader_start(reader, &in, NULL, 4096)); assert(!serd_reader_read_document(reader)); - assert(rt.n_statement == 6); + assert(rt.n_statement == 7); assert(!serd_reader_finish(reader)); serd_close_input(&in); diff --git a/tools/console.c b/tools/console.c index 057656d5..661e7ab4 100644 --- a/tools/console.c +++ b/tools/console.c @@ -241,6 +241,7 @@ serd_set_input_option(const ZixStringView name, {"global", SERD_READ_GLOBAL}, {"generated", SERD_READ_GENERATED}, {"ordered", SERD_READ_ORDERED}, + {"decoded", SERD_READ_DECODED}, {NULL, SERD_READ_LAX}, }; @@ -293,6 +294,7 @@ serd_set_output_option(const ZixStringView name, static const OutputOption output_options[] = { {"ascii", SERD_WRITE_ASCII}, {"contextual", SERD_WRITE_CONTEXTUAL}, + {"escapes", SERD_WRITE_ESCAPES}, {"expanded", SERD_WRITE_EXPANDED}, {"lax", SERD_WRITE_LAX}, {"longhand", SERD_WRITE_LONGHAND}, diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c index 06c8e55e..4fed5a1f 100644 --- a/tools/serd-pipe.c +++ 
b/tools/serd-pipe.c @@ -100,7 +100,7 @@ print_usage(const char* const name, const bool error) " -B BASE_URI Base URI or path for resolving relative references.\n" " -C Convert literals to canonical form.\n" " -I SYNTAX Input syntax turtle/ntriples/trig/nquads, or option\n" - " lax/variables/relative/global/generated.\n" + " lax/decoded/variables/relative/global/generated.\n" " -O SYNTAX Output syntax empty/turtle/ntriples/nquads, or option\n" " ascii/contextual/expanded/verbatim/terse/lax.\n" " -R ROOT_URI Keep relative URIs within ROOT_URI.\n" |