about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- doc/man/serd-pipe.1 111
-rwxr-xr-x scripts/serd_bench.py 3
-rw-r--r-- test/meson.build 53
-rwxr-xr-x test/run_suite.py 2
-rwxr-xr-x test/test_base.py 2
-rwxr-xr-x test/test_empty.py 2
-rwxr-xr-x test/test_stdin.py 2
-rw-r--r-- tools/console.c 70
-rw-r--r-- tools/console.h 13
-rw-r--r-- tools/serd-pipe.c 70
10 files changed, 215 insertions, 113 deletions
diff --git a/doc/man/serd-pipe.1 b/doc/man/serd-pipe.1
index d731f0b4..0ce40dbd 100644
--- a/doc/man/serd-pipe.1
+++ b/doc/man/serd-pipe.1
@@ -8,13 +8,13 @@
.Nd read and write RDF data
.Sh SYNOPSIS
.Nm serd-pipe
-.Op Fl Cafhlqtvx
+.Op Fl Cfhqv
.Op Fl B Ar base
+.Op Fl I Ar syntax
+.Op Fl O Ar syntax
.Op Fl b Ar bytes
.Op Fl c Ar prefix
-.Op Fl i Ar syntax
.Op Fl k Ar bytes
-.Op Fl o Ar syntax
.Op Fl p Ar prefix
.Op Fl r Ar root
.Op Fl s Ar string
@@ -80,9 +80,67 @@ All numeric datatypes are supported, as well as
.Vt hexBinary ,
and
.Vt base64Binary .
-.It Fl a
-Write ASCII output.
-If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8.
+.It Fl I Ar syntax
+Set an input syntax or option.
+May be given multiple times.
+The case-insensitive
+.Ar syntax
+can be
+.Cm NQuads ,
+.Cm NTriples ,
+.Cm TriG ,
+.Cm Turtle ,
+or an option:
+.Bl -tag -width 3n
+.It Cm lax
+Tolerate invalid input where possible.
+Warnings will be printed for syntax errors,
+but parsing will attempt to continue.
+Note that data may be lost when using this option!
+.It Cm variables
+Support parsing variable nodes.
+Variables can be written in SPARQL style, for example
+.Li ?var
+or
+.Li $var .
+.It Cm verbatim
+Normally, the reader expands all relative URIs,
+and may adjust blank node labels to avoid clashing with generated ones.
+This flag disables all of this processing,
+so that URI references and blank nodes are passed to the sink exactly as they are in the input.
+Note that this does not apply to CURIEs, since serd deliberately does not
+have a way to represent CURIE nodes.
+A bad namespace prefix is considered a syntax error.
+.El
+.It Fl O Ar syntax
+Set an output syntax or option.
+May be given multiple times.
+The case-insensitive
+.Ar syntax
+can be
+.Cm empty ,
+.Cm NQuads ,
+.Cm NTriples ,
+.Cm TriG ,
+.Cm Turtle ,
+or an option:
+.Bl -tag -width 3n
+.It Cm ascii
+Escape all non-ASCII characters.
+.It Cm expanded
+Write expanded URIs instead of prefixed names.
+.It Cm lax
+Tolerate corrupt UTF-8 and write replacements.
+.It Cm terse
+Write terser output with fewer, longer lines.
+.It Cm verbatim
+Write URI references exactly as in the input.
+.El
+.Pp
+The
+.Cm empty
+syntax suppresses the output,
+so that only warnings and errors will be printed.
.It Fl b Ar bytes
I/O block size.
This is the number of bytes in a file that will be read or written at once.
@@ -111,39 +169,12 @@ preserve full URIs (without qualifying or making relative),
and pass prefixed names through as-is.
.It Fl h
Print the command line options.
-.It Fl i Ar syntax
-Read input as
-.Ar syntax .
-Case is ignored, valid values are:
-.Dq NQuads ,
-.Dq NTriples ,
-.Dq TriG ,
-and
-.Dq Turtle .
.It Fl k Ar bytes
Parser stack size.
Parsing is performed using a pre-allocated stack for performance and security reasons.
By default, the stack is 1 MiB, which should be sufficient for most data.
This can be increased to support unusually structured data and huge literals,
or decreased to reduce overall memory requirements and reduce startup time.
-.It Fl l
-Lax (non-strict) parsing.
-If this is enabled, recoverable syntax errors will print a warning, but parsing will proceed starting at the next statement if possible.
-Note that data may be lost when using this option.
-.It Fl o Ar syntax
-Write output as
-.Ar syntax .
-Case is ignored, valid values are:
-.Dq empty ,
-.Dq NQuads ,
-.Dq NTriples ,
-.Dq TriG ,
-and
-.Dq Turtle .
-The
-.Cm empty
-syntax suppresses the output,
-so that only warnings and errors will be printed.
.It Fl p Ar prefix
Add
.Ar prefix
@@ -191,20 +222,12 @@ This is useful for keeping relative references within some directory.
Parse
.Ar string
as input.
-.It Fl t
-Write terser output without newlines.
.It Fl v
Display version information and exit.
.It Fl w Ar filename
Write output to the given
.Ar filename
instead of stdout.
-.It Fl x
-Support parsing variable nodes.
-Variables can be written in SPARQL style, for example
-.Dq ?var
-or
-.Dq $var .
.El
.Sh ENVIRONMENT
Errors and warnings are printed in color by default if the output is a terminal.
@@ -235,6 +258,12 @@ exits with a status of 0, or non-zero if an error occurred.
.Nm Fl o
.Ar output.ttl
.Pa input.nt
+.It Expand all prefixed names into full URIs:
+.Nm Fl O
+.Ar expanded
+.Fl o
+.Ar expanded.ttl
+.Pa input.ttl
.It Merge two files:
.Nm Fl o
.Pa merged.ttl
diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py
index d764cbda..db8c1c5b 100755
--- a/scripts/serd_bench.py
+++ b/scripts/serd_bench.py
@@ -271,7 +271,8 @@ example:
args = ap.parse_args(sys.argv[1:])
- progs = ["serd-pipe -f -i turtle -o turtle"] + args.run
+ serd_opts = "-I turtle -I verbatim -O turtle -O verbatim -O expanded"
+ progs = ["tools/serd-pipe " + serd_opts] + args.run
min_n = int(args.max / args.steps)
max_n = args.max
step = min_n
diff --git a/test/meson.build b/test/meson.build
index 24180efa..84ae7525 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -108,7 +108,7 @@ if get_option('lint')
test(
ttl_file_path.underscorify(),
check_formatting_py,
- args: [files(ttl_file_path), serd_pipe, '-o', 'turtle'],
+ args: [files(ttl_file_path), serd_pipe, '-O', 'turtle'],
suite: 'data',
)
endforeach
@@ -180,22 +180,22 @@ simple_command_tests = {
'bad': [
['-B', 'nonuriorpath'],
['-B'],
+ ['-I', 'turtle'],
+ ['-I', 'unknown'],
+ ['-I'],
+ ['-O', 'unknown'],
+ ['-O'],
['-b', '-1'],
['-b', '1024junk'],
['-b', '9223372036854775807'],
['-b'],
['-c'],
- ['-fi'],
- ['-i', 'turtle'],
- ['-i', 'unknown'],
- ['-i'],
['-k', '-1'],
['-k', '1024junk'],
['-k', '9223372036854775807'],
['-k'],
- ['-o', 'unknown'],
- ['-o'],
['-p'],
+ ['-qi'],
['-r'],
['-s', '<foo> a <Bar> .'],
['-s'],
@@ -395,30 +395,30 @@ test_suites = {
files('w3c/nquads/manifest.ttl'),
ns_w3 + 'NQuadsTests/',
'--',
- '-a',
- ['-i', 'NQuads'],
+ ['-I', 'NQuads'],
+ ['-O', 'ascii'],
],
'ntriples': [
files('w3c/ntriples/manifest.ttl'),
ns_w3 + 'NTriplesTests/',
'--',
- '-a',
- ['-i', 'NTriples'],
+ ['-I', 'NTriples'],
+ ['-O', 'ascii'],
['-k', '1024'],
],
'trig': [
files('w3c/trig/manifest.ttl'),
ns_w3 + 'TriGTests/',
'--',
- '-a',
- ['-i', 'TriG'],
+ ['-I', 'TriG'],
+ ['-O', 'ascii'],
],
'turtle': [
files('w3c/turtle/manifest.ttl'),
ns_w3 + 'TurtleTests/',
'--',
- '-a',
- ['-i', 'Turtle'],
+ ['-I', 'Turtle'],
+ ['-O', 'ascii'],
],
'abbreviate': [
@@ -433,7 +433,7 @@ test_suites = {
files('extra/bad/manifest.ttl'),
ns_serdtest + 'bad/',
'--',
- ['-o', 'turtle'],
+ ['-O', 'turtle'],
],
'big': [
files('extra/big/manifest.ttl'),
@@ -443,41 +443,42 @@ test_suites = {
files('extra/good/manifest.ttl'),
ns_serdtest + 'good/',
'--',
- '-a',
['-b', '1'],
+ ['-O', 'ascii'],
],
'canon': [
files('extra/canon/manifest.ttl'),
ns_serdtest + 'canon/',
'--',
'-C',
- '-a',
+ ['-O', 'ascii'],
],
'fast': [
files('extra/perfect/manifest.ttl'),
ns_serdtest + 'perfect/',
'--',
- '-f',
+ ['-I', 'verbatim'],
+ ['-O', 'verbatim'],
],
'full': [
files('extra/full/manifest.ttl'),
ns_serdtest + 'full/',
'--',
- '-f',
+ ['-O', 'expanded'],
],
'good': [
files('extra/good/manifest.ttl'),
ns_serdtest + 'good/',
'--',
- '-a',
+ ['-O', 'ascii'],
],
'lax_lax': [
'--lax',
files('extra/lax/manifest.ttl'),
ns_serdtest + 'lax/',
'--',
- '-a',
- '-l',
+ ['-I', 'lax'],
+ ['-O', 'ascii'],
],
'lax_strict': [
files('extra/lax/manifest.ttl'),
@@ -487,7 +488,7 @@ test_suites = {
files('extra/pattern/manifest.ttl'),
ns_serdtest + 'pattern/',
'--',
- '-x',
+ ['-I', 'variables'],
],
'perfect_forward': [
files('extra/perfect/manifest.ttl'),
@@ -519,7 +520,7 @@ test_suites = {
files('extra/qualify/manifest.ttl'),
ns_serdtest + 'qualify/',
'--',
- ['-i', 'turtle'], # Just for coverage
+ ['-I', 'turtle'], # Just for coverage
],
'root': [
files('extra/root/manifest.ttl'),
@@ -531,7 +532,7 @@ test_suites = {
files('extra/terse/manifest.ttl'),
ns_serdtest + 'terse/',
'--',
- '-t',
+ ['-O', 'terse'],
],
}
diff --git a/test/run_suite.py b/test/run_suite.py
index cee9d88e..16e527af 100755
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -41,7 +41,7 @@ def run_eval_test(command, in_path, good_path, out_path):
"""Run a positive eval test and return whether the output matches."""
syntax = util.syntax_from_path(out_path)
- command = command + ["-o", syntax, "-w", out_path, in_path]
+ command = command + ["-O", syntax, "-w", out_path, in_path]
subprocess.check_call(command, encoding="utf-8")
with open(good_path, "r", encoding="utf-8") as good:
diff --git a/test/test_base.py b/test/test_base.py
index b63bb135..cb696d2f 100755
--- a/test/test_base.py
+++ b/test/test_base.py
@@ -10,7 +10,7 @@
import serd_test_util as util
args = util.wrapper_args(__doc__)
-command = [args.tool, "-B", "http://example.org", "-i", "turtle", "-"]
+command = [args.tool, "-B", "http://example.org", "-I", "turtle", "-"]
IN_DOC = "<s> <p> <o> ."
OUT_DOC = "<{0}s> <{0}p> <{0}o> .".format("http://example.org/")
diff --git a/test/test_empty.py b/test/test_empty.py
index 0ee641d5..85c603fe 100755
--- a/test/test_empty.py
+++ b/test/test_empty.py
@@ -14,7 +14,7 @@ import tempfile
import serd_test_util as util
args = util.wrapper_args(__doc__, True)
-command = shlex.split(args.wrapper) + [args.tool, "-o", "empty", args.input]
+command = shlex.split(args.wrapper) + [args.tool, "-O", "empty", args.input]
with tempfile.TemporaryFile() as out:
proc = subprocess.run(command, check=False, stdout=out)
diff --git a/test/test_stdin.py b/test/test_stdin.py
index 9ffd19ff..790d34ce 100755
--- a/test/test_stdin.py
+++ b/test/test_stdin.py
@@ -10,7 +10,7 @@
import serd_test_util as util
args = util.wrapper_args(__doc__)
-command = [args.tool, "-i", "ntriples", "-B", "http://example.org", "-"]
+command = [args.tool, "-I", "ntriples", "-B", "http://example.org", "-"]
DOC = "<{0}s> <{0}p> <{0}o> .".format("http://example.org/")
diff --git a/tools/console.c b/tools/console.c
index 72b9b222..56464696 100644
--- a/tools/console.c
+++ b/tools/console.c
@@ -85,6 +85,76 @@ serd_choose_syntax(SerdWorld* const world,
return SERD_TRIG;
}
+SerdStatus // Apply NAME as either an input syntax name or a reader option name
+serd_set_input_option(const SerdStringView name,
+ SerdSyntax* const syntax, // written only when NAME is taken as a syntax
+ SerdReaderFlags* const flags) // matching option flag is OR'd in; other bits are kept
+{
+ typedef struct {
+ const char* name;
+ SerdReaderFlag flag;
+ } InputOption;
+
+ static const InputOption input_options[] = {
+ {"lax", SERD_READ_LAX},
+ {"variables", SERD_READ_VARIABLES},
+ {"verbatim", SERD_READ_VERBATIM},
+ {NULL, SERD_READ_LAX}, // terminator: the loop below stops at the NULL name, so this flag is never applied
+ };
+
+ const SerdSyntax named_syntax = serd_syntax_by_name(name.data); // lookup uses the whole string, not name.length
+ if (!serd_strncasecmp(name.data, "empty", name.length) || // NAME compares equal to "empty" over its first name.length chars
+ named_syntax != SERD_SYNTAX_EMPTY) { // or the lookup returned a real syntax (EMPTY presumably doubles as "not found" - TODO confirm)
+ *syntax = named_syntax;
+ return SERD_SUCCESS;
+ }
+
+ for (const InputOption* o = input_options; o->name; ++o) {
+ if (!serd_strncasecmp(o->name, name.data, name.length)) { // NOTE(review): only name.length chars are compared, so a prefix of an option name appears to match - confirm intended
+ *flags |= o->flag;
+ return SERD_SUCCESS;
+ }
+ }
+
+ return SERD_FAILURE; // NAME is neither a syntax nor an entry in input_options
+}
+
+SerdStatus // Apply NAME as either an output syntax name or a writer option name
+serd_set_output_option(const SerdStringView name,
+ SerdSyntax* const syntax, // written only when NAME is taken as a syntax
+ SerdWriterFlags* const flags) // matching option flag is OR'd in; other bits are kept
+{
+ typedef struct {
+ const char* name;
+ SerdWriterFlag flag;
+ } OutputOption;
+
+ static const OutputOption output_options[] = {
+ {"ascii", SERD_WRITE_ASCII},
+ {"expanded", SERD_WRITE_EXPANDED},
+ {"lax", SERD_WRITE_LAX},
+ {"terse", SERD_WRITE_TERSE},
+ {"verbatim", SERD_WRITE_VERBATIM},
+ {NULL, SERD_WRITE_ASCII}, // terminator: the loop below stops at the NULL name, so this flag is never applied
+ };
+
+ const SerdSyntax named_syntax = serd_syntax_by_name(name.data); // lookup uses the whole string, not name.length
+ if (!serd_strncasecmp(name.data, "empty", name.length) || // NAME compares equal to "empty" over its first name.length chars
+ named_syntax != SERD_SYNTAX_EMPTY) { // or the lookup returned a real syntax (EMPTY presumably doubles as "not found" - TODO confirm)
+ *syntax = named_syntax;
+ return SERD_SUCCESS;
+ }
+
+ for (const OutputOption* o = output_options; o->name; ++o) {
+ if (!serd_strncasecmp(o->name, name.data, name.length)) { // NOTE(review): only name.length chars are compared, so a prefix of an option name appears to match - confirm intended
+ *flags |= o->flag;
+ return SERD_SUCCESS;
+ }
+ }
+
+ return SERD_FAILURE; // NAME is neither a syntax nor an entry in output_options
+}
+
/// Wrapper for getc that is compatible with SerdReadFunc but faster than fread
static size_t
serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
diff --git a/tools/console.h b/tools/console.h
index 29b0a7df..a7e8423f 100644
--- a/tools/console.h
+++ b/tools/console.h
@@ -7,9 +7,12 @@
#include "serd/env.h"
#include "serd/input_stream.h"
#include "serd/output_stream.h"
+#include "serd/reader.h"
#include "serd/status.h"
+#include "serd/string_view.h"
#include "serd/syntax.h"
#include "serd/world.h"
+#include "serd/writer.h"
#include <stdio.h>
@@ -27,6 +30,16 @@ serd_choose_syntax(SerdWorld* world,
SerdSyntax requested,
const char* filename);
+SerdStatus
+serd_set_input_option(SerdStringView name,
+ SerdSyntax* syntax,
+ SerdReaderFlags* flags);
+
+SerdStatus
+serd_set_output_option(SerdStringView name,
+ SerdSyntax* syntax,
+ SerdWriterFlags* flags);
+
SerdInputStream
serd_open_tool_input(const char* filename);
diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c
index bdb871fa..16abbd2c 100644
--- a/tools/serd-pipe.c
+++ b/tools/serd-pipe.c
@@ -39,23 +39,20 @@ print_usage(const char* const name, const bool error)
"Use - for INPUT to read from standard input.\n\n"
" -B BASE_URI Base URI.\n"
" -C Convert literals to canonical form.\n"
- " -a Write ASCII output.\n"
+ " -I SYNTAX Input syntax (turtle/ntriples/trig/nquads),\n"
+ " or flag (lax/variables/verbatim).\n"
+ " -O SYNTAX Output syntax (empty/turtle/ntriples/nquads),\n"
+ " or flag (ascii/expanded/verbatim/terse/lax).\n"
" -b BYTES I/O block size.\n"
" -c PREFIX Chop PREFIX from matching blank node IDs.\n"
- " -f Fast and loose URI pass-through.\n"
" -h Display this help and exit.\n"
- " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n"
" -k BYTES Parser stack size.\n"
- " -l Lax (non-strict) parsing.\n"
- " -o SYNTAX Output syntax: empty/turtle/ntriples/nquads.\n"
" -p PREFIX Add PREFIX to blank node IDs.\n"
" -q Suppress all output except data.\n"
" -r ROOT_URI Keep relative URIs within ROOT_URI.\n"
" -s STRING Parse STRING as input.\n"
- " -t Write terser output without newlines.\n"
" -v Display version information and exit.\n"
- " -w FILENAME Write output to FILENAME instead of stdout.\n"
- " -x Support parsing variable nodes like \"?x\".\n";
+ " -w FILENAME Write output to FILENAME instead of stdout.\n";
FILE* const os = error ? stderr : stdout;
fprintf(os, "%s", error ? "\n" : "");
@@ -145,24 +142,12 @@ main(int argc, char** argv)
if (opt == 'C') {
canonical = true;
- } else if (opt == 'a') {
- writer_flags |= SERD_WRITE_ASCII;
- } else if (opt == 'f') {
- writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM);
} else if (opt == 'h') {
return print_usage(prog, false);
- } else if (opt == 'l') {
- reader_flags |= SERD_READ_LAX;
- writer_flags |= SERD_WRITE_LAX;
} else if (opt == 'q') {
quiet = true;
- } else if (opt == 't') {
- writer_flags |= SERD_WRITE_TERSE;
} else if (opt == 'v') {
return serd_print_version(argv[0]);
- } else if (opt == 'x') {
- reader_flags |= SERD_READ_VARIABLES;
- break;
} else if (argv[a][1] == 'B') {
if (++a == argc) {
return missing_arg(prog, 'B');
@@ -170,6 +155,30 @@ main(int argc, char** argv)
base = serd_node_new(NULL, serd_a_uri_string(argv[a]));
break;
+ } else if (opt == 'I') {
+ if (argv[a][o + 1] || ++a == argc) {
+ return missing_arg(prog, 'I');
+ }
+
+ if (serd_set_input_option(
+ serd_string(argv[a]), &input_syntax, &reader_flags)) {
+ return print_usage(argv[0], true);
+ }
+ break;
+ } else if (opt == 'O') {
+ if (argv[a][o + 1] || ++a == argc) {
+ return missing_arg(prog, 'O');
+ }
+
+ if (serd_set_output_option(
+ serd_string(argv[a]), &output_syntax, &writer_flags)) {
+ return print_usage(argv[0], true);
+ }
+
+ osyntax_set =
+ output_syntax != SERD_SYNTAX_EMPTY || !strcmp(argv[a], "empty");
+
+ break;
} else if (opt == 'b') {
if (argv[a][o + 1] || ++a == argc) {
return missing_arg(prog, 'b');
@@ -190,15 +199,6 @@ main(int argc, char** argv)
chop_prefix = argv[a];
break;
- } else if (opt == 'i') {
- if (argv[a][o + 1] || ++a == argc) {
- return missing_arg(prog, 'i');
- }
-
- if (!(input_syntax = serd_syntax_by_name(argv[a]))) {
- return print_usage(prog, true);
- }
- break;
} else if (opt == 'k') {
if (argv[a][o + 1] || ++a == argc) {
return missing_arg(prog, 'k');
@@ -212,18 +212,6 @@ main(int argc, char** argv)
}
stack_size = (size_t)size;
break;
- } else if (opt == 'o') {
- osyntax_set = true;
- if (argv[a][o + 1] || ++a == argc) {
- return missing_arg(prog, 'o');
- }
-
- if (!strcmp(argv[a], "empty")) {
- output_syntax = SERD_SYNTAX_EMPTY;
- } else if (!(output_syntax = serd_syntax_by_name(argv[a]))) {
- return print_usage(argv[0], true);
- }
- break;
} else if (opt == 'p') {
if (argv[a][o + 1] || ++a == argc) {
return missing_arg(prog, 'p');