diff options
-rw-r--r-- | doc/man/serd-pipe.1 | 111 | ||||
-rwxr-xr-x | scripts/serd_bench.py | 3 | ||||
-rw-r--r-- | test/meson.build | 53 | ||||
-rwxr-xr-x | test/run_suite.py | 2 | ||||
-rwxr-xr-x | test/test_base.py | 2 | ||||
-rwxr-xr-x | test/test_empty.py | 2 | ||||
-rwxr-xr-x | test/test_stdin.py | 2 | ||||
-rw-r--r-- | tools/console.c | 70 | ||||
-rw-r--r-- | tools/console.h | 13 | ||||
-rw-r--r-- | tools/serd-pipe.c | 70 |
10 files changed, 215 insertions, 113 deletions
diff --git a/doc/man/serd-pipe.1 b/doc/man/serd-pipe.1 index d731f0b4..0ce40dbd 100644 --- a/doc/man/serd-pipe.1 +++ b/doc/man/serd-pipe.1 @@ -8,13 +8,13 @@ .Nd read and write RDF data .Sh SYNOPSIS .Nm serd-pipe -.Op Fl Cafhlqtvx +.Op Fl Cfhqv .Op Fl B Ar base +.Op Fl I Ar syntax +.Op Fl O Ar syntax .Op Fl b Ar bytes .Op Fl c Ar prefix -.Op Fl i Ar syntax .Op Fl k Ar bytes -.Op Fl o Ar syntax .Op Fl p Ar prefix .Op Fl r Ar root .Op Fl s Ar string @@ -80,9 +80,67 @@ All numeric datatypes are supported, as well as .Vt hexBinary , and .Vt base64Binary . -.It Fl a -Write ASCII output. -If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8. +.It Fl I Ar syntax +Set an input syntax or option. +May be given multiple times. +The case-insensitive +.Ar syntax +can be +.Cm NQuads , +.Cm NTriples , +.Cm TriG , +.Cm Turtle , +or an option: +.Bl -tag -width 3n +.It Cm lax +Tolerate invalid input where possible. +Warnings will be printed for syntax errors, +but parsing will attempt to continue. +Note that data may be lost when using this option! +.It Cm variables +Support parsing variable nodes. +Variables can be written in SPARQL style, for example +.Li ?var +or +.Li $var . +.It Cm verbatim +Normally, the reader expands all relative URIs, +and may adjust blank node labels to avoid clashing with generated ones. +This flag disables all of this processing, +so that URI references and blank nodes are passed to the sink exactly as they are in the input. +Note that this does not apply to CURIEs, since serd deliberately does not +have a way to represent CURIE nodes. +A bad namespace prefix is considered a syntax error. +.El +.It Fl O Ar syntax +Set an output syntax or option. +May be given multiple times. +The case-insensitive +.Ar syntax +can be +.Cm empty , +.Cm NQuads , +.Cm NTriples , +.Cm TriG , +.Cm Turtle , +or an option: +.Bl -tag -width 3n +.It Cm ascii +Escape all non-ASCII characters. +.It Cm expanded +Write expanded URIs instead of prefixed names. +.It Cm lax +Tolerate corrupt UTF-8 and write replacements. +.It Cm terse +Write terser output with fewer, longer lines. +.It Cm verbatim +Write URI references exactly as in the input. +.El +.Pp +The +.Cm empty +syntax suppresses the output, +so that only warnings and errors will be printed. .It Fl b Ar bytes I/O block size. This is the number of bytes in a file that will be read or written at once. @@ -111,39 +169,12 @@ preserve full URIs (without qualifying or making relative), and pass prefixed names through as-is. .It Fl h Print the command line options. -.It Fl i Ar syntax -Read input as -.Ar syntax . -Case is ignored, valid values are: -.Dq NQuads , -.Dq NTriples , -.Dq TriG , -and -.Dq Turtle . .It Fl k Ar bytes Parser stack size. Parsing is performed using a pre-allocated stack for performance and security reasons. By default, the stack is 1 MiB, which should be sufficient for most data. This can be increased to support unusually structured data and huge literals, or decreased to reduce overall memory requirements and reduce startup time. -.It Fl l -Lax (non-strict) parsing. -If this is enabled, recoverable syntax errors will print a warning, but parsing will proceed starting at the next statement if possible. -Note that data may be lost when using this option. -.It Fl o Ar syntax -Write output as -.Ar syntax . -Case is ignored, valid values are: -.Dq empty , -.Dq NQuads , -.Dq NTriples , -.Dq TriG , -and -.Dq Turtle . -The -.Cm empty -syntax suppresses the output, -so that only warnings and errors will be printed. .It Fl p Ar prefix Add .Ar prefix @@ -191,20 +222,12 @@ This is useful for keeping relative references within some directory. Parse .Ar string as input. -.It Fl t -Write terser output without newlines. .It Fl v Display version information and exit. .It Fl w Ar filename Write output to the given .Ar filename instead of stdout. -.It Fl x -Support parsing variable nodes. -Variables can be written in SPARQL style, for example -.Dq ?var -or -.Dq $var . .El .Sh ENVIRONMENT Errors and warnings are printed in color by default if the output is a terminal. @@ -235,6 +258,12 @@ exits with a status of 0, or non-zero if an error occurred. .Nm Fl o .Ar output.ttl .Pa input.nt +.It Expand all prefixed names into full URIs: +.Nm Fl O +.Ar expanded +.Fl o +.Ar expanded.ttl +.Pa input.ttl .It Merge two files: .Nm Fl o .Pa merged.ttl diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py index d764cbda..db8c1c5b 100755 --- a/scripts/serd_bench.py +++ b/scripts/serd_bench.py @@ -271,7 +271,8 @@ example: args = ap.parse_args(sys.argv[1:]) - progs = ["serd-pipe -f -i turtle -o turtle"] + args.run + serd_opts = "-I turtle -I verbatim -O turtle -O verbatim -O expanded" + progs = ["tools/serd-pipe " + serd_opts] + args.run min_n = int(args.max / args.steps) max_n = args.max step = min_n diff --git a/test/meson.build b/test/meson.build index 24180efa..84ae7525 100644 --- a/test/meson.build +++ b/test/meson.build @@ -108,7 +108,7 @@ if get_option('lint') test( ttl_file_path.underscorify(), check_formatting_py, - args: [files(ttl_file_path), serd_pipe, '-o', 'turtle'], + args: [files(ttl_file_path), serd_pipe, '-O', 'turtle'], suite: 'data', ) endforeach @@ -180,22 +180,22 @@ simple_command_tests = { 'bad': [ ['-B', 'nonuriorpath'], ['-B'], + ['-I', 'turtle'], + ['-I', 'unknown'], + ['-I'], + ['-O', 'unknown'], + ['-O'], ['-b', '-1'], ['-b', '1024junk'], ['-b', '9223372036854775807'], ['-b'], ['-c'], - ['-fi'], - ['-i', 'turtle'], - ['-i', 'unknown'], - ['-i'], ['-k', '-1'], ['-k', '1024junk'], ['-k', '9223372036854775807'], ['-k'], - ['-o', 'unknown'], - ['-o'], ['-p'], + ['-qi'], ['-r'], ['-s', '<foo> a <Bar> .'], ['-s'], @@ -395,30 +395,30 @@ test_suites = { files('w3c/nquads/manifest.ttl'), ns_w3 + 'NQuadsTests/', '--', - '-a', - ['-i', 'NQuads'], + ['-I', 'NQuads'], + ['-O', 'ascii'], ], 'ntriples': [ files('w3c/ntriples/manifest.ttl'), ns_w3 + 'NTriplesTests/', '--', - '-a', - ['-i', 'NTriples'], + ['-I', 'NTriples'], + ['-O', 'ascii'], ['-k', '1024'], ], 'trig': [ files('w3c/trig/manifest.ttl'), ns_w3 + 'TriGTests/', '--', - '-a', - ['-i', 'TriG'], + ['-I', 'TriG'], + ['-O', 'ascii'], ], 'turtle': [ files('w3c/turtle/manifest.ttl'), ns_w3 + 'TurtleTests/', '--', - '-a', - ['-i', 'Turtle'], + ['-I', 'Turtle'], + ['-O', 'ascii'], ], 'abbreviate': [ @@ -433,7 +433,7 @@ test_suites = { files('extra/bad/manifest.ttl'), ns_serdtest + 'bad/', '--', - ['-o', 'turtle'], + ['-O', 'turtle'], ], 'big': [ files('extra/big/manifest.ttl'), @@ -443,41 +443,42 @@ test_suites = { files('extra/good/manifest.ttl'), ns_serdtest + 'good/', '--', - '-a', ['-b', '1'], + ['-O', 'ascii'], ], 'canon': [ files('extra/canon/manifest.ttl'), ns_serdtest + 'canon/', '--', '-C', - '-a', + ['-O', 'ascii'], ], 'fast': [ files('extra/perfect/manifest.ttl'), ns_serdtest + 'perfect/', '--', - '-f', + ['-I', 'verbatim'], + ['-O', 'verbatim'], ], 'full': [ files('extra/full/manifest.ttl'), ns_serdtest + 'full/', '--', - '-f', + ['-O', 'expanded'], ], 'good': [ files('extra/good/manifest.ttl'), ns_serdtest + 'good/', '--', - '-a', + ['-O', 'ascii'], ], 'lax_lax': [ '--lax', files('extra/lax/manifest.ttl'), ns_serdtest + 'lax/', '--', - '-a', - '-l', + ['-I', 'lax'], + ['-O', 'ascii'], ], 'lax_strict': [ files('extra/lax/manifest.ttl'), @@ -487,7 +488,7 @@ test_suites = { files('extra/pattern/manifest.ttl'), ns_serdtest + 'pattern/', '--', - '-x', + ['-I', 'variables'], ], 'perfect_forward': [ files('extra/perfect/manifest.ttl'), @@ -519,7 +520,7 @@ test_suites = { files('extra/qualify/manifest.ttl'), ns_serdtest + 'qualify/', '--', - ['-i', 'turtle'], # Just for coverage + ['-I', 'turtle'], # Just for coverage ], 'root': [ files('extra/root/manifest.ttl'), @@ -531,7 +532,7 @@ test_suites = { files('extra/terse/manifest.ttl'), ns_serdtest + 'terse/', '--', - '-t', + ['-O', 'terse'], ], } diff --git a/test/run_suite.py b/test/run_suite.py index cee9d88e..16e527af 100755 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -41,7 +41,7 @@ def run_eval_test(command, in_path, good_path, out_path): """Run a positive eval test and return whether the output matches.""" syntax = util.syntax_from_path(out_path) - command = command + ["-o", syntax, "-w", out_path, in_path] + command = command + ["-O", syntax, "-w", out_path, in_path] subprocess.check_call(command, encoding="utf-8") with open(good_path, "r", encoding="utf-8") as good: diff --git a/test/test_base.py b/test/test_base.py index b63bb135..cb696d2f 100755 --- a/test/test_base.py +++ b/test/test_base.py @@ -10,7 +10,7 @@ import serd_test_util as util args = util.wrapper_args(__doc__) -command = [args.tool, "-B", "http://example.org", "-i", "turtle", "-"] +command = [args.tool, "-B", "http://example.org", "-I", "turtle", "-"] IN_DOC = "<s> <p> <o> ." OUT_DOC = "<{0}s> <{0}p> <{0}o> .".format("http://example.org/") diff --git a/test/test_empty.py b/test/test_empty.py index 0ee641d5..85c603fe 100755 --- a/test/test_empty.py +++ b/test/test_empty.py @@ -14,7 +14,7 @@ import tempfile import serd_test_util as util args = util.wrapper_args(__doc__, True) -command = shlex.split(args.wrapper) + [args.tool, "-o", "empty", args.input] +command = shlex.split(args.wrapper) + [args.tool, "-O", "empty", args.input] with tempfile.TemporaryFile() as out: proc = subprocess.run(command, check=False, stdout=out) diff --git a/test/test_stdin.py b/test/test_stdin.py index 9ffd19ff..790d34ce 100755 --- a/test/test_stdin.py +++ b/test/test_stdin.py @@ -10,7 +10,7 @@ import serd_test_util as util args = util.wrapper_args(__doc__) -command = [args.tool, "-i", "ntriples", "-B", "http://example.org", "-"] +command = [args.tool, "-I", "ntriples", "-B", "http://example.org", "-"] DOC = "<{0}s> <{0}p> <{0}o> .".format("http://example.org/") diff --git a/tools/console.c b/tools/console.c index 72b9b222..56464696 100644 --- a/tools/console.c +++ b/tools/console.c @@ -85,6 +85,76 @@ serd_choose_syntax(SerdWorld* const world, return SERD_TRIG; } +SerdStatus +serd_set_input_option(const SerdStringView name, + SerdSyntax* const syntax, + SerdReaderFlags* const flags) +{ + typedef struct { + const char* name; + SerdReaderFlag flag; + } InputOption; + + static const InputOption input_options[] = { + {"lax", SERD_READ_LAX}, + {"variables", SERD_READ_VARIABLES}, + {"verbatim", SERD_READ_VERBATIM}, + {NULL, SERD_READ_LAX}, + }; + + const SerdSyntax named_syntax = serd_syntax_by_name(name.data); + if (!serd_strncasecmp(name.data, "empty", name.length) || + named_syntax != SERD_SYNTAX_EMPTY) { + *syntax = named_syntax; + return SERD_SUCCESS; + } + + for (const InputOption* o = input_options; o->name; ++o) { + if (!serd_strncasecmp(o->name, name.data, name.length)) { + *flags |= o->flag; + return SERD_SUCCESS; + } + } + + return SERD_FAILURE; +} + +SerdStatus +serd_set_output_option(const SerdStringView name, + SerdSyntax* const syntax, + SerdWriterFlags* const flags) +{ + typedef struct { + const char* name; + SerdWriterFlag flag; + } OutputOption; + + static const OutputOption output_options[] = { + {"ascii", SERD_WRITE_ASCII}, + {"expanded", SERD_WRITE_EXPANDED}, + {"lax", SERD_WRITE_LAX}, + {"terse", SERD_WRITE_TERSE}, + {"verbatim", SERD_WRITE_VERBATIM}, + {NULL, SERD_WRITE_ASCII}, + }; + + const SerdSyntax named_syntax = serd_syntax_by_name(name.data); + if (!serd_strncasecmp(name.data, "empty", name.length) || + named_syntax != SERD_SYNTAX_EMPTY) { + *syntax = named_syntax; + return SERD_SUCCESS; + } + + for (const OutputOption* o = output_options; o->name; ++o) { + if (!serd_strncasecmp(o->name, name.data, name.length)) { + *flags |= o->flag; + return SERD_SUCCESS; + } + } + + return SERD_FAILURE; +} + /// Wrapper for getc that is compatible with SerdReadFunc but faster than fread static size_t serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream) diff --git a/tools/console.h b/tools/console.h index 29b0a7df..a7e8423f 100644 --- a/tools/console.h +++ b/tools/console.h @@ -7,9 +7,12 @@ #include "serd/env.h" #include "serd/input_stream.h" #include "serd/output_stream.h" +#include "serd/reader.h" #include "serd/status.h" +#include "serd/string_view.h" #include "serd/syntax.h" #include "serd/world.h" +#include "serd/writer.h" #include <stdio.h> @@ -27,6 +30,16 @@ serd_choose_syntax(SerdWorld* world, SerdSyntax requested, const char* filename); +SerdStatus +serd_set_input_option(SerdStringView name, + SerdSyntax* syntax, + SerdReaderFlags* flags); + +SerdStatus +serd_set_output_option(SerdStringView name, + SerdSyntax* syntax, + SerdWriterFlags* flags); + SerdInputStream serd_open_tool_input(const char* filename); diff --git a/tools/serd-pipe.c b/tools/serd-pipe.c index bdb871fa..16abbd2c 100644 --- a/tools/serd-pipe.c +++ b/tools/serd-pipe.c @@ -39,23 +39,20 @@ print_usage(const char* const name, const bool error) "Use - for INPUT to read from standard input.\n\n" " -B BASE_URI Base URI.\n" " -C Convert literals to canonical form.\n" - " -a Write ASCII output.\n" + " -I SYNTAX Input syntax (turtle/ntriples/trig/nquads),\n" + " or flag (lax/variables/verbatim).\n" + " -O SYNTAX Output syntax (empty/turtle/ntriples/nquads),\n" + " or flag (ascii/expanded/verbatim/terse/lax).\n" " -b BYTES I/O block size.\n" " -c PREFIX Chop PREFIX from matching blank node IDs.\n" - " -f Fast and loose URI pass-through.\n" " -h Display this help and exit.\n" - " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n" " -k BYTES Parser stack size.\n" - " -l Lax (non-strict) parsing.\n" - " -o SYNTAX Output syntax: empty/turtle/ntriples/nquads.\n" " -p PREFIX Add PREFIX to blank node IDs.\n" " -q Suppress all output except data.\n" " -r ROOT_URI Keep relative URIs within ROOT_URI.\n" " -s STRING Parse STRING as input.\n" - " -t Write terser output without newlines.\n" " -v Display version information and exit.\n" - " -w FILENAME Write output to FILENAME instead of stdout.\n" - " -x Support parsing variable nodes like \"?x\".\n"; + " -w FILENAME Write output to FILENAME instead of stdout.\n"; FILE* const os = error ? stderr : stdout; fprintf(os, "%s", error ? "\n" : ""); @@ -145,24 +142,12 @@ main(int argc, char** argv) if (opt == 'C') { canonical = true; - } else if (opt == 'a') { - writer_flags |= SERD_WRITE_ASCII; - } else if (opt == 'f') { - writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM); } else if (opt == 'h') { return print_usage(prog, false); - } else if (opt == 'l') { - reader_flags |= SERD_READ_LAX; - writer_flags |= SERD_WRITE_LAX; } else if (opt == 'q') { quiet = true; - } else if (opt == 't') { - writer_flags |= SERD_WRITE_TERSE; } else if (opt == 'v') { return serd_print_version(argv[0]); - } else if (opt == 'x') { - reader_flags |= SERD_READ_VARIABLES; - break; } else if (argv[a][1] == 'B') { if (++a == argc) { return missing_arg(prog, 'B'); @@ -170,6 +155,30 @@ main(int argc, char** argv) base = serd_node_new(NULL, serd_a_uri_string(argv[a])); break; + } else if (opt == 'I') { + if (argv[a][o + 1] || ++a == argc) { + return missing_arg(prog, 'I'); + } + + if (serd_set_input_option( + serd_string(argv[a]), &input_syntax, &reader_flags)) { + return print_usage(argv[0], true); + } + break; + } else if (opt == 'O') { + if (argv[a][o + 1] || ++a == argc) { + return missing_arg(prog, 'O'); + } + + if (serd_set_output_option( + serd_string(argv[a]), &output_syntax, &writer_flags)) { + return print_usage(argv[0], true); + } + + osyntax_set = + output_syntax != SERD_SYNTAX_EMPTY || !strcmp(argv[a], "empty"); + + break; } else if (opt == 'b') { if (argv[a][o + 1] || ++a == argc) { return missing_arg(prog, 'b'); @@ -190,15 +199,6 @@ main(int argc, char** argv) chop_prefix = argv[a]; break; - } else if (opt == 'i') { - if (argv[a][o + 1] || ++a == argc) { - return missing_arg(prog, 'i'); - } - - if (!(input_syntax = serd_syntax_by_name(argv[a]))) { - return print_usage(prog, true); - } - break; } else if (opt == 'k') { if (argv[a][o + 1] || ++a == argc) { return missing_arg(prog, 'k'); @@ -212,18 +212,6 @@ main(int argc, char** argv) } stack_size = (size_t)size; break; - } else if (opt == 'o') { - osyntax_set = true; - if (argv[a][o + 1] || ++a == argc) { - return missing_arg(prog, 'o'); - } - - if (!strcmp(argv[a], "empty")) { - output_syntax = SERD_SYNTAX_EMPTY; - } else if (!(output_syntax = serd_syntax_by_name(argv[a]))) { - return print_usage(argv[0], true); - } - break; } else if (opt == 'p') { if (argv[a][o + 1] || ++a == argc) { return missing_arg(prog, 'p'); |