From dc01b7e301e91d0d7bfc358f569f4f3849471c52 Mon Sep 17 00:00:00 2001 From: David Robillard <d@drobilla.net> Date: Sun, 1 Aug 2021 20:09:38 -0400 Subject: Collapse input and output options into a single flag --- doc/serdi.1 | 89 ++++++++++++++++++++++++++++++++++---------------- test/meson.build | 9 ++--- test/run_test_suite.py | 13 +++++--- tools/console.c | 71 ++++++++++++++++++++++++++++++++++++++++ tools/console.h | 10 ++++++ tools/serdi.c | 36 ++++++++------------ 6 files changed, 168 insertions(+), 60 deletions(-) diff --git a/doc/serdi.1 b/doc/serdi.1 index 99652738..698bec29 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -6,7 +6,7 @@ .Nd read, filter, transform, and write RDF data .Sh SYNOPSIS .Nm serdi -.Op Fl Cafhlmqtvx +.Op Fl Cfhmqv .Op Fl F Ar pattern | Fl G Ar pattern .Op Fl I Ar base .Op Fl b Ar bytes @@ -74,10 +74,6 @@ the URI of the file is automatically used as the base URI. This option can be used to override that, or to provide a base URI for input from stdin or a string. .Pp -.It Fl a -Write ASCII output. -If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8. -.Pp .It Fl b Ar bytes I/O block size. This is the number of bytes in a file that will be read or written at once. @@ -102,15 +98,44 @@ since blank nodes will not be inlined. Print the command line options. .Pp .It Fl i Ar syntax -Read input as -.Ar syntax . -Case is ignored, valid values are: +Set an input syntax option. +May be given multiple times. +The case-insensitive +.Ar syntax +can be either a syntax name or an input syntax option. +The supported syntaxes are .Dq NQuads , .Dq NTriples , .Dq TriG , and .Dq Turtle . .Pp +The supported input options are: +.Pp +.Bl -tag -width "QvariablesQ" -compact -offset indent +.It Dq lax +Tolerate invalid input where possible. +Warnings will be printed on syntax errors, +but parsing will attempt to continue. +Note that data may be lost when using this option! +.Pp +.It Dq variables +Support parsing variable nodes. +Variables can be written in SPARQL style, for example +.Dq ?var +or +.Dq $var . +.Pp +.It Dq verbatim +Normally, the reader expands all relative URIs, +and may adjust blank node labels to avoid clashing with generated ones. +This flag disables all of this processing, +so that URI references and blank nodes are passed to the sink exactly as they are in the input. +Note that this does not apply to CURIEs, since serd deliberately does not +have a way to represent CURIE nodes. A bad namespace prefix is considered +a syntax error. +.El +.Pp .It Fl k Ar bytes Parser stack size. For performance and security reasons, parsing is performed with a fixed-size stack. @@ -119,11 +144,6 @@ If some data has very deep nesting or very large literal values, it may exceed the default amount of space, and this option can be used to increase it and allow the document to be parsed successfully. .Pp -.It Fl l -Lax (non-strict) parsing. -If this is enabled, recoverable syntax errors will print a warning, but parsing will proceed starting at the next statement if possible. -Note that data may be lost when using this option. -.Pp .It Fl m Build a model in memory. This loads all of the input into memory before writing the output. @@ -131,18 +151,39 @@ This will reorder statements and eliminate duplicates, at the cost of performanc When writing TriG or Turtle, this may enable better pretty-printing with more inline descriptions. .Pp .It Fl o Ar syntax -Write output as -.Ar syntax . -Case is ignored, valid values are: +Set an output syntax option. +May be given multiple times. +The case-insensitive +.Ar syntax +can be either a syntax name or an output syntax option. +The supported syntaxes are .Dq empty , .Dq NQuads , .Dq NTriples , .Dq TriG , and .Dq Turtle . -When -.Dq empty -is given, output is suppressed, so only errors will be printed. +.Pp +The supported output options are: +.Pp +.Bl -tag -width "QverbatimQ" -compact -offset indent +.It Dq ascii +Escape all non-ASCII characters. +.Pp +.It Dq expanded +Write expanded URIs instead of prefixed names. +.Pp +.It Dq verbatim +Write URI references exactly as they are in the input. +This avoids resolving URIs and making them relative to the output base URI. +.Pp +.It Dq terse +Write terser output without newlines. +.Pp +.It Dq lax +Tolerate invalid UTF-8 by writing the replacement character when necessary. +Note that data may be lost when using this option! +.El .Pp .It Fl p Ar prefix Add @@ -167,9 +208,6 @@ Parse .Ar string as input. .Pp -.It Fl t -Write terser output without newlines. -.Pp .It Fl v Display version information and exit. .Pp @@ -177,13 +215,6 @@ Display version information and exit. Write output to the given .Ar filename instead of stdout. -.Pp -.It Fl x -Support parsing variable nodes. -Variables can be written in SPARQL style, for example -.Dq ?var -or -.Dq $var . .El .Sh EXIT STATUS .Nm diff --git a/test/meson.build b/test/meson.build index 2cf3f32c..95fa9587 100644 --- a/test/meson.build +++ b/test/meson.build @@ -247,12 +247,12 @@ if is_variable('serdi') timeout: 240) endforeach - ### The terse suite needs to be run with -t + ### The terse suite needs to be run with -o terse test('terse', run_test_suite, args: script_args + ['--osyntax', 'turtle', files('terse/manifest.ttl'), serd_base + 'terse/', - '--', '-t'], + '--', '-o', 'terse'], env: test_env, suite: ['rdf', 'serd'], timeout: 240) @@ -260,7 +260,7 @@ if is_variable('serdi') manifest = files('pattern' / 'manifest.ttl') base_uri = serd_base + 'pattern' + '/' test('pattern', run_test_suite, - args: script_args + [manifest, base_uri, '--', '-x'], + args: script_args + [manifest, base_uri, '--', '-i', 'variables'], env: test_env, suite: ['rdf', 'serd'], timeout: 240) @@ -287,7 +287,8 @@ if is_variable('serdi') ### ... and once with lax parsing to tolerate them test('lax.lax', run_test_suite, - args: script_args + [lax_manifest, lax_base_uri, '--', '-l'], + args: script_args + [lax_manifest, lax_base_uri, '--', + '-i', 'lax', '-o', 'lax'], env: test_env, is_parallel: false, suite: ['rdf', 'serd'], diff --git a/test/run_test_suite.py b/test/run_test_suite.py index 0d865c1f..1c5a7cf3 100755 --- a/test/run_test_suite.py +++ b/test/run_test_suite.py @@ -26,7 +26,7 @@ def log_error(message): def test_osyntax_options(osyntax): if osyntax.lower() == "ntriples" or osyntax.lower() == "nquads": - return ["-a"] + return ["-o", "ascii"] return [] @@ -80,7 +80,8 @@ def test_thru( "foo", "-w", thru_path, - "-a", + "-o", + "ascii", "-I", base_uri, out_path, @@ -243,9 +244,10 @@ def test_suite( test_path = os.path.join(test_dir, test_name) command = command_prefix + [ - "-a", "-o", osyntax, + "-o", + "ascii", "-I", test_uri, test_path, @@ -305,9 +307,10 @@ def test_suite( model_command = command_prefix + [ "-m", - "-a", "-o", osyntax, + "-o", + "ascii", "-w", out_filename, "-I", @@ -365,7 +368,7 @@ def test_suite( if test_class.startswith(ns_rdftest): expected = ( 1 - if "-l" not in command_prefix and "Negative" in test_class + if "lax" not in command_prefix and "Negative" in test_class else 0 ) run_tests(test_class, instances, expected, results) diff --git a/tools/console.c b/tools/console.c index df1bc2ff..339aca29 100644 --- a/tools/console.c +++ b/tools/console.c @@ -56,6 +56,77 @@ serd_print_version(const char* const program) return 0; } +SerdStatus +serd_set_input_option(const SerdStringView name, + SerdSyntax* const syntax, + SerdReaderFlags* const flags) +{ + typedef struct { + const char* name; + SerdReaderFlag flag; + } InputOption; + + static const InputOption input_options[] = { + {"lax", SERD_READ_LAX}, + {"variables", SERD_READ_VARIABLES}, + {"verbatim", SERD_READ_VERBATIM}, + {NULL, SERD_READ_LAX}, + }; + + const SerdSyntax named_syntax = serd_syntax_by_name(name.buf); + if (!serd_strncasecmp(name.buf, "empty", name.len) || + named_syntax != SERD_SYNTAX_EMPTY) { + *syntax = named_syntax; + return SERD_SUCCESS; + } + + for (const InputOption* o = input_options; o->name; ++o) { + if (!serd_strncasecmp(o->name, name.buf, name.len)) { + *flags |= o->flag; + return SERD_SUCCESS; + } + } + + // SERDI_ERRORF("invalid input option `%s'\n", name.buf); + return SERD_FAILURE; +} + +SerdStatus +serd_set_output_option(const SerdStringView name, + SerdSyntax* const syntax, + SerdWriterFlags* const flags) +{ + typedef struct { + const char* name; + SerdWriterFlag flag; + } OutputOption; + + static const OutputOption output_options[] = { + {"ascii", SERD_WRITE_ASCII}, + {"expanded", SERD_WRITE_EXPANDED}, + {"verbatim", SERD_WRITE_VERBATIM}, + {"terse", SERD_WRITE_TERSE}, + {"lax", SERD_WRITE_LAX}, + {NULL, SERD_WRITE_ASCII}, + }; + + const SerdSyntax named_syntax = serd_syntax_by_name(name.buf); + if (!serd_strncasecmp(name.buf, "empty", name.len) || + named_syntax != SERD_SYNTAX_EMPTY) { + *syntax = named_syntax; + return SERD_SUCCESS; + } + + for (const OutputOption* o = output_options; o->name; ++o) { + if (!serd_strncasecmp(o->name, name.buf, name.len)) { + *flags |= o->flag; + return SERD_SUCCESS; + } + } + + return SERD_FAILURE; +} + /// Wrapper for getc that is compatible with SerdReadFunc but faster than fread static size_t serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream) diff --git a/tools/console.h b/tools/console.h index 31076b24..5d174f1e 100644 --- a/tools/console.h +++ b/tools/console.h @@ -24,6 +24,16 @@ serd_set_stream_utf8_mode(FILE* stream); int serd_print_version(const char* program); +SerdStatus +serd_set_input_option(SerdStringView name, + SerdSyntax* syntax, + SerdReaderFlags* flags); + +SerdStatus +serd_set_output_option(SerdStringView name, + SerdSyntax* syntax, + SerdWriterFlags* flags); + SerdByteSource* serd_open_input(const char* filename, size_t block_size); diff --git a/tools/serdi.c b/tools/serdi.c index f46cab48..db1cfde0 100644 --- a/tools/serdi.c +++ b/tools/serdi.c @@ -45,24 +45,22 @@ print_usage(const char* const name, const bool error) " -F PATTERN Filter out statements that match PATTERN.\n" " -G PATTERN Only include statements matching PATTERN.\n" " -I BASE_URI Input base URI.\n" - " -a Write ASCII output if possible.\n" " -b BYTES I/O block size.\n" " -c PREFIX Chop PREFIX from matching blank node IDs.\n" " -f Fast and loose mode (possibly ugly output).\n" " -h Display this help and exit.\n" - " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n" + " -i SYNTAX Input syntax (turtle/ntriples/trig/nquads),\n" + " or flag (lax/variables/verbatim).\n" " -k BYTES Parser stack size.\n" - " -l Lax (non-strict) parsing.\n" " -m Build a model in memory before writing.\n" - " -o SYNTAX Output syntax: empty/turtle/ntriples/nquads.\n" + " -o SYNTAX Output syntax (empty/turtle/ntriples/nquads),\n" + " or flag (ascii/expanded/verbatim/terse/lax).\n" " -p PREFIX Add PREFIX to blank node IDs.\n" " -q Suppress all output except data.\n" " -r ROOT_URI Keep relative URIs within ROOT_URI.\n" " -s STRING Parse STRING as input.\n" - " -t Write terser output without newlines.\n" " -v Display version information and exit.\n" - " -w FILENAME Write output to FILENAME instead of stdout.\n" - " -x Support parsing variable nodes like `?x'.\n"; + " -w FILENAME Write output to FILENAME instead of stdout.\n"; FILE* const os = error ? stderr : stdout; fprintf(os, "%s", error ? "\n" : ""); @@ -207,26 +205,17 @@ main(int argc, char** argv) if (opt == 'C') { canonical = true; - } else if (opt == 'a') { - writer_flags |= SERD_WRITE_ASCII; } else if (opt == 'f') { no_inline = true; writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM); } else if (opt == 'h') { return print_usage(prog, false); - } else if (opt == 'l') { - reader_flags |= SERD_READ_LAX; - writer_flags |= SERD_WRITE_LAX; } else if (argv[a][1] == 'm') { use_model = true; } else if (opt == 'q') { quiet = true; - } else if (opt == 't') { - writer_flags |= SERD_WRITE_TERSE; } else if (opt == 'v') { return serd_print_version(argv[0]); - } else if (opt == 'x') { - reader_flags |= SERD_READ_VARIABLES; } else if (argv[a][1] == 'F') { if (++a == argc) { return missing_arg(argv[0], 'F'); @@ -273,8 +262,9 @@ main(int argc, char** argv) return missing_arg(prog, 'i'); } - if (!(input_syntax = serd_syntax_by_name(argv[a]))) { - return print_usage(prog, true); + if (serd_set_input_option( + SERD_STRING(argv[a]), &input_syntax, &reader_flags)) { + return print_usage(argv[0], true); } break; } else if (opt == 'k') { @@ -291,16 +281,18 @@ main(int argc, char** argv) stack_size = (size_t)size; break; } else if (opt == 'o') { - osyntax_set = true; if (argv[a][o + 1] || ++a == argc) { return missing_arg(prog, 'o'); } - if (!strcmp(argv[a], "empty")) { - output_syntax = SERD_SYNTAX_EMPTY; - } else if (!(output_syntax = serd_syntax_by_name(argv[a]))) { + if (serd_set_output_option( + SERD_STRING(argv[a]), &output_syntax, &writer_flags)) { return print_usage(argv[0], true); } + + osyntax_set = + output_syntax != SERD_SYNTAX_EMPTY || !strcmp(argv[a], "empty"); + break; } else if (opt == 'p') { if (argv[a][o + 1] || ++a == argc) { -- cgit v1.2.1