diff options
author | David Robillard <d@drobilla.net> | 2019-05-05 16:12:38 +0200 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2021-03-08 23:23:06 -0500 |
commit | 4e7e642d0d7b6dfa704f5ae95475854bb8c9b0b2 (patch) | |
tree | 6d04338db96fa2546e7e81168eb8ddbafb904b23 | |
parent | 8b353f1c530ae0f8d112648f34ea612c681ba57c (diff) | |
download | serd-4e7e642d0d7b6dfa704f5ae95475854bb8c9b0b2.tar.gz serd-4e7e642d0d7b6dfa704f5ae95475854bb8c9b0b2.tar.bz2 serd-4e7e642d0d7b6dfa704f5ae95475854bb8c9b0b2.zip |
Add support for reading multiple files at once
-rw-r--r-- | doc/serdi.1 | 2 | ||||
-rw-r--r-- | src/serdi.c | 127 | ||||
-rw-r--r-- | test/meson.build | 4 | ||||
-rw-r--r-- | test/multifile/input1.ttl | 2 | ||||
-rw-r--r-- | test/multifile/input2.trig | 7 | ||||
-rw-r--r-- | test/multifile/output.nq | 3 | ||||
-rwxr-xr-x | test/test_multifile.py | 52 | ||||
-rwxr-xr-x | test/test_stdin.py | 7 |
8 files changed, 173 insertions, 31 deletions
diff --git a/doc/serdi.1 b/doc/serdi.1 index c23fa174..c834ce42 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -16,7 +16,7 @@ .Op Fl r Ar root .Op Fl s Ar string .Op Fl w Ar filename -.Ar input +.Ar input ... .Sh DESCRIPTION .Nm is a fast command-line utility for streaming and processing RDF data. diff --git a/src/serdi.c b/src/serdi.c index 30933552..e8ef9897 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -90,6 +90,42 @@ quiet_error_func(void* const handle, const SerdError* const e) return SERD_SUCCESS; } +static SerdStatus +read_file(SerdWorld* const world, + SerdSyntax syntax, + const SerdReaderFlags flags, + const SerdSink* const sink, + const size_t stack_size, + const char* filename, + const char* add_prefix, + bool bulk_read) +{ + syntax = syntax ? syntax : serd_guess_syntax(filename); + syntax = syntax ? syntax : SERD_TRIG; + + SerdStatus st = SERD_SUCCESS; + SerdReader* reader = serd_reader_new(world, syntax, flags, sink, stack_size); + + serd_reader_add_blank_prefix(reader, add_prefix); + + if (!strcmp(filename, "-")) { + SerdNode* name = serd_new_string(SERD_STATIC_STRING("stdin")); + + st = serd_reader_start_stream( + reader, serd_file_read_byte, (SerdStreamErrorFunc)ferror, stdin, name, 1); + + serd_node_free(name); + } else { + st = serd_reader_start_file(reader, filename, bulk_read); + } + + st = st ? st : serd_reader_read_document(reader); + + serd_reader_free(reader); + + return st; +} + int main(int argc, char** argv) { @@ -102,21 +138,19 @@ main(int argc, char** argv) SerdSyntax output_syntax = SERD_SYNTAX_EMPTY; SerdReaderFlags reader_flags = 0; SerdWriterFlags writer_flags = 0; - bool from_stdin = false; bool bulk_read = true; bool bulk_write = false; bool osyntax_set = false; bool quiet = false; size_t stack_size = 4194304; const char* input_string = NULL; - const char* add_prefix = NULL; + const char* add_prefix = ""; const char* chop_prefix = NULL; const char* root_uri = NULL; const char* out_filename = NULL; int a = 1; for (; a < argc && argv[a][0] == '-'; ++a) { if (argv[a][1] == '\0') { - from_stdin = true; break; } @@ -214,19 +248,27 @@ main(int argc, char** argv) return 1; } - const char* input = argv[a++]; + char* const* const inputs = argv + a; + const int n_inputs = argc - a; - if ((!input_syntax && !input) || !(input_syntax = serd_guess_syntax(input))) { - input_syntax = SERD_TRIG; + bool input_has_graphs = serd_syntax_has_graphs(input_syntax); + for (int i = a; i < argc; ++i) { + if (serd_syntax_has_graphs(serd_guess_syntax(argv[i]))) { + input_has_graphs = true; + break; + } } - const bool input_has_graphs = serd_syntax_has_graphs(input_syntax); if (!output_syntax && !osyntax_set) { output_syntax = input_has_graphs ? SERD_NQUADS : SERD_NTRIPLES; } - if (!base && input) { // Use input file URI - base = serd_new_file_uri(SERD_MEASURE_STRING(input), SERD_EMPTY_STRING()); + if (!base && n_inputs == 1 && + (output_syntax == SERD_NQUADS || output_syntax == SERD_NTRIPLES)) { + // Choose base URI from the single input path + if (!(base = serd_new_real_file_uri(inputs[0], NULL))) { + SERDI_ERRORF("unable to determine base URI from path %s\n", inputs[0]); + } } SerdWorld* const world = serd_world_new(); @@ -253,9 +295,6 @@ main(int argc, char** argv) SerdWriter* const writer = serd_writer_new(world, output_syntax, writer_flags, env, byte_sink); - SerdReader* const reader = serd_reader_new( - world, input_syntax, reader_flags, serd_writer_sink(writer), stack_size); - if (quiet) { serd_world_set_error_func(world, quiet_error_func, NULL); } @@ -263,32 +302,62 @@ main(int argc, char** argv) SerdNode* root = serd_new_uri(SERD_MEASURE_STRING(root_uri)); serd_writer_set_root_uri(writer, root); serd_writer_chop_blank_prefix(writer, chop_prefix); - serd_reader_add_blank_prefix(reader, add_prefix); serd_node_free(root); SerdStatus st = SERD_SUCCESS; SerdNode* input_name = NULL; if (input_string) { - input_name = serd_new_string(SERD_STATIC_STRING("string")); - st = serd_reader_start_string(reader, input_string, input_name); - } else if (from_stdin) { - input_name = serd_new_string(SERD_STATIC_STRING("stdin")); - st = serd_reader_start_stream(reader, - serd_file_read_byte, - (SerdStreamErrorFunc)ferror, - stdin, - input_name, - 1); - } else { - st = serd_reader_start_file(reader, input, bulk_read); + SerdReader* const reader = + serd_reader_new(world, + input_syntax ? input_syntax : SERD_TRIG, + reader_flags, + serd_writer_sink(writer), + stack_size); + + serd_reader_add_blank_prefix(reader, add_prefix); + + SerdNode* name = serd_new_string(SERD_STATIC_STRING("string")); + if (!(st = serd_reader_start_string(reader, input_string, name))) { + st = serd_reader_read_document(reader); + } + + serd_node_free(name); + serd_reader_free(reader); } - if (!st) { - st = serd_reader_read_document(reader); + size_t prefix_len = 0; + char* prefix = NULL; + if (n_inputs > 1) { + prefix_len = 8 + strlen(add_prefix); + prefix = (char*)calloc(1, prefix_len); } - serd_reader_finish(reader); - serd_reader_free(reader); + for (int i = 0; i < n_inputs; ++i) { + if (!base) { + SerdNode* file_uri = + serd_new_file_uri(SERD_MEASURE_STRING(inputs[i]), SERD_EMPTY_STRING()); + + serd_env_set_base_uri(env, serd_node_string_view(file_uri)); + serd_node_free(file_uri); + } + + if (n_inputs > 1) { + snprintf(prefix, prefix_len, "f%d%s", i, add_prefix); + } + + if ((st = read_file(world, + input_syntax, + reader_flags, + serd_writer_sink(writer), + stack_size, + inputs[i], + n_inputs > 1 ? prefix : add_prefix, + bulk_read))) { + break; + } + } + free(prefix); + serd_writer_free(writer); serd_node_free(input_name); serd_env_free(env); diff --git a/test/meson.build b/test/meson.build index 6f6d7378..ac1fb205 100644 --- a/test/meson.build +++ b/test/meson.build @@ -89,6 +89,10 @@ if get_option('utils') args: script_args, suite: ['serdi', 'input']) + test('multiple', files('test_multifile.py'), + args: script_args + [meson.current_source_dir() / 'multifile'], + suite: ['serdi', 'input']) + test('string', serdi, args: ['-s', '<foo> a <Bar> .'], should_fail: true, diff --git a/test/multifile/input1.ttl b/test/multifile/input1.ttl new file mode 100644 index 00000000..88c3f8e9 --- /dev/null +++ b/test/multifile/input1.ttl @@ -0,0 +1,2 @@ +[] + a <http://example.org/Type> . diff --git a/test/multifile/input2.trig b/test/multifile/input2.trig new file mode 100644 index 00000000..260080a8 --- /dev/null +++ b/test/multifile/input2.trig @@ -0,0 +1,7 @@ +[] + a <http://example.org/Type> . + +<http://example.org/graph> { + [] + a <http://example.org/OtherType> . +} diff --git a/test/multifile/output.nq b/test/multifile/output.nq new file mode 100644 index 00000000..dd35dc4d --- /dev/null +++ b/test/multifile/output.nq @@ -0,0 +1,3 @@ +_:f0b1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Type> . +_:f1b1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Type> . +_:f1b2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/OtherType> <http://example.org/graph> . diff --git a/test/test_multifile.py b/test/test_multifile.py new file mode 100755 index 00000000..5fb44bc5 --- /dev/null +++ b/test/test_multifile.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +"""Test reading from several input files.""" + +import argparse +import difflib +import os +import shlex +import subprocess +import sys +import tempfile + +parser = argparse.ArgumentParser(description=__doc__) + +parser.add_argument("--serdi", default="./serdi", help="path to serdi") +parser.add_argument("--wrapper", default="", help="executable wrapper") +parser.add_argument("testdir", help="multifile test directory") + +args = parser.parse_args(sys.argv[1:]) +in1_path = os.path.join(args.testdir, "input1.ttl") +in2_path = os.path.join(args.testdir, "input2.trig") +check_path = os.path.join(args.testdir, "output.nq") +command = shlex.split(args.wrapper) + [args.serdi, in1_path, in2_path] + + +def _show_diff(from_lines, to_lines, from_filename, to_filename): + same = True + for line in difflib.unified_diff( + from_lines, + to_lines, + fromfile=os.path.abspath(from_filename), + tofile=os.path.abspath(to_filename), + ): + sys.stderr.write(line) + same = False + + return same + + +with tempfile.TemporaryFile(mode="w+", encoding="utf-8") as out: + proc = subprocess.run(command, check=False, stdout=out) + + assert proc.returncode == 0 + + out.seek(0) + with open(check_path, "r", encoding="utf-8") as check: + + output_matches = _show_diff( + check.readlines(), out.readlines(), check_path, "output" + ) + + assert output_matches diff --git a/test/test_stdin.py b/test/test_stdin.py index 28286e09..9a27fcd9 100755 --- a/test/test_stdin.py +++ b/test/test_stdin.py @@ -14,7 +14,12 @@ parser.add_argument("--serdi", default="./serdi", help="path to serdi") parser.add_argument("--wrapper", default="", help="executable wrapper") args = parser.parse_args(sys.argv[1:]) -command = shlex.split(args.wrapper) + [args.serdi, "-"] +command = shlex.split(args.wrapper) + [ + args.serdi, + "-I", + "http://example.org", + "-", +] DOCUMENT = "<{0}s> <{0}p> <{0}o> .".format("http://example.org/") |