diff options
author | David Robillard <d@drobilla.net> | 2023-05-05 12:35:46 -0400 |
---|---|---|
committer | David Robillard <d@drobilla.net> | 2023-12-02 18:49:08 -0500 |
commit | 439d6ec3d6dfbea74334beace790f500e61c9b7d (patch) | |
tree | e385755a7d557dd5eb6f33b841072375cfaca29d | |
parent | c9afaab2a84f592e4567b37b3551511381e734e4 (diff) | |
download | serd-439d6ec3d6dfbea74334beace790f500e61c9b7d.tar.gz serd-439d6ec3d6dfbea74334beace790f500e61c9b7d.tar.bz2 serd-439d6ec3d6dfbea74334beace790f500e61c9b7d.zip |
Add statement filter sink and serd-filter tool
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | doc/man/meson.build | 2 | ||||
-rw-r--r-- | doc/man/serd-filter.1 | 172 | ||||
-rw-r--r-- | doc/man/serd-pipe.1 | 2 | ||||
-rw-r--r-- | include/serd/filter.h | 60 | ||||
-rw-r--r-- | include/serd/serd.h | 1 | ||||
-rw-r--r-- | include/serd/status.h | 1 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | src/filter.c | 127 | ||||
-rw-r--r-- | src/string.c | 2 | ||||
-rw-r--r-- | test/extra/filter/input.ttl | 9 | ||||
-rw-r--r-- | test/extra/filter/manifest.ttl | 48 | ||||
-rw-r--r-- | test/extra/filter/meson.build | 17 | ||||
-rw-r--r-- | test/extra/filter/o1.pattern.nt | 1 | ||||
-rw-r--r-- | test/extra/filter/o1.result.nt | 2 | ||||
-rw-r--r-- | test/extra/filter/p1.pattern.nt | 1 | ||||
-rw-r--r-- | test/extra/filter/p1.result.nt | 2 | ||||
-rw-r--r-- | test/extra/filter/s1.pattern.nt | 1 | ||||
-rw-r--r-- | test/extra/filter/s1.result.nt | 2 | ||||
-rw-r--r-- | test/meson.build | 110 | ||||
-rwxr-xr-x | test/run_filter_suite.py | 109 | ||||
-rwxr-xr-x | test/run_suite.py | 5 | ||||
-rw-r--r-- | test/serd_test_util/__init__.py | 13 | ||||
-rw-r--r-- | test/test_filter.c | 61 | ||||
-rwxr-xr-x | test/test_patterns.py | 83 | ||||
-rw-r--r-- | test/test_string.c | 2 | ||||
-rw-r--r-- | tools/meson.build | 10 | ||||
-rw-r--r-- | tools/serd-filter.c | 317 |
29 files changed, 1148 insertions, 18 deletions
@@ -3,6 +3,7 @@ serd (1.1.1) unstable; urgency=medium * Add SerdBuffer for mutable buffers to keep SerdChunk const-correct * Add SerdWorld for shared library state * Add extensible logging API + * Add filtering of statements by a triple or quad pattern * Add support for converting literals to canonical form * Add support for parsing variables * Add support for writing terse output with minimal newlines @@ -25,7 +26,7 @@ serd (1.1.1) unstable; urgency=medium * Use a fixed-size reader stack * Use char* for strings in public API - -- David Robillard <d@drobilla.net> Mon, 19 Dec 2022 20:55:34 +0000 + -- David Robillard <d@drobilla.net> Wed, 13 Jul 2022 23:12:23 +0000 serd (0.32.0) stable; urgency=medium @@ -57,6 +57,7 @@ Documentation * [Installation instructions](INSTALL.md) * [Single-page API reference](https://drobilla.gitlab.io/serd/doc/singlehtml/) * [Paginated API reference](https://drobilla.gitlab.io/serd/doc/html/) + * [`serd-filter` man page](https://drobilla.gitlab.io/serd/man/serd-filter.html) * [`serd-pipe` man page](https://drobilla.gitlab.io/serd/man/serd-pipe.html) Versioning diff --git a/doc/man/meson.build b/doc/man/meson.build index ae0c1c51..575d3d71 100644 --- a/doc/man/meson.build +++ b/doc/man/meson.build @@ -21,6 +21,7 @@ if not get_option('tools').disabled() ) endif + install_man(files('serd-filter.1')) install_man(files('serd-pipe.1')) endif @@ -44,6 +45,7 @@ if not get_option('tools').disabled() ] page_names = [ + 'serd-filter', 'serd-pipe', ] diff --git a/doc/man/serd-filter.1 b/doc/man/serd-filter.1 new file mode 100644 index 00000000..634d5f3b --- /dev/null +++ b/doc/man/serd-filter.1 @@ -0,0 +1,172 @@ +.\" # Copyright 2021-2022 David Robillard <d@drobilla.net> +.\" # SPDX-License-Identifier: ISC +.Dd July 15, 2022 +.Dt SERD-FILTER 1 +.Os Serd +.Sh NAME +.Nm serd-filter +.Nd print RDF statements that match a pattern +.Sh SYNOPSIS +.Nm serd-filter +.Op Fl hVv +.Op Fl B Ar base +.Op Fl I Ar syntax +.Op Fl O Ar syntax +.Op Fl b Ar bytes +.Op 
Fl f Ar pattern_file +.Op Fl k Ar bytes +.Op Fl o Ar filename +.Ar pattern +.Op Ar input ... +.Sh DESCRIPTION +.Nm +filters statements in RDF data. +Its interface is similar to +.Xr grep 1 , +except patterns are structural: +instead of matching characters within a line, +.Nm +matches nodes within a statement. +Only those statements from the input that match the pattern +(or do not match the pattern, if +.Fl v +is given) are written. +.Pp +Patterns are written in NTriples or NQuads with an extension that allows variables written like +.Li ?this +or +.Li $that . +.Pp +Input and output arguments work the same way as with +.Xr serd-pipe 1 . +.Pp +The options are as follows: +.Pp +.Bl -tag -compact -width 3n +.It Fl B Ar base +Base URI, path, or +.Cm rebase +to use the output path. +See +.Xr serd-pipe 1 +for details. +.Pp +.It Fl I Ar syntax +Input syntax or option: +.Cm NQuads , +.Cm NTriples , +.Cm TriG , +.Cm Turtle , +.Cm lax , +.Cm variables , +.Cm relative , +or +.Cm labels . +See +.Xr serd-pipe 1 +for details. +.Pp +.It Fl O Ar syntax +Output syntax or option: +.Cm empty , +.Cm NQuads , +.Cm NTriples , +.Cm TriG , +.Cm Turtle , +.Cm ascii , +.Cm expanded , +.Cm verbatim , +.Cm terse , +or +.Cm lax . +See +.Xr serd-pipe 1 +for details. +.Pp +.It Fl V +Display version information and exit. +.Pp +.It Fl b Ar bytes +I/O block size. +See +.Xr serd-pipe 1 +for details. +.Pp +.It Fl f Ar pattern_file +Load pattern from +.Ar pattern_file +instead of the first positional argument. +.Pp +.It Fl h +Print the command line options. +.Pp +.It Fl k Ar bytes +Parser stack size. +See +.Xr serd-pipe 1 +for details. +.Pp +.It Fl o Ar filename +Write output to the given +.Ar filename +instead of stdout. +.Pp +.It Fl v +Invert filter to only emit statements that do +.Em not +match the pattern. +.El +.Sh EXIT STATUS +.Nm +exits with a status of 0, or non-zero if an error occurred. +.Sh EXAMPLES +To print all type statements: +.Pp +.Dl $ serd-filter '?subject a ?type .' 
input.ttl +.Pp +To print every statement about http://example.org/subject: +.Pp +.Dl $ serd-filter '<http://example.org/subject> ?p ?o .' input.ttl +.Sh SEE ALSO +.Bl -item -compact +.It +.Xr serd-pipe 1 +.It +.Lk http://drobilla.net/software/serd/ +.El +.Sh STANDARDS +.Bl -item -compact +.It +.Rs +.%A W3C +.%T RDF 1.1 NQuads +.%D February 2014 +.Re +.Lk https://www.w3.org/TR/n-quads/ +.It +.Rs +.%A W3C +.%D February 2014 +.%T RDF 1.1 NTriples +.Re +.Lk https://www.w3.org/TR/n-triples/ +.It +.Rs +.%A W3C +.%T RDF 1.1 TriG +.%D February 2014 +.Re +.Lk https://www.w3.org/TR/trig/ +.It +.Rs +.%A W3C +.%D February 2014 +.%T RDF 1.1 Turtle +.Re +.Lk https://www.w3.org/TR/turtle/ +.El +.Sh AUTHORS +.Nm +is a part of serd, by +.An David Robillard +.Mt d@drobilla.net . diff --git a/doc/man/serd-pipe.1 b/doc/man/serd-pipe.1 index 7f91de29..2d6534ae 100644 --- a/doc/man/serd-pipe.1 +++ b/doc/man/serd-pipe.1 @@ -279,6 +279,8 @@ exits with a status of 0, or non-zero if an error occurred. .Sh SEE ALSO .Bl -item -compact .It +.Xr serd-filter 1 +.It .Lk http://drobilla.net/software/serd/ .It .Lk http://gitlab.com/drobilla/serd/ diff --git a/include/serd/filter.h b/include/serd/filter.h new file mode 100644 index 00000000..856df530 --- /dev/null +++ b/include/serd/filter.h @@ -0,0 +1,60 @@ +// Copyright 2011-2022 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#ifndef SERD_FILTER_H +#define SERD_FILTER_H + +#include "serd/attributes.h" +#include "serd/node.h" +#include "serd/sink.h" +#include "serd/world.h" +#include "zix/attributes.h" + +#include <stdbool.h> + +SERD_BEGIN_DECLS + +/** + @defgroup serd_filter Filter + @ingroup serd_streaming + @{ +*/ + +/** + Return a new sink that filters out statements that do not match a pattern. + + The returned sink acts like `target` in all respects, except that some + statements may be dropped. + + @param world The world to create the sink in. + + @param target The target sink to pass the filtered data to. 
+ + @param subject The optional subject of the filter pattern. + + @param predicate The optional predicate of the filter pattern. + + @param object The optional object of the filter pattern. + + @param graph The optional graph of the filter pattern. + + @param inclusive If true, then only statements that match the pattern are + passed through. Otherwise, only statements that do *not* match the pattern + are passed through. +*/ +SERD_API SerdSink* ZIX_ALLOCATED +serd_filter_new(const SerdWorld* ZIX_NONNULL world, + const SerdSink* ZIX_NONNULL target, + const SerdNode* ZIX_NULLABLE subject, + const SerdNode* ZIX_NULLABLE predicate, + const SerdNode* ZIX_NULLABLE object, + const SerdNode* ZIX_NULLABLE graph, + bool inclusive); + +/** + @} +*/ + +SERD_END_DECLS + +#endif // SERD_FILTER_H diff --git a/include/serd/serd.h b/include/serd/serd.h index f874a0cc..77d1abf8 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -72,6 +72,7 @@ #include "serd/canon.h" #include "serd/env.h" #include "serd/event.h" +#include "serd/filter.h" #include "serd/sink.h" /** diff --git a/include/serd/status.h b/include/serd/status.h index c6047aff..5aedd5a6 100644 --- a/include/serd/status.h +++ b/include/serd/status.h @@ -39,6 +39,7 @@ typedef enum { SERD_BAD_URI, ///< Invalid or unresolved URI SERD_BAD_DATA, ///< Invalid data SERD_BAD_LITERAL, ///< Invalid literal + SERD_BAD_PATTERN, ///< Invalid statement pattern } SerdStatus; /// Return a string describing a status code diff --git a/meson.build b/meson.build index e877fc9c..96fb60d1 100644 --- a/meson.build +++ b/meson.build @@ -133,6 +133,7 @@ c_headers = files( 'include/serd/caret.h', 'include/serd/env.h', 'include/serd/event.h', + 'include/serd/filter.h', 'include/serd/input_stream.h', 'include/serd/log.h', 'include/serd/memory.h', @@ -164,6 +165,7 @@ sources = files( 'src/canon.c', 'src/caret.c', 'src/env.c', + 'src/filter.c', 'src/input_stream.c', 'src/log.c', 'src/memory.c', diff --git a/src/filter.c b/src/filter.c new 
file mode 100644 index 00000000..581a7b72 --- /dev/null +++ b/src/filter.c @@ -0,0 +1,127 @@ +// Copyright 2019-2022 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#include "serd/filter.h" + +#include "serd/event.h" +#include "serd/memory.h" +#include "serd/statement.h" +#include "serd/status.h" + +#include "memory.h" +#include "sink.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdlib.h> + +typedef struct { + const SerdSink* target; + SerdNode* subject; + SerdNode* predicate; + SerdNode* object; + SerdNode* graph; + bool inclusive; +} SerdFilterData; + +static void +free_data(void* const handle) +{ + if (handle) { + SerdFilterData* const data = (SerdFilterData*)handle; + SerdAllocator* const allocator = data->target->allocator; + + serd_node_free(allocator, data->subject); + serd_node_free(allocator, data->predicate); + serd_node_free(allocator, data->object); + serd_node_free(allocator, data->graph); + serd_afree(allocator, data); + } +} + +static SerdStatus +serd_filter_on_event(void* const handle, const SerdEvent* const event) +{ + const SerdFilterData* const data = (SerdFilterData*)handle; + + if (event->type == SERD_STATEMENT) { + const bool matches = serd_statement_matches(event->statement.statement, + data->subject, + data->predicate, + data->object, + data->graph); + + if (data->inclusive == matches) { + // Emit statement with reset flags to avoid confusing the writer + SerdEvent out_event = *event; + out_event.statement.flags = 0U; + return serd_sink_write_event(data->target, &out_event); + } + + return SERD_SUCCESS; // Skip statement + } + + return event->type == SERD_END ? 
SERD_SUCCESS + : serd_sink_write_event(data->target, event); +} + +SerdSink* +serd_filter_new(const SerdWorld* const world, + const SerdSink* const target, + const SerdNode* const subject, + const SerdNode* const predicate, + const SerdNode* const object, + const SerdNode* const graph, + const bool inclusive) +{ + assert(world); + assert(target); + + SerdAllocator* const allocator = serd_world_allocator(world); + SerdFilterData* const data = + (SerdFilterData*)serd_wcalloc(world, 1, sizeof(SerdFilterData)); + + if (!data) { + return NULL; + } + + data->target = target; + data->inclusive = inclusive; + + if (subject && serd_node_type(subject) != SERD_VARIABLE) { + if (!(data->subject = serd_node_copy(allocator, subject))) { + free_data(data); + return NULL; + } + } + + if (predicate && serd_node_type(predicate) != SERD_VARIABLE) { + if (!(data->predicate = serd_node_copy(allocator, predicate))) { + free_data(data); + return NULL; + } + } + + if (object && serd_node_type(object) != SERD_VARIABLE) { + if (!(data->object = serd_node_copy(allocator, object))) { + free_data(data); + return NULL; + } + } + + if (graph && serd_node_type(graph) != SERD_VARIABLE) { + if (!(data->graph = serd_node_copy(allocator, graph))) { + free_data(data); + return NULL; + } + } + + SerdSink* const sink = + serd_sink_new(allocator, data, serd_filter_on_event, free_data); + + if (!sink) { + free_data(data); + } + + return sink; +} diff --git a/src/string.c b/src/string.c index a1151c60..b8227100 100644 --- a/src/string.c +++ b/src/string.c @@ -62,6 +62,8 @@ serd_strerror(const SerdStatus status) return "Invalid data"; case SERD_BAD_LITERAL: return "Invalid literal"; + case SERD_BAD_PATTERN: + return "Invalid statement pattern"; } return "Unknown error"; diff --git a/test/extra/filter/input.ttl b/test/extra/filter/input.ttl new file mode 100644 index 00000000..59aa67f7 --- /dev/null +++ b/test/extra/filter/input.ttl @@ -0,0 +1,9 @@ +@prefix eg: <http://example.org/> . 
+ +eg:s1 + eg:p1 eg:o1 ; + eg:p2 eg:o2 . + +eg:s2 + eg:p1 eg:o1 ; + eg:p2 eg:o2 . diff --git a/test/extra/filter/manifest.ttl b/test/extra/filter/manifest.ttl new file mode 100644 index 00000000..6ac5cec8 --- /dev/null +++ b/test/extra/filter/manifest.ttl @@ -0,0 +1,48 @@ +@prefix checks: <http://drobilla.net/ns/serd/checks#> . +@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix rdft: <http://www.w3.org/ns/rdftest#> . +@prefix serd: <http://drobilla.net/ns/serd#> . + +serd:TestFilterPositive + a rdfs:Class ; + rdfs:label "Positive Filtering" ; + rdfs:subClassOf rdft:Test . + +serd:patternFile + a rdf:Property ; + rdfs:label "pattern file" . + +rdft:Test + rdfs:subClassOf mf:ManifestEntry . + +<> + a mf:Manifest ; + rdfs:comment "Serd statement filtering test suite" ; + mf:entries ( + <#o1> + <#p1> + <#s1> + ) . + +<#o1> + a serd:TestFilterPositive ; + serd:patternFile <o1.pattern.nt> ; + mf:action <input.ttl> ; + mf:name "o1" ; + mf:result <o1.result.nt> . + +<#p1> + a serd:TestFilterPositive ; + serd:patternFile <p1.pattern.nt> ; + mf:action <input.ttl> ; + mf:name "p1" ; + mf:result <p1.result.nt> . + +<#s1> + a serd:TestFilterPositive ; + serd:patternFile <s1.pattern.nt> ; + mf:action <input.ttl> ; + mf:name "s1" ; + mf:result <s1.result.nt> . 
diff --git a/test/extra/filter/meson.build b/test/extra/filter/meson.build new file mode 100644 index 00000000..f03c7130 --- /dev/null +++ b/test/extra/filter/meson.build @@ -0,0 +1,17 @@ +base_uri = 'http://drobilla.net/sw/serd/test/filter/' + +test( + 'filter', + run_filter_suite, + args: common_script_args + [ + '--pipe', + serd_pipe, + '--filter', + serd_filter, + files('manifest.ttl'), + base_uri, + ], + env: test_env, + suite: ['suite', 'extra'], + timeout: 240, +) diff --git a/test/extra/filter/o1.pattern.nt b/test/extra/filter/o1.pattern.nt new file mode 100644 index 00000000..41932fd7 --- /dev/null +++ b/test/extra/filter/o1.pattern.nt @@ -0,0 +1 @@ +?s ?p <http://example.org/o1> . diff --git a/test/extra/filter/o1.result.nt b/test/extra/filter/o1.result.nt new file mode 100644 index 00000000..e7b1e759 --- /dev/null +++ b/test/extra/filter/o1.result.nt @@ -0,0 +1,2 @@ +<http://example.org/s1> <http://example.org/p1> <http://example.org/o1> . +<http://example.org/s2> <http://example.org/p1> <http://example.org/o1> . diff --git a/test/extra/filter/p1.pattern.nt b/test/extra/filter/p1.pattern.nt new file mode 100644 index 00000000..fca20e94 --- /dev/null +++ b/test/extra/filter/p1.pattern.nt @@ -0,0 +1 @@ +?s <http://example.org/p1> ?o . diff --git a/test/extra/filter/p1.result.nt b/test/extra/filter/p1.result.nt new file mode 100644 index 00000000..e7b1e759 --- /dev/null +++ b/test/extra/filter/p1.result.nt @@ -0,0 +1,2 @@ +<http://example.org/s1> <http://example.org/p1> <http://example.org/o1> . +<http://example.org/s2> <http://example.org/p1> <http://example.org/o1> . diff --git a/test/extra/filter/s1.pattern.nt b/test/extra/filter/s1.pattern.nt new file mode 100644 index 00000000..f5b87db1 --- /dev/null +++ b/test/extra/filter/s1.pattern.nt @@ -0,0 +1 @@ +<http://example.org/s1> ?p ?o . 
diff --git a/test/extra/filter/s1.result.nt b/test/extra/filter/s1.result.nt new file mode 100644 index 00000000..023faf42 --- /dev/null +++ b/test/extra/filter/s1.result.nt @@ -0,0 +1,2 @@ +<http://example.org/s1> <http://example.org/p1> <http://example.org/o1> . +<http://example.org/s1> <http://example.org/p2> <http://example.org/o2> . diff --git a/test/meson.build b/test/meson.build index c186b6e4..6ca0e38b 100644 --- a/test/meson.build +++ b/test/meson.build @@ -1,6 +1,7 @@ # Copyright 2020-2023 David Robillard <d@drobilla.net> # SPDX-License-Identifier: 0BSD OR ISC +run_filter_suite = find_program('run_filter_suite.py') run_suite = find_program('run_suite.py') wrapper = meson.get_external_property('exe_wrapper', '') @@ -15,10 +16,12 @@ plot_script_paths = [ simple_script_paths = [ '../scripts/check_formatting.py', 'serd_test_util/__init__.py', + 'run_filter_suite.py', 'run_suite.py', 'test_base.py', 'test_empty.py', 'test_multifile.py', + 'test_patterns.py', 'test_quiet.py', 'test_stdin.py', 'test_write_error.py', @@ -127,6 +130,7 @@ unit_tests = [ 'canon', 'caret', 'env', + 'filter', 'free_null', 'log', 'node', @@ -177,6 +181,25 @@ if wrapper != '' endif simple_command_tests = { + 'filter': { + 'bad': [ + ['-B', 'unknown'], + ['-F', '', '-G', ''], + ['-F', '?s ?p ?o . ?q ?r ?s .', '-s', ''], + ['-F', '?s ?p ?o .\n?q ?r ?s .\n', '-s', ''], + ['-F', 'bad_pattern', '-s', ''], + ['-F'], + ['-G', '?s ?p ?o . 
?q ?r ?s .', '-s', ''], + ['-G', 'bad_pattern', '-s', ''], + ['-G'], + ['-f', '/no/such/file.nt', '-'], + ['-z'], + ], + 'good': [ + ['-V'], + ['-h'], + ], + }, 'pipe': { 'bad': [ ['-B', 'nonuriorpath'], @@ -206,6 +229,24 @@ simple_command_tests = { }, } +foreach tool, tests : simple_command_tests + tool_var_name = 'serd_' + tool + if is_variable(tool_var_name) + foreach kind, cases : tests + foreach args : cases + test( + ' '.join(args).substring(1).underscorify(), + get_variable(tool_var_name), + args: args, + env: test_env, + should_fail: kind == 'bad', + suite: ['tools', tool, 'options'], + ) + endforeach + endforeach + endif +endforeach + if is_variable('serd_pipe') pipe_script_args = common_script_args + ['--tool', serd_pipe] serd_ttl = files('../serd.ttl')[0] @@ -218,19 +259,6 @@ if is_variable('serd_pipe') cmd_suite = ['tools', 'pipe', 'options'] - foreach kind, cases : simple_command_tests['pipe'] - foreach args : cases - test( - ' '.join(args).substring(1).underscorify(), - serd_pipe, - args: args, - env: test_env, - should_fail: kind == 'bad', - suite: cmd_suite, - ) - endforeach - endforeach - # Base URI options test( @@ -405,6 +433,62 @@ if is_variable('serd_pipe') endif endif +# Test specifics to serd-filter +if is_variable('serd_filter') + tool = serd_filter + filter_script_args = common_script_args + ['--tool', serd_filter] + + # Command line options + + test( + 'garbage_pattern', + tool, + args: ['junk', serd_ttl], + env: test_env, + should_fail: true, + suite: ['tools', 'filter', 'options'], + ) + test( + 'multiple_patterns', + tool, + args: ['?s ?p ?o .\n?t ?u ?v .\n', serd_ttl], + env: test_env, + should_fail: true, + suite: ['tools', 'filter', 'output'], + ) + test( + 'missing_output', + tool, + args: ['-o', '/does/not/exist.ttl', '?s ?p ?o .', serd_ttl], + env: test_env, + should_fail: true, + suite: ['tools', 'filter', 'output'], + ) + + # Different input sources + + test( + 'filter_dir', + tool, + args: ['?s ?p ?o .', serd_src_root], + env: 
test_env, + should_fail: true, + suite: ['tools', 'filter', 'input'], + ) + + # Filtering + + test( + 'patterns', + files('test_patterns.py'), + args: filter_script_args, + env: test_env, + suite: ['tools'], + ) + + subdir('extra/filter') +endif + ########################### # Data-Driven Test Suites # ########################### diff --git a/test/run_filter_suite.py b/test/run_filter_suite.py new file mode 100755 index 00000000..a1134538 --- /dev/null +++ b/test/run_filter_suite.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +# Copyright 2022-2023 David Robillard <d@drobilla.net> +# SPDX-License-Identifier: ISC + +"""Run the RDF-based test suite for serd-filter.""" + +# pylint: disable=duplicate-code + +import argparse +import os +import shlex +import subprocess +import sys +import tempfile + +import serd_test_util as util + +NS_MF = "http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#" +NS_RDFT = "http://www.w3.org/ns/rdftest#" +NS_SERD = "http://drobilla.net/ns/serd#" + + +def run_entry(entry, filter_command, out_dir, suite_dir): + """Run a single test entry from the manifest.""" + + pattern_path = util.file_path(suite_dir, entry[NS_SERD + "patternFile"][0]) + in_path = util.file_path(suite_dir, entry[NS_MF + "action"][0]) + good_path = util.file_path(suite_dir, entry[NS_MF + "result"][0]) + out_path = os.path.join(out_dir, os.path.basename(good_path)) + + # Run the command to write the output file + options = ["-f", pattern_path, "-o", out_path] + command = filter_command + options + [in_path] + subprocess.run(command, check=True) + + # Check that the filtered output matches the expected result + return util.file_equals(good_path, out_path) + + +def run_suite(manifest_path, base_uri, filter_command, pipe_command, out_dir): + """Run all tests in the manifest.""" + + # Load manifest model + suite_dir = os.path.dirname(manifest_path) + load_command = pipe_command + ["-B", base_uri] + filter_command = filter_command + ["-B", base_uri] + model, instances = 
util.load_rdf(load_command, manifest_path) + + # Run all filter tests in the test suite + results = util.Results() + for klass, instances in instances.items(): + if klass != "http://drobilla.net/ns/serd#TestFilterPositive": + continue + + for instance in instances: + try: + entry = model[instance] + results.check( + run_entry(entry, filter_command, out_dir, suite_dir) + ) + + except subprocess.CalledProcessError as exception: + if exception.stderr is not None: + sys.stderr.write(exception.stderr.decode("utf-8")) + + results.check(False, str(exception)) + + return util.print_result_summary(results) + + +def main(): + """Run the filter test suite via the command line tools.""" + + parser = argparse.ArgumentParser( + usage="%(prog)s [OPTION]... MANIFEST BASE_URI -- [ARG]...", + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument( + "--pipe", default="tools/serd-pipe", help="pipe executable" + ) + + parser.add_argument( + "--filter", default="tools/serd-filter", help="filter executable" + ) + + parser.add_argument("--wrapper", default="", help="executable wrapper") + parser.add_argument("manifest", help="test suite manifest.ttl file") + parser.add_argument("base_uri", help="base URI for tests") + + args = parser.parse_args(sys.argv[1:]) + wrapper_prefix = shlex.split(args.wrapper) + filter_command = wrapper_prefix + [args.filter] + pipe_command = wrapper_prefix + [args.pipe] + + with tempfile.TemporaryDirectory() as test_out_dir: + return run_suite( + args.manifest, + args.base_uri, + filter_command, + pipe_command, + test_out_dir, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/run_suite.py b/test/run_suite.py index 52a418ef..463e40c4 100755 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -125,7 +125,6 @@ def run_suite(args, command, out_dir): # Run test and record result passed = run_entry(args, entry, command, out_dir, top) - results.check(passed) # Write test report entry if 
args.report: @@ -137,13 +136,13 @@ def run_suite(args, command, out_dir): if exception.stderr is not None: sys.stderr.write(exception.stderr) - results.check(False, str(exception) + "\n") + results.check(passed) return util.print_result_summary(results) def main(): - """Run the command line tool.""" + """Run the test suite via the command line tool.""" parser = argparse.ArgumentParser( usage="%(prog)s [OPTION]... MANIFEST BASE_URI -- [ARG]...", diff --git a/test/serd_test_util/__init__.py b/test/serd_test_util/__init__.py index 5f0e0033..a75bb2ae 100644 --- a/test/serd_test_util/__init__.py +++ b/test/serd_test_util/__init__.py @@ -182,3 +182,16 @@ def lines_equal(from_lines, to_lines, from_filename, to_filename): same = False return same + + +def file_equals(patha, pathb): + """Return true if the file at patha is the same as the file at pathb.""" + + for path in (patha, pathb): + if not os.access(path, os.F_OK): + error("missing file {}\n".format(path)) + return False + + with open(patha, "r", encoding="utf-8") as fa: + with open(pathb, "r", encoding="utf-8") as fb: + return lines_equal(fa.readlines(), fb.readlines(), patha, pathb) diff --git a/test/test_filter.c b/test/test_filter.c new file mode 100644 index 00000000..652adb48 --- /dev/null +++ b/test/test_filter.c @@ -0,0 +1,61 @@ +// Copyright 2021 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#undef NDEBUG + +#include "failing_allocator.h" + +#include "serd/filter.h" +#include "serd/node.h" +#include "serd/nodes.h" +#include "serd/sink.h" +#include "serd/world.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> + +static void +test_new_failed_alloc(void) +{ + const SerdNodeArgs s_args = serd_a_uri_string("http://example.org/s"); + const SerdNodeArgs p_args = serd_a_uri_string("http://example.org/p"); + const SerdNodeArgs o_args = serd_a_uri_string("http://example.org/o"); + const SerdNodeArgs g_args = serd_a_uri_string("http://example.org/g"); + + SerdFailingAllocator 
allocator = serd_failing_allocator(); + + SerdWorld* const world = serd_world_new(&allocator.base); + SerdNodes* const nodes = serd_nodes_new(&allocator.base); + + const SerdNode* const s = serd_nodes_get(nodes, s_args); + const SerdNode* const p = serd_nodes_get(nodes, p_args); + const SerdNode* const o = serd_nodes_get(nodes, o_args); + const SerdNode* const g = serd_nodes_get(nodes, g_args); + + SerdSink* target = serd_sink_new(&allocator.base, NULL, NULL, NULL); + const size_t n_setup_allocs = allocator.n_allocations; + + // Successfully allocate a filter to count the number of allocations + SerdSink* filter = serd_filter_new(world, target, s, p, o, g, true); + assert(filter); + + // Test that each allocation failing is handled gracefully + const size_t n_new_allocs = allocator.n_allocations - n_setup_allocs; + for (size_t i = 0; i < n_new_allocs; ++i) { + allocator.n_remaining = i; + assert(!serd_filter_new(world, target, s, p, o, g, true)); + } + + serd_sink_free(filter); + serd_sink_free(target); + serd_nodes_free(nodes); + serd_world_free(world); +} + +int +main(void) +{ + test_new_failed_alloc(); + return 0; +} diff --git a/test/test_patterns.py b/test/test_patterns.py new file mode 100755 index 00000000..50571a92 --- /dev/null +++ b/test/test_patterns.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +# Copyright 2021-2023 David Robillard <d@drobilla.net> +# SPDX-License-Identifier: ISC + +"""Test filtering statements inclusively and exclusively.""" + +import serd_test_util as util + +DOCS = { + "ntriples": """ +<http://example.org/s> <http://example.org/p> <http://example.org/o> . +<http://example.org/N> <http://example.org/I> <http://example.org/L> . +""", + "nquads": """ +<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> . +<urn:example:N> <urn:example:U> <urn:example:L> <urn:example:L> . 
+""", +} + +args = util.wrapper_args(__doc__) + + +def check_pattern(syntax, pattern, expected_inclusive, expected_exclusive): + """Run a check with an exclusive pattern.""" + + command = [args.tool, "-I", syntax, pattern] + inclusive = util.command_output(args.wrapper, command, DOCS[syntax]) + assert inclusive == expected_inclusive + + command = [args.tool, "-I", syntax, "-v", pattern] + exclusive = util.command_output(args.wrapper, command, DOCS[syntax]) + assert exclusive == expected_exclusive + + +check_pattern( + "ntriples", + "?s <http://example.org/p> <http://example.org/o> .", + "<http://example.org/s> <http://example.org/p> <http://example.org/o> .\n", + "<http://example.org/N> <http://example.org/I> <http://example.org/L> .\n", +) + +check_pattern( + "ntriples", + "<http://example.org/s> ?p <http://example.org/o> .", + "<http://example.org/s> <http://example.org/p> <http://example.org/o> .\n", + "<http://example.org/N> <http://example.org/I> <http://example.org/L> .\n", +) + +check_pattern( + "ntriples", + "<http://example.org/s> <http://example.org/p> ?o .", + "<http://example.org/s> <http://example.org/p> <http://example.org/o> .\n", + "<http://example.org/N> <http://example.org/I> <http://example.org/L> .\n", +) + +check_pattern( + "nquads", + "?s <urn:example:p> <urn:example:o> <urn:example:g> .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", + "<urn:example:N> <urn:example:U> <urn:example:L> <urn:example:L> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> ?p <urn:example:o> <urn:example:g> .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", + "<urn:example:N> <urn:example:U> <urn:example:L> <urn:example:L> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> <urn:example:p> ?o <urn:example:g> .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", + "<urn:example:N> <urn:example:U> <urn:example:L> <urn:example:L> .\n", +) + +check_pattern( + "nquads", + 
"<urn:example:s> <urn:example:p> <urn:example:o> ?g .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", + "<urn:example:N> <urn:example:U> <urn:example:L> <urn:example:L> .\n", +) diff --git a/test/test_string.c b/test/test_string.c index 5205cc9c..2d16936a 100644 --- a/test/test_string.c +++ b/test/test_string.c @@ -14,7 +14,7 @@ test_strerror(void) { const char* msg = serd_strerror(SERD_SUCCESS); assert(!strcmp(msg, "Success")); - for (int i = SERD_FAILURE; i <= SERD_BAD_LITERAL; ++i) { + for (int i = SERD_FAILURE; i <= SERD_BAD_PATTERN; ++i) { msg = serd_strerror((SerdStatus)i); assert(strcmp(msg, "Success")); } diff --git a/tools/meson.build b/tools/meson.build index ce82b212..43902c74 100644 --- a/tools/meson.build +++ b/tools/meson.build @@ -8,6 +8,15 @@ if get_option('static') and cc.get_id() != 'msvc' tool_link_args += ['-static'] endif +serd_filter = executable( + 'serd-filter', + files('console.c', 'serd-filter.c'), + c_args: tool_c_args, + dependencies: [serd_dep, zix_dep], + install: true, + link_args: tool_link_args, +) + serd_pipe = executable( 'serd-pipe', files('console.c', 'serd-pipe.c'), @@ -17,4 +26,5 @@ serd_pipe = executable( link_args: tool_link_args, ) +meson.override_find_program('serd-filter', serd_filter) meson.override_find_program('serd-pipe', serd_pipe) diff --git a/tools/serd-filter.c b/tools/serd-filter.c new file mode 100644 index 00000000..01834e5a --- /dev/null +++ b/tools/serd-filter.c @@ -0,0 +1,317 @@ +// Copyright 2011-2021 David Robillard <d@drobilla.net> +// SPDX-License-Identifier: ISC + +#include "console.h" + +#include "serd/attributes.h" +#include "serd/env.h" +#include "serd/event.h" +#include "serd/filter.h" +#include "serd/input_stream.h" +#include "serd/log.h" +#include "serd/memory.h" +#include "serd/node.h" +#include "serd/nodes.h" +#include "serd/reader.h" +#include "serd/sink.h" +#include "serd/statement.h" +#include "serd/status.h" +#include "serd/string_view.h" +#include "serd/syntax.h" 
+#include "serd/world.h" +#include "serd/writer.h" + +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +/* Application (after parsing command-line arguments) */ + +// All options +typedef struct { + SerdCommonOptions common; + const char* pattern; + const char* pattern_file; + char* const* inputs; + intptr_t n_inputs; + bool invert; +} Options; + +// A single statement pattern +typedef struct { + SerdNode* s; + SerdNode* p; + SerdNode* o; + SerdNode* g; +} FilterPattern; + +// Context for the pattern event callback +typedef struct { + SerdAllocator* allocator; + FilterPattern pattern; +} PatternEventContext; + +// Handler for events read from a pattern +static SerdStatus +on_pattern_event(void* const handle, const SerdEvent* const event) +{ + PatternEventContext* const ctx = (PatternEventContext*)handle; + SerdAllocator* const allocator = ctx->allocator; + + if (event->type == SERD_STATEMENT) { + FilterPattern* const pat = &ctx->pattern; + if (pat->s) { + return SERD_BAD_PATTERN; + } + + const SerdStatement* const statement = event->statement.statement; + pat->s = serd_node_copy(allocator, serd_statement_subject(statement)); + pat->p = serd_node_copy(allocator, serd_statement_predicate(statement)); + pat->o = serd_node_copy(allocator, serd_statement_object(statement)); + pat->g = serd_node_copy(allocator, serd_statement_graph(statement)); + } + + return SERD_SUCCESS; +} + +// Parse a pattern from some input and return a new filter for it +static SerdSink* +parse_pattern(SerdWorld* const world, + const SerdSink* const sink, + SerdInputStream* const in, + const bool inclusive) +{ + SerdAllocator* const allocator = serd_world_allocator(world); + SerdEnv* const env = serd_env_new(allocator, serd_empty_string()); + PatternEventContext ctx = {allocator, {NULL, NULL, NULL, NULL}}; + + SerdSink* in_sink = serd_sink_new(allocator, &ctx, on_pattern_event, NULL); + SerdReader* reader = + serd_reader_new(world, 
SERD_NQUADS, SERD_READ_VARIABLES, env, in_sink); + + const SerdNode* pattern_name = + serd_nodes_get(serd_world_nodes(world), serd_a_string("pattern")); + + SerdStatus st = serd_reader_start(reader, in, pattern_name, 1); + if (!st) { + st = serd_reader_read_document(reader); + } + + serd_close_input(in); + serd_reader_free(reader); + serd_env_free(env); + serd_sink_free(in_sink); + + if (st) { + serd_logf(world, + SERD_LOG_LEVEL_ERROR, + "failed to parse pattern (%s)", + serd_strerror(st)); + return NULL; + } + + SerdSink* filter = serd_filter_new(world, + sink, + ctx.pattern.s, + ctx.pattern.p, + ctx.pattern.o, + ctx.pattern.g, + inclusive); + + serd_node_free(allocator, ctx.pattern.s); + serd_node_free(allocator, ctx.pattern.p); + serd_node_free(allocator, ctx.pattern.o); + serd_node_free(allocator, ctx.pattern.g); + return filter; +} + +SERD_LOG_FUNC(2, 3) +static SerdStatus +log_error(SerdWorld* const world, const char* const fmt, ...) +{ + va_list args; + va_start(args, fmt); + + const SerdLogField file = {"SERD_FILE", "serd-filter"}; + const SerdStatus st = + serd_vxlogf(world, SERD_LOG_LEVEL_ERROR, 1, &file, fmt, args); + + va_end(args); + return st; +} + +// Run the tool using the given options +static SerdStatus +run(Options opts) +{ + SerdTool app = {{NULL, NULL, NULL, NULL}, NULL, NULL, NULL}; + + // Set up the writing environment + SerdStatus st = SERD_SUCCESS; + if ((st = serd_tool_setup(&app, "serd-filter", opts.common))) { + serd_tool_cleanup(app); + return st; + } + + const SerdSink* const target = serd_writer_sink(app.writer); + + // Open the pattern input (either a string or filename) + SerdInputStream pattern = {NULL, NULL, NULL, NULL}; + const char* position = opts.pattern; + if (opts.pattern) { + pattern = serd_open_input_string(&position); + } else if (opts.pattern_file) { + pattern = serd_open_input_file(opts.pattern_file); + } + + if (!pattern.stream) { + log_error(app.world, "failed to open pattern"); + return SERD_BAD_STREAM; + } + + // 
Set up the output pipeline: filter -> writer + SerdSink* const filter = + parse_pattern(app.world, target, &pattern, !opts.invert); + if (!filter) { + log_error(app.world, "failed to set up filter"); + return SERD_UNKNOWN_ERROR; + } + + serd_close_input(&pattern); + + // Read all the inputs, which drives the writer to emit the output + if (!(st = serd_read_inputs(app.world, + opts.common, + app.env, + opts.n_inputs, + opts.inputs, + filter))) { + st = serd_writer_finish(app.writer); + } + + if (st) { + log_error(app.world, "failed to read input (%s)", serd_strerror(st)); + } + + serd_sink_free(filter); + + const SerdStatus cst = serd_tool_cleanup(app); + return st ? st : cst; +} + +/* Command-line interface (before setting up serd) */ + +static int +print_usage(const char* const name, const bool error) +{ + static const char* const description = + "Search for statements matching PATTERN in each INPUT.\n" + "INPUT can be a local filename, or \"-\" to read from standard input.\n\n" + " -B BASE_URI Base URI or path for resolving relative references.\n" + " -I SYNTAX Input syntax turtle/ntriples/trig/nquads, or option\n" + " lax/variables/relative/global/generated.\n" + " -O SYNTAX Output syntax empty/turtle/ntriples/nquads, or option\n" + " ascii/expanded/verbatim/terse/lax.\n" + " -V Display version information and exit.\n" + " -f PATTERN_FILE Read pattern from PATTERN_FILE instead.\n" + " -h Display this help and exit.\n" + " -k BYTES Parser stack size.\n" + " -o FILENAME Write output to FILENAME instead of stdout.\n" + " -v Invert filter to select non-matching statements.\n"; + + FILE* const os = error ? stderr : stdout; + fprintf(os, "%s", error ? "\n" : ""); + fprintf(os, "Usage: %s [OPTION]... PATTERN [INPUT]...\n", name); + fprintf(os, " %s [OPTION]... -f PATTERN_FILE [INPUT]...\n", name); + fprintf(os, "\n%s", description); + return error ? 
EXIT_FAILURE : EXIT_SUCCESS; +} + +// Parse the option pointed to by `iter`, and advance it to the next one +static SerdStatus +parse_option(OptionIter* const iter, Options* const opts) +{ +#define ARG_ERRORF(fmt, ...) \ + fprintf(stderr, "%s: " fmt, iter->argv[0], __VA_ARGS__) + + SerdStatus st = serd_parse_common_option(iter, &opts->common); + if (st != SERD_FAILURE) { + return st; + } + + const char opt = iter->argv[iter->a][iter->f]; + + switch (opt) { + case 'V': + return serd_print_version("serd-filter"); + + case 'f': + return serd_get_argument(iter, &opts->pattern_file); + + case 'h': + print_usage(iter->argv[0], false); + return SERD_FAILURE; + + case 'v': + opts->invert = true; + return serd_option_iter_advance(iter); + + default: + break; + } + + ARG_ERRORF("invalid option -- '%c'\n", opt); + return SERD_BAD_ARG; + +#undef ARG_ERRORF +} + +int +main(int argc, char** argv) +{ + char default_input[] = "-"; + char* default_inputs[] = {default_input}; + + Options opts = {{"", + NULL, + 4096U, + 1048576U, + {SERD_SYNTAX_EMPTY, 0U, false}, + {SERD_NQUADS, 0U, false}}, + NULL, + NULL, + NULL, + 0U, + false}; + + // Parse all command line options (which must precede inputs) + SerdStatus st = SERD_SUCCESS; + OptionIter iter = {argv, argc, 1, 1}; + while (!serd_option_iter_is_end(iter)) { + if ((st = parse_option(&iter, &opts))) { + return (st == SERD_FAILURE) ? 0 : print_usage(argv[0], true); + } + } + + // If -f isn't used, then the first positional argument is the pattern + if (!opts.pattern_file) { + opts.pattern = argv[iter.a++]; + } + + // Every argument past that is an input + opts.inputs = argv + iter.a; + opts.n_inputs = argc - iter.a; + if (opts.n_inputs == 0) { + opts.n_inputs = 1; + opts.inputs = default_inputs; + } + + // Don't add prefixes to blank node labels if there is only one input + if (opts.n_inputs == 1) { + opts.common.input.flags |= SERD_READ_GLOBAL; + } + + return run(opts) > SERD_FAILURE; +} |