diff options
-rw-r--r-- | doc/serdi.1 | 28 | ||||
-rw-r--r-- | include/serd/serd.h | 35 | ||||
-rw-r--r-- | meson.build | 1 | ||||
-rw-r--r-- | src/filter.c | 102 | ||||
-rw-r--r-- | src/serdi.c | 108 | ||||
-rw-r--r-- | test/meson.build | 18 | ||||
-rwxr-xr-x | test/test_filter.py | 94 | ||||
-rwxr-xr-x | test/test_grep.py | 94 |
8 files changed, 476 insertions, 4 deletions
diff --git a/doc/serdi.1 b/doc/serdi.1 index 59eb67ce..2a110785 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -3,10 +3,11 @@ .Os Serd 0.30.11 .Sh NAME .Nm serdi -.Nd read, transform, and write RDF data +.Nd read, filter, transform, and write RDF data .Sh SYNOPSIS .Nm serdi .Op Fl Cabefhlmqtvx +.Op Fl F Ar pattern | Fl G Ar pattern .Op Fl I Ar base .Op Fl c Ar prefix .Op Fl i Ar syntax @@ -47,6 +48,23 @@ All numeric datatypes are supported, as well as and .Vt base64Binary . .Pp +.It Fl F Ar pattern +Filter out statements that match +.Ar pattern . +The pattern must be a single statement written in NTriples or NQuads, +with variables like +.Dq ?name +for wildcards. +The names of variables in the pattern are insignificant. +.Pp +.It Fl G Ar pattern +Only include statements that match +.Ar pattern . +This option is like +.Fl F +but inverted, +so that only matching statements are included, like grep. +.Pp .It Fl I Ar base Input base URI. Relative URI references in the input will be resolved against this. @@ -180,6 +198,14 @@ To pretty-print a document: To print any errors: .Pp .Dl $ serdi file.ttl > /dev/null +.Pp +To remove any rdf:type properties: +.Pp +.Dl $ serdi -F \(dq?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?o .\(dq file.ttl +.Pp +To include only rdf:type properties: +.Pp +.Dl $ serdi -G \(dq?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?o .\(dq file.ttl .Sh SEE ALSO .Bl -item -compact .It diff --git a/include/serd/serd.h b/include/serd/serd.h index fad9d070..5edb4b2f 100644 --- a/include/serd/serd.h +++ b/include/serd/serd.h @@ -1966,6 +1966,41 @@ serd_canon_new(const SerdWorld* SERD_NULLABLE world, /** @} + @defgroup serd_filter Filter + @{ +*/ + +/** + Return a new sink that filters out statements that do not match a pattern. + + The returned sink acts like `target` in all respects, except that some + statements may be dropped. + + @param target The target sink to pass the filtered data to. 
+ + @param subject The optional subject of the filter pattern. + + @param predicate The optional predicate of the filter pattern. + + @param object The optional object of the filter pattern. + + @param graph The optional graph of the filter pattern. + + @param inclusive If true, then only statements that match the pattern are + passed through. Otherwise, only statements that do *not* match the pattern + are passed through. +*/ +SERD_API +SerdSink* SERD_ALLOCATED +serd_filter_new(const SerdSink* SERD_NONNULL target, + const SerdNode* SERD_NULLABLE subject, + const SerdNode* SERD_NULLABLE predicate, + const SerdNode* SERD_NULLABLE object, + const SerdNode* SERD_NULLABLE graph, + bool inclusive); + +/** + @} @defgroup serd_env Environment @{ */ diff --git a/meson.build b/meson.build index 25cce6d0..9ae07fa1 100644 --- a/meson.build +++ b/meson.build @@ -92,6 +92,7 @@ sources = [ 'src/cursor.c', 'src/describe.c', 'src/env.c', + 'src/filter.c', 'src/inserter.c', 'src/log.c', 'src/model.c', diff --git a/src/filter.c b/src/filter.c new file mode 100644 index 00000000..6d5e5a04 --- /dev/null +++ b/src/filter.c @@ -0,0 +1,102 @@ +/* + Copyright 2019-2020 David Robillard <d@drobilla.net> + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+*/ + +#include "serd/serd.h" + +#include <stdbool.h> +#include <stdlib.h> + +typedef struct { + const SerdSink* target; + SerdNode* subject; + SerdNode* predicate; + SerdNode* object; + SerdNode* graph; + bool inclusive; +} SerdFilterData; + +static void +free_data(void* const handle) +{ + if (handle) { + SerdFilterData* data = (SerdFilterData*)handle; + + serd_node_free(data->subject); + serd_node_free(data->predicate); + serd_node_free(data->object); + serd_node_free(data->graph); + free(data); + } +} + +static SerdStatus +serd_filter_on_event(void* const handle, const SerdEvent* const event) +{ + const SerdFilterData* const data = (SerdFilterData*)handle; + + if (event->type == SERD_STATEMENT) { + const bool matches = serd_statement_matches(event->statement.statement, + data->subject, + data->predicate, + data->object, + data->graph); + + if (data->inclusive == matches) { + // Emit statement with reset flags to avoid confusing the writer + SerdEvent out_event = *event; + out_event.statement.flags = 0u; + return serd_sink_write_event(data->target, &out_event); + } + + return SERD_SUCCESS; // Skip statement + } + + return event->type == SERD_END ? 
SERD_SUCCESS + : serd_sink_write_event(data->target, event); +} + +SerdSink* +serd_filter_new(const SerdSink* const target, + const SerdNode* const subject, + const SerdNode* const predicate, + const SerdNode* const object, + const SerdNode* const graph, + const bool inclusive) +{ + SerdFilterData* const data = + (SerdFilterData*)calloc(1, sizeof(SerdFilterData)); + + data->target = target; + data->inclusive = inclusive; + + if (subject && serd_node_type(subject) != SERD_VARIABLE) { + data->subject = serd_node_copy(subject); + } + + if (predicate && serd_node_type(predicate) != SERD_VARIABLE) { + data->predicate = serd_node_copy(predicate); + } + + if (object && serd_node_type(object) != SERD_VARIABLE) { + data->object = serd_node_copy(object); + } + + if (graph && serd_node_type(graph) != SERD_VARIABLE) { + data->graph = serd_node_copy(graph); + } + + return serd_sink_new(data, serd_filter_on_event, free_data); +} diff --git a/src/serdi.c b/src/serdi.c index 243be98e..7c664b72 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -37,6 +37,13 @@ #define SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg) #define SERDI_ERRORF(fmt, ...) 
fprintf(stderr, "serdi: " fmt, __VA_ARGS__) +typedef struct { + SerdNode* s; + SerdNode* p; + SerdNode* o; + SerdNode* g; +} FilterPattern; + static int print_version(void) { @@ -57,6 +64,8 @@ print_usage(const char* const name, const bool error) fprintf(os, "Read and write RDF syntax.\n"); fprintf(os, "Use - for INPUT to read from standard input.\n\n"); fprintf(os, " -C Convert literals to canonical form.\n"); + fprintf(os, " -F PATTERN Filter out statements that match PATTERN.\n"); + fprintf(os, " -G PATTERN Only include statements matching PATTERN.\n"); fprintf(os, " -I BASE_URI Input base URI.\n"); fprintf(os, " -a Write ASCII output if possible.\n"); fprintf(os, " -b Fast bulk output for large serialisations.\n"); @@ -88,6 +97,62 @@ missing_arg(const char* const name, const char opt) } static SerdStatus +on_filter_event(void* const handle, const SerdEvent* const event) +{ + if (event->type == SERD_STATEMENT) { + FilterPattern* const pat = (FilterPattern*)handle; + if (pat->s) { + return SERD_ERR_INVALID; + } + + const SerdStatement* const statement = event->statement.statement; + pat->s = serd_node_copy(serd_statement_subject(statement)); + pat->p = serd_node_copy(serd_statement_predicate(statement)); + pat->o = serd_node_copy(serd_statement_object(statement)); + pat->g = serd_node_copy(serd_statement_graph(statement)); + } + + return SERD_SUCCESS; +} + +static SerdSink* +parse_filter(SerdWorld* const world, + const SerdSink* const sink, + const char* const str, + const bool inclusive) +{ + SerdEnv* const env = serd_env_new(SERD_EMPTY_STRING()); + FilterPattern pat = {NULL, NULL, NULL, NULL}; + SerdSink* in_sink = serd_sink_new(&pat, on_filter_event, NULL); + SerdByteSource* byte_source = serd_byte_source_new_string(str, NULL); + SerdReader* reader = serd_reader_new( + world, SERD_NQUADS, SERD_READ_VARIABLES, env, in_sink, 4096); + + SerdStatus st = serd_reader_start(reader, byte_source); + if (!st) { + st = serd_reader_read_document(reader); + } + + 
serd_reader_free(reader); + serd_env_free(env); + serd_byte_source_free(byte_source); + serd_sink_free(in_sink); + + if (st) { + return NULL; + } + + SerdSink* filter = + serd_filter_new(sink, pat.s, pat.p, pat.o, pat.g, inclusive); + + serd_node_free(pat.s); + serd_node_free(pat.p); + serd_node_free(pat.o); + serd_node_free(pat.g); + return filter; +} + +static SerdStatus read_file(SerdWorld* const world, SerdSyntax syntax, const SerdReaderFlags flags, @@ -158,6 +223,8 @@ main(int argc, char** argv) bool quiet = false; size_t stack_size = 4194304; const char* input_string = NULL; + const char* in_pattern = NULL; + const char* out_pattern = NULL; const char* add_prefix = ""; const char* chop_prefix = NULL; const char* root_uri = NULL; @@ -197,6 +264,20 @@ main(int argc, char** argv) return print_version(); } else if (opt == 'x') { reader_flags |= SERD_READ_VARIABLES; + } else if (argv[a][1] == 'F') { + if (++a == argc) { + return missing_arg(argv[0], 'F'); + } + + out_pattern = argv[a]; + break; + } else if (argv[a][1] == 'G') { + if (++a == argc) { + return missing_arg(argv[0], 'g'); + } + + in_pattern = argv[a]; + break; } else if (argv[a][1] == 'I') { if (++a == argc) { return missing_arg(prog, 'I'); @@ -280,6 +361,11 @@ main(int argc, char** argv) } } + if (in_pattern && out_pattern) { + SERDI_ERROR("only one of -F and -G can be given at once\n"); + return 1; + } + if (a == argc && !input_string) { SERDI_ERROR("missing input\n"); return 1; @@ -370,6 +456,23 @@ main(int argc, char** argv) sink = canon = serd_canon_new(world, out_sink, reader_flags); } + SerdSink* filter = NULL; + if (in_pattern) { + if (!(filter = parse_filter(world, sink, in_pattern, true))) { + SERDI_ERROR("error parsing inclusive filter pattern\n"); + return EXIT_FAILURE; + } + + sink = filter; + } else if (out_pattern) { + if (!(filter = parse_filter(world, sink, out_pattern, false))) { + SERDI_ERROR("error parsing exclusive filter pattern\n"); + return EXIT_FAILURE; + } + + sink = filter; + 
} + if (quiet) { serd_set_log_func(world, serd_quiet_log_func, NULL); } @@ -382,8 +485,7 @@ main(int argc, char** argv) serd_writer_chop_blank_prefix(writer, chop_prefix); - SerdStatus st = SERD_SUCCESS; - SerdNode* input_name = NULL; + SerdStatus st = SERD_SUCCESS; if (input_string) { SerdByteSource* const byte_source = serd_byte_source_new_string(input_string, NULL); @@ -467,10 +569,10 @@ main(int argc, char** argv) } serd_sink_free(canon); + serd_sink_free(filter); serd_sink_free(inserter); serd_model_free(model); serd_writer_free(writer); - serd_node_free(input_name); serd_env_free(env); serd_node_free(base); serd_world_free(world); diff --git a/test/meson.build b/test/meson.build index e8392559..67907c25 100644 --- a/test/meson.build +++ b/test/meson.build @@ -77,6 +77,14 @@ if get_option('utils') bad_args = [ ['/no/such/file'], ['ftp://unsupported.org'], + ['-F', '', '-G', ''], + ['-F'], + ['-F', '?s ?p ?o . ?q ?r ?s .', '-s', ''], + ['-F', '?s ?p ?o .\n?q ?r ?s .\n', '-s', ''], + ['-F', 'bad_pattern', '-s', ''], + ['-G'], + ['-G', '?s ?p ?o . ?q ?r ?s .', '-s', ''], + ['-G', 'bad_pattern', '-s', ''], ['-I'], ['-c'], ['-i', 'unknown'], @@ -135,6 +143,16 @@ if get_option('utils') env: test_env, suite: ['serdi', 'options']) + test('filter', files('test_filter.py'), + args: script_args, + env: test_env, + suite: ['serdi', 'options']) + + test('grep', files('test_grep.py'), + args: script_args, + env: test_env, + suite: ['serdi', 'options']) + # Inputs test('stdin', files('test_stdin.py'), diff --git a/test/test_filter.py b/test/test_filter.py new file mode 100755 index 00000000..d44677f5 --- /dev/null +++ b/test/test_filter.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +"""Test filtering statements inclusively.""" + +import argparse +import sys +import shlex +import subprocess +import tempfile + +DOCUMENTS = { + "ntriples": """ + <urn:example:s> <urn:example:p> <urn:example:o> . + <urn:example:s> <urn:example:q> <urn:example:r> . 
+""", + "nquads": """ + <urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> . + <urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> . +""", +} + +parser = argparse.ArgumentParser(description=__doc__) + +parser.add_argument("--serdi", default="./serdi", help="path to serdi") +parser.add_argument("--wrapper", default="", help="executable wrapper") + +args = parser.parse_args(sys.argv[1:]) + + +def check_pattern(syntax, pattern, result): + command = shlex.split(args.wrapper) + [ + args.serdi, + "-i", + syntax, + "-F", + pattern, + "-s", + DOCUMENTS[syntax], + ] + + with tempfile.TemporaryFile() as out: + proc = subprocess.run( + command, + check=False, + encoding="utf-8", + capture_output=True, + ) + + assert proc.returncode == 0 + assert args.wrapper or len(proc.stderr) == 0 + assert proc.stdout == result + + +check_pattern( + "ntriples", + "?s <urn:example:p> <urn:example:o> .", + "<urn:example:s> <urn:example:q> <urn:example:r> .\n", +) + +check_pattern( + "ntriples", + "<urn:example:s> ?p <urn:example:o> .", + "<urn:example:s> <urn:example:q> <urn:example:r> .\n", +) + +check_pattern( + "ntriples", + "<urn:example:s> <urn:example:p> ?o .", + "<urn:example:s> <urn:example:q> <urn:example:r> .\n", +) + +check_pattern( + "nquads", + "?s <urn:example:p> <urn:example:o> <urn:example:g> .", + "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> ?p <urn:example:o> <urn:example:g> .", + "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> <urn:example:p> ?o <urn:example:g> .", + "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> <urn:example:p> <urn:example:o> ?g .", + "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n", +) diff --git a/test/test_grep.py b/test/test_grep.py new file mode 100755 index 00000000..0c8c5228 --- 
/dev/null +++ b/test/test_grep.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +"""Test filtering statements exclusively.""" + +import argparse +import sys +import shlex +import subprocess +import tempfile + +DOCUMENTS = { + "ntriples": """ + <urn:example:s> <urn:example:p> <urn:example:o> . + <urn:example:s> <urn:example:q> <urn:example:r> . +""", + "nquads": """ + <urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> . + <urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> . +""", +} + +parser = argparse.ArgumentParser(description=__doc__) + +parser.add_argument("--serdi", default="./serdi", help="path to serdi") +parser.add_argument("--wrapper", default="", help="executable wrapper") + +args = parser.parse_args(sys.argv[1:]) + + +def check_pattern(syntax, pattern, result): + command = shlex.split(args.wrapper) + [ + args.serdi, + "-i", + syntax, + "-G", + pattern, + "-s", + DOCUMENTS[syntax], + ] + + with tempfile.TemporaryFile() as out: + proc = subprocess.run( + command, + check=False, + encoding="utf-8", + capture_output=True, + ) + + assert proc.returncode == 0 + assert args.wrapper or len(proc.stderr) == 0 + assert proc.stdout == result + + +check_pattern( + "ntriples", + "?s <urn:example:p> <urn:example:o> .", + "<urn:example:s> <urn:example:p> <urn:example:o> .\n", +) + +check_pattern( + "ntriples", + "<urn:example:s> ?p <urn:example:o> .", + "<urn:example:s> <urn:example:p> <urn:example:o> .\n", +) + +check_pattern( + "ntriples", + "<urn:example:s> <urn:example:p> ?o .", + "<urn:example:s> <urn:example:p> <urn:example:o> .\n", +) + +check_pattern( + "nquads", + "?s <urn:example:p> <urn:example:o> <urn:example:g> .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> ?p <urn:example:o> <urn:example:g> .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> <urn:example:p> ?o <urn:example:g> 
.", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", +) + +check_pattern( + "nquads", + "<urn:example:s> <urn:example:p> <urn:example:o> ?g .", + "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n", +) |