aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--doc/serdi.128
-rw-r--r--include/serd/serd.h35
-rw-r--r--meson.build1
-rw-r--r--src/filter.c102
-rw-r--r--src/serdi.c108
-rw-r--r--test/meson.build18
-rwxr-xr-xtest/test_filter.py94
-rwxr-xr-xtest/test_grep.py94
8 files changed, 476 insertions, 4 deletions
diff --git a/doc/serdi.1 b/doc/serdi.1
index 59eb67ce..2a110785 100644
--- a/doc/serdi.1
+++ b/doc/serdi.1
@@ -3,10 +3,11 @@
.Os Serd 0.30.11
.Sh NAME
.Nm serdi
-.Nd read, transform, and write RDF data
+.Nd read, filter, transform, and write RDF data
.Sh SYNOPSIS
.Nm serdi
.Op Fl Cabefhlmqtvx
+.Op Fl F Ar pattern | Fl G Ar pattern
.Op Fl I Ar base
.Op Fl c Ar prefix
.Op Fl i Ar syntax
@@ -47,6 +48,23 @@ All numeric datatypes are supported, as well as
and
.Vt base64Binary .
.Pp
+.It Fl F Ar pattern
+Filter out statements that match
+.Ar pattern .
+The pattern must be a single statement written in NTriples or NQuads,
+with variables like
+.Dq ?name
+for wildcards.
+The names of variables in the pattern are insignificant.
+.Pp
+.It Fl G Ar pattern
+Only include statements that match
+.Ar pattern .
+This option is like
+.Fl F
+but inverted,
+so that only matching statements are included, like grep.
+.Pp
.It Fl I Ar base
Input base URI.
Relative URI references in the input will be resolved against this.
@@ -180,6 +198,14 @@ To pretty-print a document:
To print any errors:
.Pp
.Dl $ serdi file.ttl > /dev/null
+.Pp
+To remove any rdf:type properties:
+.Pp
+.Dl $ serdi -F \(dq?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?o .\(dq file.ttl
+.Pp
+To include only rdf:type properties:
+.Pp
+.Dl $ serdi -G \(dq?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?o .\(dq file.ttl
.Sh SEE ALSO
.Bl -item -compact
.It
diff --git a/include/serd/serd.h b/include/serd/serd.h
index fad9d070..5edb4b2f 100644
--- a/include/serd/serd.h
+++ b/include/serd/serd.h
@@ -1966,6 +1966,41 @@ serd_canon_new(const SerdWorld* SERD_NULLABLE world,
/**
@}
+ @defgroup serd_filter Filter
+ @{
+*/
+
+/**
+   Return a new sink that passes or drops statements based on a pattern.
+
+   The returned sink forwards events to `target`, except that statements may
+   be dropped depending on whether they match the pattern (see `inclusive`).
+   Statements that pass are written with their flags cleared, and end events
+   are not forwarded.
+
+   The pattern nodes are copied, so they do not need to outlive the returned
+   sink.  A pattern node that is a variable is treated like a null one.  The
+   target is referenced, not copied.
+
+   @param target The target sink to pass the filtered data to.
+
+   @param subject The optional subject of the filter pattern.
+
+   @param predicate The optional predicate of the filter pattern.
+
+   @param object The optional object of the filter pattern.
+
+   @param graph The optional graph of the filter pattern.
+
+   @param inclusive If true, then only statements that match the pattern are
+   passed through.  Otherwise, only statements that do *not* match the pattern
+   are passed through.
+*/
+SERD_API
+SerdSink* SERD_ALLOCATED
+serd_filter_new(const SerdSink* SERD_NONNULL target,
+ const SerdNode* SERD_NULLABLE subject,
+ const SerdNode* SERD_NULLABLE predicate,
+ const SerdNode* SERD_NULLABLE object,
+ const SerdNode* SERD_NULLABLE graph,
+ bool inclusive);
+
+/**
+ @}
@defgroup serd_env Environment
@{
*/
diff --git a/meson.build b/meson.build
index 25cce6d0..9ae07fa1 100644
--- a/meson.build
+++ b/meson.build
@@ -92,6 +92,7 @@ sources = [
'src/cursor.c',
'src/describe.c',
'src/env.c',
+ 'src/filter.c',
'src/inserter.c',
'src/log.c',
'src/model.c',
diff --git a/src/filter.c b/src/filter.c
new file mode 100644
index 00000000..6d5e5a04
--- /dev/null
+++ b/src/filter.c
@@ -0,0 +1,102 @@
+/*
+ Copyright 2019-2020 David Robillard <d@drobilla.net>
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "serd/serd.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+typedef struct { // State of one filter sink, used as the sink's handle
+ const SerdSink* target; // Sink that events which pass the filter are written to
+ SerdNode* subject; // Owned copy of the pattern subject, NULL if absent or a variable
+ SerdNode* predicate; // Owned copy of the pattern predicate, NULL if absent or a variable
+ SerdNode* object; // Owned copy of the pattern object, NULL if absent or a variable
+ SerdNode* graph; // Owned copy of the pattern graph, NULL if absent or a variable
+ bool inclusive; // True to pass only matching statements, false to pass only the others
+} SerdFilterData;
+
+/// Free a filter's state along with the pattern nodes it owns
+static void
+free_data(void* const handle)
+{
+  SerdFilterData* const filter = (SerdFilterData*)handle;
+  if (!filter) {
+    return;
+  }
+
+  // The pattern nodes are copies made when the filter was created
+  serd_node_free(filter->subject);
+  serd_node_free(filter->predicate);
+  serd_node_free(filter->object);
+  serd_node_free(filter->graph);
+
+  free(filter);
+}
+
+/**
+   Handle an event for a filter sink.
+
+   Statements are written to the target only if their match result agrees
+   with the filter mode.  End events are dropped, and every other event is
+   written to the target unchanged.
+*/
+static SerdStatus
+serd_filter_on_event(void* const handle, const SerdEvent* const event)
+{
+  const SerdFilterData* const filter = (SerdFilterData*)handle;
+
+  if (event->type != SERD_STATEMENT) {
+    if (event->type == SERD_END) {
+      return SERD_SUCCESS;
+    }
+
+    return serd_sink_write_event(filter->target, event);
+  }
+
+  const bool is_match = serd_statement_matches(event->statement.statement,
+                                               filter->subject,
+                                               filter->predicate,
+                                               filter->object,
+                                               filter->graph);
+
+  if (filter->inclusive != is_match) {
+    return SERD_SUCCESS; // Skip statement
+  }
+
+  // Emit statement with reset flags to avoid confusing the writer
+  SerdEvent forwarded       = *event;
+  forwarded.statement.flags = 0u;
+  return serd_sink_write_event(filter->target, &forwarded);
+}
+
+/**
+   Create a sink that filters statements before passing them to `target`.
+
+   Each non-null pattern node is copied unless it is a variable, in which case
+   the corresponding field is left null.  The copies are owned by the
+   returned sink and released by free_data().
+
+   Returns NULL if the filter state can not be allocated.
+*/
+SerdSink*
+serd_filter_new(const SerdSink* const target,
+                const SerdNode* const subject,
+                const SerdNode* const predicate,
+                const SerdNode* const object,
+                const SerdNode* const graph,
+                const bool            inclusive)
+{
+  SerdFilterData* const data =
+    (SerdFilterData*)calloc(1, sizeof(SerdFilterData));
+
+  if (!data) {
+    return NULL; // Allocation failed, avoid dereferencing a null pointer
+  }
+
+  data->target    = target;
+  data->inclusive = inclusive;
+
+  // Fields that are not set here remain NULL from calloc()
+  if (subject && serd_node_type(subject) != SERD_VARIABLE) {
+    data->subject = serd_node_copy(subject);
+  }
+
+  if (predicate && serd_node_type(predicate) != SERD_VARIABLE) {
+    data->predicate = serd_node_copy(predicate);
+  }
+
+  if (object && serd_node_type(object) != SERD_VARIABLE) {
+    data->object = serd_node_copy(object);
+  }
+
+  if (graph && serd_node_type(graph) != SERD_VARIABLE) {
+    data->graph = serd_node_copy(graph);
+  }
+
+  return serd_sink_new(data, serd_filter_on_event, free_data);
+}
diff --git a/src/serdi.c b/src/serdi.c
index 243be98e..7c664b72 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -37,6 +37,13 @@
#define SERDI_ERROR(msg) fprintf(stderr, "serdi: " msg)
#define SERDI_ERRORF(fmt, ...) fprintf(stderr, "serdi: " fmt, __VA_ARGS__)
+typedef struct { // Nodes of the statement parsed from a -F or -G pattern
+ SerdNode* s; // Copy of the pattern statement's subject
+ SerdNode* p; // Copy of the pattern statement's predicate
+ SerdNode* o; // Copy of the pattern statement's object
+ SerdNode* g; // Copy of the pattern statement's graph
+} FilterPattern;
+
static int
print_version(void)
{
@@ -57,6 +64,8 @@ print_usage(const char* const name, const bool error)
fprintf(os, "Read and write RDF syntax.\n");
fprintf(os, "Use - for INPUT to read from standard input.\n\n");
fprintf(os, " -C Convert literals to canonical form.\n");
+ fprintf(os, " -F PATTERN Filter out statements that match PATTERN.\n");
+ fprintf(os, " -G PATTERN Only include statements matching PATTERN.\n");
fprintf(os, " -I BASE_URI Input base URI.\n");
fprintf(os, " -a Write ASCII output if possible.\n");
fprintf(os, " -b Fast bulk output for large serialisations.\n");
@@ -88,6 +97,62 @@ missing_arg(const char* const name, const char opt)
}
static SerdStatus
+on_filter_event(void* const handle, const SerdEvent* const event)
+{
+  // Only statements contribute to the pattern, anything else is ignored
+  if (event->type != SERD_STATEMENT) {
+    return SERD_SUCCESS;
+  }
+
+  FilterPattern* const pattern = (FilterPattern*)handle;
+  if (pattern->s) {
+    return SERD_ERR_INVALID; // A statement was already captured
+  }
+
+  // Copy the nodes, since the pattern is used after this event is finished
+  const SerdStatement* const parsed = event->statement.statement;
+
+  pattern->s = serd_node_copy(serd_statement_subject(parsed));
+  pattern->p = serd_node_copy(serd_statement_predicate(parsed));
+  pattern->o = serd_node_copy(serd_statement_object(parsed));
+  pattern->g = serd_node_copy(serd_statement_graph(parsed));
+
+  return SERD_SUCCESS;
+}
+
+/**
+   Parse a filter pattern string and return a new filter sink for it.
+
+   The pattern is read as NQuads with variables enabled, and its nodes are
+   captured by on_filter_event().  Returns NULL if reading the pattern fails,
+   which includes a pattern with more than one statement.
+*/
+static SerdSink*
+parse_filter(SerdWorld* const      world,
+             const SerdSink* const sink,
+             const char* const     str,
+             const bool            inclusive)
+{
+  SerdEnv* const  env         = serd_env_new(SERD_EMPTY_STRING());
+  FilterPattern   pat         = {NULL, NULL, NULL, NULL};
+  SerdSink*       in_sink     = serd_sink_new(&pat, on_filter_event, NULL);
+  SerdByteSource* byte_source = serd_byte_source_new_string(str, NULL);
+  SerdReader*     reader      = serd_reader_new(
+    world, SERD_NQUADS, SERD_READ_VARIABLES, env, in_sink, 4096);
+
+  SerdStatus st = serd_reader_start(reader, byte_source);
+  if (!st) {
+    st = serd_reader_read_document(reader);
+  }
+
+  serd_reader_free(reader);
+  serd_env_free(env);
+  serd_byte_source_free(byte_source);
+  serd_sink_free(in_sink);
+
+  // Only build the filter if the whole pattern was read successfully
+  SerdSink* const filter =
+    st ? NULL : serd_filter_new(sink, pat.s, pat.p, pat.o, pat.g, inclusive);
+
+  // Free the captured nodes on every path: the filter keeps its own copies,
+  // and a failed read may still have captured the first statement
+  serd_node_free(pat.s);
+  serd_node_free(pat.p);
+  serd_node_free(pat.o);
+  serd_node_free(pat.g);
+
+  return filter;
+}
+
+static SerdStatus
read_file(SerdWorld* const world,
SerdSyntax syntax,
const SerdReaderFlags flags,
@@ -158,6 +223,8 @@ main(int argc, char** argv)
bool quiet = false;
size_t stack_size = 4194304;
const char* input_string = NULL;
+ const char* in_pattern = NULL;
+ const char* out_pattern = NULL;
const char* add_prefix = "";
const char* chop_prefix = NULL;
const char* root_uri = NULL;
@@ -197,6 +264,20 @@ main(int argc, char** argv)
return print_version();
} else if (opt == 'x') {
reader_flags |= SERD_READ_VARIABLES;
+ } else if (argv[a][1] == 'F') {
+ if (++a == argc) {
+ return missing_arg(argv[0], 'F');
+ }
+
+ out_pattern = argv[a];
+ break;
+ } else if (argv[a][1] == 'G') {
+ if (++a == argc) {
+ return missing_arg(argv[0], 'g');
+ }
+
+ in_pattern = argv[a];
+ break;
} else if (argv[a][1] == 'I') {
if (++a == argc) {
return missing_arg(prog, 'I');
@@ -280,6 +361,11 @@ main(int argc, char** argv)
}
}
+ if (in_pattern && out_pattern) {
+ SERDI_ERROR("only one of -F and -G can be given at once\n");
+ return 1;
+ }
+
if (a == argc && !input_string) {
SERDI_ERROR("missing input\n");
return 1;
@@ -370,6 +456,23 @@ main(int argc, char** argv)
sink = canon = serd_canon_new(world, out_sink, reader_flags);
}
+ SerdSink* filter = NULL;
+ if (in_pattern) {
+ if (!(filter = parse_filter(world, sink, in_pattern, true))) {
+ SERDI_ERROR("error parsing inclusive filter pattern\n");
+ return EXIT_FAILURE;
+ }
+
+ sink = filter;
+ } else if (out_pattern) {
+ if (!(filter = parse_filter(world, sink, out_pattern, false))) {
+ SERDI_ERROR("error parsing exclusive filter pattern\n");
+ return EXIT_FAILURE;
+ }
+
+ sink = filter;
+ }
+
if (quiet) {
serd_set_log_func(world, serd_quiet_log_func, NULL);
}
@@ -382,8 +485,7 @@ main(int argc, char** argv)
serd_writer_chop_blank_prefix(writer, chop_prefix);
- SerdStatus st = SERD_SUCCESS;
- SerdNode* input_name = NULL;
+ SerdStatus st = SERD_SUCCESS;
if (input_string) {
SerdByteSource* const byte_source =
serd_byte_source_new_string(input_string, NULL);
@@ -467,10 +569,10 @@ main(int argc, char** argv)
}
serd_sink_free(canon);
+ serd_sink_free(filter);
serd_sink_free(inserter);
serd_model_free(model);
serd_writer_free(writer);
- serd_node_free(input_name);
serd_env_free(env);
serd_node_free(base);
serd_world_free(world);
diff --git a/test/meson.build b/test/meson.build
index e8392559..67907c25 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -77,6 +77,14 @@ if get_option('utils')
bad_args = [
['/no/such/file'],
['ftp://unsupported.org'],
+ ['-F', '', '-G', ''],
+ ['-F'],
+ ['-F', '?s ?p ?o . ?q ?r ?s .', '-s', ''],
+ ['-F', '?s ?p ?o .\n?q ?r ?s .\n', '-s', ''],
+ ['-F', 'bad_pattern', '-s', ''],
+ ['-G'],
+ ['-G', '?s ?p ?o . ?q ?r ?s .', '-s', ''],
+ ['-G', 'bad_pattern', '-s', ''],
['-I'],
['-c'],
['-i', 'unknown'],
@@ -135,6 +143,16 @@ if get_option('utils')
env: test_env,
suite: ['serdi', 'options'])
+ test('filter', files('test_filter.py'),
+ args: script_args,
+ env: test_env,
+ suite: ['serdi', 'options'])
+
+ test('grep', files('test_grep.py'),
+ args: script_args,
+ env: test_env,
+ suite: ['serdi', 'options'])
+
# Inputs
test('stdin', files('test_stdin.py'),
diff --git a/test/test_filter.py b/test/test_filter.py
new file mode 100755
index 00000000..d44677f5
--- /dev/null
+++ b/test/test_filter.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+"""Test filtering statements exclusively."""
+
+import argparse
+import sys
+import shlex
+import subprocess
+import tempfile
+
+DOCUMENTS = {
+ "ntriples": """
+ <urn:example:s> <urn:example:p> <urn:example:o> .
+ <urn:example:s> <urn:example:q> <urn:example:r> .
+""",
+ "nquads": """
+ <urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .
+ <urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .
+""",
+}
+
+parser = argparse.ArgumentParser(description=__doc__)
+
+parser.add_argument("--serdi", default="./serdi", help="path to serdi")
+parser.add_argument("--wrapper", default="", help="executable wrapper")
+
+args = parser.parse_args(sys.argv[1:])
+
+
+def check_pattern(syntax, pattern, result):
+ command = shlex.split(args.wrapper) + [
+ args.serdi,
+ "-i",
+ syntax,
+ "-F",
+ pattern,
+ "-s",
+ DOCUMENTS[syntax],
+ ]
+
+ with tempfile.TemporaryFile() as out:
+ proc = subprocess.run(
+ command,
+ check=False,
+ encoding="utf-8",
+ capture_output=True,
+ )
+
+ assert proc.returncode == 0
+ assert args.wrapper or len(proc.stderr) == 0
+ assert proc.stdout == result
+
+
+check_pattern(
+ "ntriples",
+ "?s <urn:example:p> <urn:example:o> .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> .\n",
+)
+
+check_pattern(
+ "ntriples",
+ "<urn:example:s> ?p <urn:example:o> .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> .\n",
+)
+
+check_pattern(
+ "ntriples",
+ "<urn:example:s> <urn:example:p> ?o .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "?s <urn:example:p> <urn:example:o> <urn:example:g> .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "<urn:example:s> ?p <urn:example:o> <urn:example:g> .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "<urn:example:s> <urn:example:p> ?o <urn:example:g> .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "<urn:example:s> <urn:example:p> <urn:example:o> ?g .",
+ "<urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .\n",
+)
diff --git a/test/test_grep.py b/test/test_grep.py
new file mode 100755
index 00000000..0c8c5228
--- /dev/null
+++ b/test/test_grep.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+"""Test filtering statements inclusively."""
+
+import argparse
+import sys
+import shlex
+import subprocess
+import tempfile
+
+DOCUMENTS = {
+ "ntriples": """
+ <urn:example:s> <urn:example:p> <urn:example:o> .
+ <urn:example:s> <urn:example:q> <urn:example:r> .
+""",
+ "nquads": """
+ <urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .
+ <urn:example:s> <urn:example:q> <urn:example:r> <urn:example:g> .
+""",
+}
+
+parser = argparse.ArgumentParser(description=__doc__)
+
+parser.add_argument("--serdi", default="./serdi", help="path to serdi")
+parser.add_argument("--wrapper", default="", help="executable wrapper")
+
+args = parser.parse_args(sys.argv[1:])
+
+
+def check_pattern(syntax, pattern, result):
+ command = shlex.split(args.wrapper) + [
+ args.serdi,
+ "-i",
+ syntax,
+ "-G",
+ pattern,
+ "-s",
+ DOCUMENTS[syntax],
+ ]
+
+ with tempfile.TemporaryFile() as out:
+ proc = subprocess.run(
+ command,
+ check=False,
+ encoding="utf-8",
+ capture_output=True,
+ )
+
+ assert proc.returncode == 0
+ assert args.wrapper or len(proc.stderr) == 0
+ assert proc.stdout == result
+
+
+check_pattern(
+ "ntriples",
+ "?s <urn:example:p> <urn:example:o> .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> .\n",
+)
+
+check_pattern(
+ "ntriples",
+ "<urn:example:s> ?p <urn:example:o> .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> .\n",
+)
+
+check_pattern(
+ "ntriples",
+ "<urn:example:s> <urn:example:p> ?o .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "?s <urn:example:p> <urn:example:o> <urn:example:g> .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "<urn:example:s> ?p <urn:example:o> <urn:example:g> .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "<urn:example:s> <urn:example:p> ?o <urn:example:g> .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n",
+)
+
+check_pattern(
+ "nquads",
+ "<urn:example:s> <urn:example:p> <urn:example:o> ?g .",
+ "<urn:example:s> <urn:example:p> <urn:example:o> <urn:example:g> .\n",
+)