aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author David Robillard <d@drobilla.net> 2021-08-08 14:24:59 -0400
committer David Robillard <d@drobilla.net> 2022-01-28 21:57:07 -0500
commit 5aa146e1ce58d295b5f45446bbbbdbb30c8e557d (patch)
tree c8b49564b251bf2c3ff63bbfdc932986bea8d0b6
parent baa2e4e768f542953144cfa6ebe5713ecad389fc (diff)
download serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.tar.gz
serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.tar.bz2
serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.zip
Replace serdi -b and -e options with a block size option
This is more powerful, and reduces the number of command line options that almost nobody needs to care about.
-rw-r--r-- doc/serdi.1 16
-rwxr-xr-x scripts/serd_bench.py 4
-rw-r--r-- src/console.c 37
-rw-r--r-- src/console.h 4
-rw-r--r-- src/serdi.c 34
-rw-r--r-- src/system.h 17
-rw-r--r-- test/meson.build 10
-rwxr-xr-x test/run_test_suite.py 2
8 files changed, 59 insertions, 65 deletions
diff --git a/doc/serdi.1 b/doc/serdi.1
index 2a110785..99652738 100644
--- a/doc/serdi.1
+++ b/doc/serdi.1
@@ -6,9 +6,10 @@
.Nd read, filter, transform, and write RDF data
.Sh SYNOPSIS
.Nm serdi
-.Op Fl Cabefhlmqtvx
+.Op Fl Cafhlmqtvx
.Op Fl F Ar pattern | Fl G Ar pattern
.Op Fl I Ar base
+.Op Fl b Ar bytes
.Op Fl c Ar prefix
.Op Fl i Ar syntax
.Op Fl k Ar bytes
@@ -77,9 +78,11 @@ or to provide a base URI for input from stdin or a string.
Write ASCII output.
If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8.
.Pp
-.It Fl b
-Bulk output writing.
-If this is enabled, output will be written a page at a time, rather than a byte at a time.
+.It Fl b Ar bytes
+I/O block size.
+This is the number of bytes in a file that will be read or written at once.
+The default is 4096, which should perform well in most cases.
+Note that this only applies to files; standard input and output are always processed one byte at a time.
.Pp
.It Fl c Ar prefix
Chop
@@ -88,11 +91,6 @@ from matching blank node IDs.
This is the inverse of
.Fl p .
.Pp
-.It Fl e
-Eat input one character at a time, rather than a page at a time which is the default.
-This is useful when reading from a pipe since output will be generated immediately as input arrives, rather than waiting until an entire page of input has arrived.
-With this option serdi uses one page less memory, but will likely be significantly slower.
-.Pp
.It Fl f
Fast and loose mode.
This disables shortening URIs into prefixed names or relative URI references.
diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py
index 018734c4..8a10dab0 100755
--- a/scripts/serd_bench.py
+++ b/scripts/serd_bench.py
@@ -226,8 +226,8 @@ example:
args = ap.parse_args(sys.argv[1:])
progs = [
- "serdi -b -i turtle -o turtle",
- "serdi -m -b -i turtle -o turtle",
+ "serdi -i turtle -o turtle",
+ "serdi -m -i turtle -o turtle",
] + args.run
min_n = int(args.max / 10)
diff --git a/src/console.c b/src/console.c
index 2cc908ef..df1bc2ff 100644
--- a/src/console.c
+++ b/src/console.c
@@ -15,7 +15,6 @@
*/
#include "console.h"
-#include "system.h"
#include "serd/serd.h"
@@ -27,6 +26,7 @@
# include <io.h>
#endif
+#include <stdint.h>
#include <string.h>
void
@@ -56,8 +56,24 @@ serd_print_version(const char* const program)
return 0;
}
+/// Wrapper for getc that is compatible with SerdReadFunc but faster than fread
+static size_t
+serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
+{
+ (void)size;
+ (void)nmemb;
+
+ const int c = getc((FILE*)stream);
+ if (c == EOF) {
+ *((uint8_t*)buf) = 0;
+ return 0;
+ }
+ *((uint8_t*)buf) = (uint8_t)c;
+ return 1;
+}
+
SerdByteSource*
-serd_open_input(const char* const filename, const size_t page_size)
+serd_open_input(const char* const filename, const size_t block_size)
{
SerdByteSource* byte_source = NULL;
if (!strcmp(filename, "-")) {
@@ -65,31 +81,26 @@ serd_open_input(const char* const filename, const size_t page_size)
SerdNode* name = serd_new_string(SERD_STRING("stdin"));
- byte_source = serd_byte_source_new_function(serd_file_read_byte,
- (SerdStreamErrorFunc)ferror,
- NULL,
- stdin,
- name,
- page_size);
+ byte_source = serd_byte_source_new_function(
+ serd_file_read_byte, (SerdStreamErrorFunc)ferror, NULL, stdin, name, 1);
serd_node_free(name);
} else {
- byte_source = serd_byte_source_new_filename(filename, page_size);
+ byte_source = serd_byte_source_new_filename(filename, block_size);
}
return byte_source;
}
SerdByteSink*
-serd_open_output(const char* const filename, const size_t page_size)
+serd_open_output(const char* const filename, const size_t block_size)
{
if (!filename || !strcmp(filename, "-")) {
serd_set_stream_utf8_mode(stdout);
- return serd_byte_sink_new_function(
- (SerdWriteFunc)fwrite, stdout, page_size);
+ return serd_byte_sink_new_function((SerdWriteFunc)fwrite, stdout, 1);
}
- return serd_byte_sink_new_filename(filename, page_size);
+ return serd_byte_sink_new_filename(filename, block_size);
}
SerdStatus
diff --git a/src/console.h b/src/console.h
index 57170a94..31076b24 100644
--- a/src/console.h
+++ b/src/console.h
@@ -25,10 +25,10 @@ int
serd_print_version(const char* program);
SerdByteSource*
-serd_open_input(const char* filename, size_t page_size);
+serd_open_input(const char* filename, size_t block_size);
SerdByteSink*
-serd_open_output(const char* filename, size_t page_size);
+serd_open_output(const char* filename, size_t block_size);
SerdStatus
serd_set_base_uri_from_path(SerdEnv* env, const char* path);
diff --git a/src/serdi.c b/src/serdi.c
index 88ccacd9..73a3f05c 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -15,7 +15,6 @@
*/
#include "console.h"
-#include "system.h"
#include "serd/serd.h"
@@ -49,9 +48,8 @@ print_usage(const char* const name, const bool error)
fprintf(os, " -G PATTERN Only include statements matching PATTERN.\n");
fprintf(os, " -I BASE_URI Input base URI.\n");
fprintf(os, " -a Write ASCII output if possible.\n");
- fprintf(os, " -b Fast bulk output for large serialisations.\n");
+ fprintf(os, " -b BYTES I/O block size.\n");
fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs.\n");
- fprintf(os, " -e Eat input one character at a time.\n");
fprintf(os, " -f Fast and loose mode (possibly ugly output).\n");
fprintf(os, " -h Display this help and exit.\n");
fprintf(os, " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n");
@@ -142,13 +140,12 @@ read_file(SerdWorld* const world,
const size_t stack_size,
const char* const filename,
const char* const add_prefix,
- const bool bulk_read)
+ const size_t block_size)
{
syntax = syntax ? syntax : serd_guess_syntax(filename);
syntax = syntax ? syntax : SERD_TRIG;
- SerdByteSource* byte_source =
- serd_open_input(filename, bulk_read ? SERD_PAGE_SIZE : 1u);
+ SerdByteSource* byte_source = serd_open_input(filename, block_size);
if (!byte_source) {
SERDI_ERRORF(
@@ -185,13 +182,12 @@ main(int argc, char** argv)
SerdSyntax output_syntax = SERD_SYNTAX_EMPTY;
SerdReaderFlags reader_flags = 0;
SerdWriterFlags writer_flags = 0;
- bool bulk_read = true;
- bool bulk_write = false;
bool no_inline = false;
bool osyntax_set = false;
bool use_model = false;
bool canonical = false;
bool quiet = false;
+ size_t block_size = 4096u;
size_t stack_size = 4194304;
const char* input_string = NULL;
const char* in_pattern = NULL;
@@ -213,10 +209,6 @@ main(int argc, char** argv)
canonical = true;
} else if (opt == 'a') {
writer_flags |= SERD_WRITE_ASCII;
- } else if (opt == 'b') {
- bulk_write = true;
- } else if (opt == 'e') {
- bulk_read = false;
} else if (opt == 'f') {
no_inline = true;
writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM);
@@ -256,6 +248,19 @@ main(int argc, char** argv)
base = serd_new_uri(SERD_STRING(argv[a]));
break;
+ } else if (opt == 'b') {
+ if (argv[a][o + 1] || ++a == argc) {
+ return missing_arg(prog, 'b');
+ }
+
+ char* endptr = NULL;
+ const long size = strtol(argv[a], &endptr, 10);
+ if (size < 1 || size == LONG_MAX || *endptr != '\0') {
+ SERDI_ERRORF("invalid block size `%s'\n", argv[a]);
+ return 1;
+ }
+ block_size = (size_t)size;
+ break;
} else if (opt == 'c') {
if (argv[a][o + 1] || ++a == argc) {
return missing_arg(prog, 'c');
@@ -380,8 +385,7 @@ main(int argc, char** argv)
const SerdDescribeFlags describe_flags =
no_inline ? SERD_NO_INLINE_OBJECTS : 0u;
- SerdByteSink* const byte_sink =
- serd_open_output(out_filename, bulk_write ? 4096u : 1u);
+ SerdByteSink* const byte_sink = serd_open_output(out_filename, block_size);
if (!byte_sink) {
perror("serdi: error opening output file");
return 1;
@@ -498,7 +502,7 @@ main(int argc, char** argv)
stack_size,
inputs[i],
n_inputs > 1 ? prefix : add_prefix,
- bulk_read))) {
+ block_size))) {
break;
}
}
diff --git a/src/system.h b/src/system.h
index 27087bde..184e1aae 100644
--- a/src/system.h
+++ b/src/system.h
@@ -19,7 +19,6 @@
#include "attributes.h"
-#include <stdint.h>
#include <stdio.h>
#define SERD_PAGE_SIZE 4096
@@ -47,20 +46,4 @@ serd_allocate_buffer(size_t size);
void
serd_free_aligned(void* ptr);
-/// Wrapper for getc that is compatible with SerdReadFunc
-static inline size_t
-serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
-{
- (void)size;
- (void)nmemb;
-
- const int c = getc((FILE*)stream);
- if (c == EOF) {
- *((uint8_t*)buf) = 0;
- return 0;
- }
- *((uint8_t*)buf) = (uint8_t)c;
- return 1;
-}
-
#endif // SERD_SYSTEM_H
diff --git a/test/meson.build b/test/meson.build
index 67907c25..898062f0 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -86,6 +86,10 @@ if get_option('utils')
['-G', '?s ?p ?o . ?q ?r ?s .', '-s', ''],
['-G', 'bad_pattern', '-s', ''],
['-I'],
+ ['-b'],
+ ['-b', '-1'],
+ ['-b', '9223372036854775807'],
+ ['-b', '1024junk'],
['-c'],
['-i', 'unknown'],
['-i', 'turtle'],
@@ -201,12 +205,6 @@ if get_option('utils')
# IO errors
test('read_dir', serdi,
- args: ['-e', meson.source_root()],
- env: test_env,
- should_fail: true,
- suite: 'io_errors')
-
- test('bulk_read_dir', serdi,
args: [meson.source_root()],
env: test_env,
should_fail: true,
diff --git a/test/run_test_suite.py b/test/run_test_suite.py
index d6772014..0d865c1f 100755
--- a/test/run_test_suite.py
+++ b/test/run_test_suite.py
@@ -224,7 +224,7 @@ def test_suite(
self.n_failures = 0
def run_tests(test_class, tests, expected_return, results):
- thru_flags = [["-e"], ["-f"], ["-b"], ["-r", "http://example.org/"]]
+ thru_flags = [["-f"], ["-b", "1"], ["-r", "http://example.org/"]]
thru_options_iter = _option_combinations(thru_flags)
if output_syntax is not None:
osyntax = output_syntax