From 5aa146e1ce58d295b5f45446bbbbdbb30c8e557d Mon Sep 17 00:00:00 2001 From: David Robillard Date: Sun, 8 Aug 2021 14:24:59 -0400 Subject: Replace serdi -b and -e options with a block size option This is more powerful, and reduces the number of command line options that almost nobody needs to care about. --- doc/serdi.1 | 16 +++++++--------- scripts/serd_bench.py | 4 ++-- src/console.c | 37 ++++++++++++++++++++++++------------- src/console.h | 4 ++-- src/serdi.c | 34 +++++++++++++++++++--------------- src/system.h | 17 ----------------- test/meson.build | 10 ++++------ test/run_test_suite.py | 2 +- 8 files changed, 59 insertions(+), 65 deletions(-) diff --git a/doc/serdi.1 b/doc/serdi.1 index 2a110785..99652738 100644 --- a/doc/serdi.1 +++ b/doc/serdi.1 @@ -6,9 +6,10 @@ .Nd read, filter, transform, and write RDF data .Sh SYNOPSIS .Nm serdi -.Op Fl Cabefhlmqtvx +.Op Fl Cafhlmqtvx .Op Fl F Ar pattern | Fl G Ar pattern .Op Fl I Ar base +.Op Fl b Ar bytes .Op Fl c Ar prefix .Op Fl i Ar syntax .Op Fl k Ar bytes @@ -77,9 +78,11 @@ or to provide a base URI for input from stdin or a string. Write ASCII output. If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8. .Pp -.It Fl b -Bulk output writing. -If this is enabled, output will be written a page at a time, rather than a byte at a time. +.It Fl b Ar bytes +I/O block size. +This is the number of bytes in a file that will be read or written at once. +The default is 4096, which should perform well in most cases. +Note that this only applies to files; standard input and output are always processed one byte at a time. .Pp .It Fl c Ar prefix Chop @@ -88,11 +91,6 @@ from matching blank node IDs. This is the inverse of .Fl p . .Pp -.It Fl e -Eat input one character at a time, rather than a page at a time which is the default. 
-This is useful when reading from a pipe since output will be generated immediately as input arrives, rather than waiting until an entire page of input has arrived. -With this option serdi uses one page less memory, but will likely be significantly slower. -.Pp .It Fl f Fast and loose mode. This disables shortening URIs into prefixed names or relative URI references. diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py index 018734c4..8a10dab0 100755 --- a/scripts/serd_bench.py +++ b/scripts/serd_bench.py @@ -226,8 +226,8 @@ example: args = ap.parse_args(sys.argv[1:]) progs = [ - "serdi -b -i turtle -o turtle", - "serdi -m -b -i turtle -o turtle", + "serdi -i turtle -o turtle", + "serdi -m -i turtle -o turtle", ] + args.run min_n = int(args.max / 10) diff --git a/src/console.c b/src/console.c index 2cc908ef..df1bc2ff 100644 --- a/src/console.c +++ b/src/console.c @@ -15,7 +15,6 @@ */ #include "console.h" -#include "system.h" #include "serd/serd.h" @@ -27,6 +26,7 @@ # include #endif +#include #include void @@ -56,8 +56,24 @@ serd_print_version(const char* const program) return 0; } +/// Wrapper for getc that is compatible with SerdReadFunc but faster than fread +static size_t +serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream) +{ + (void)size; + (void)nmemb; + + const int c = getc((FILE*)stream); + if (c == EOF) { + *((uint8_t*)buf) = 0; + return 0; + } + *((uint8_t*)buf) = (uint8_t)c; + return 1; +} + SerdByteSource* -serd_open_input(const char* const filename, const size_t page_size) +serd_open_input(const char* const filename, const size_t block_size) { SerdByteSource* byte_source = NULL; if (!strcmp(filename, "-")) { @@ -65,31 +81,26 @@ serd_open_input(const char* const filename, const size_t page_size) SerdNode* name = serd_new_string(SERD_STRING("stdin")); - byte_source = serd_byte_source_new_function(serd_file_read_byte, - (SerdStreamErrorFunc)ferror, - NULL, - stdin, - name, - page_size); + byte_source = 
serd_byte_source_new_function( + serd_file_read_byte, (SerdStreamErrorFunc)ferror, NULL, stdin, name, 1); serd_node_free(name); } else { - byte_source = serd_byte_source_new_filename(filename, page_size); + byte_source = serd_byte_source_new_filename(filename, block_size); } return byte_source; } SerdByteSink* -serd_open_output(const char* const filename, const size_t page_size) +serd_open_output(const char* const filename, const size_t block_size) { if (!filename || !strcmp(filename, "-")) { serd_set_stream_utf8_mode(stdout); - return serd_byte_sink_new_function( - (SerdWriteFunc)fwrite, stdout, page_size); + return serd_byte_sink_new_function((SerdWriteFunc)fwrite, stdout, 1); } - return serd_byte_sink_new_filename(filename, page_size); + return serd_byte_sink_new_filename(filename, block_size); } SerdStatus diff --git a/src/console.h b/src/console.h index 57170a94..31076b24 100644 --- a/src/console.h +++ b/src/console.h @@ -25,10 +25,10 @@ int serd_print_version(const char* program); SerdByteSource* -serd_open_input(const char* filename, size_t page_size); +serd_open_input(const char* filename, size_t block_size); SerdByteSink* -serd_open_output(const char* filename, size_t page_size); +serd_open_output(const char* filename, size_t block_size); SerdStatus serd_set_base_uri_from_path(SerdEnv* env, const char* path); diff --git a/src/serdi.c b/src/serdi.c index 88ccacd9..73a3f05c 100644 --- a/src/serdi.c +++ b/src/serdi.c @@ -15,7 +15,6 @@ */ #include "console.h" -#include "system.h" #include "serd/serd.h" @@ -49,9 +48,8 @@ print_usage(const char* const name, const bool error) fprintf(os, " -G PATTERN Only include statements matching PATTERN.\n"); fprintf(os, " -I BASE_URI Input base URI.\n"); fprintf(os, " -a Write ASCII output if possible.\n"); - fprintf(os, " -b Fast bulk output for large serialisations.\n"); + fprintf(os, " -b BYTES I/O block size.\n"); fprintf(os, " -c PREFIX Chop PREFIX from matching blank node IDs.\n"); - fprintf(os, " -e Eat input one 
character at a time.\n"); fprintf(os, " -f Fast and loose mode (possibly ugly output).\n"); fprintf(os, " -h Display this help and exit.\n"); fprintf(os, " -i SYNTAX Input syntax: turtle/ntriples/trig/nquads.\n"); @@ -142,13 +140,12 @@ read_file(SerdWorld* const world, const size_t stack_size, const char* const filename, const char* const add_prefix, - const bool bulk_read) + const size_t block_size) { syntax = syntax ? syntax : serd_guess_syntax(filename); syntax = syntax ? syntax : SERD_TRIG; - SerdByteSource* byte_source = - serd_open_input(filename, bulk_read ? SERD_PAGE_SIZE : 1u); + SerdByteSource* byte_source = serd_open_input(filename, block_size); if (!byte_source) { SERDI_ERRORF( @@ -185,13 +182,12 @@ main(int argc, char** argv) SerdSyntax output_syntax = SERD_SYNTAX_EMPTY; SerdReaderFlags reader_flags = 0; SerdWriterFlags writer_flags = 0; - bool bulk_read = true; - bool bulk_write = false; bool no_inline = false; bool osyntax_set = false; bool use_model = false; bool canonical = false; bool quiet = false; + size_t block_size = 4096u; size_t stack_size = 4194304; const char* input_string = NULL; const char* in_pattern = NULL; @@ -213,10 +209,6 @@ main(int argc, char** argv) canonical = true; } else if (opt == 'a') { writer_flags |= SERD_WRITE_ASCII; - } else if (opt == 'b') { - bulk_write = true; - } else if (opt == 'e') { - bulk_read = false; } else if (opt == 'f') { no_inline = true; writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM); @@ -256,6 +248,19 @@ main(int argc, char** argv) base = serd_new_uri(SERD_STRING(argv[a])); break; + } else if (opt == 'b') { + if (argv[a][o + 1] || ++a == argc) { + return missing_arg(prog, 'b'); + } + + char* endptr = NULL; + const long size = strtol(argv[a], &endptr, 10); + if (size < 1 || size == LONG_MAX || *endptr != '\0') { + SERDI_ERRORF("invalid block size `%s'\n", argv[a]); + return 1; + } + block_size = (size_t)size; + break; } else if (opt == 'c') { if (argv[a][o + 1] || ++a == argc) { return 
missing_arg(prog, 'c'); @@ -380,8 +385,7 @@ main(int argc, char** argv) const SerdDescribeFlags describe_flags = no_inline ? SERD_NO_INLINE_OBJECTS : 0u; - SerdByteSink* const byte_sink = - serd_open_output(out_filename, bulk_write ? 4096u : 1u); + SerdByteSink* const byte_sink = serd_open_output(out_filename, block_size); if (!byte_sink) { perror("serdi: error opening output file"); return 1; @@ -498,7 +502,7 @@ main(int argc, char** argv) stack_size, inputs[i], n_inputs > 1 ? prefix : add_prefix, - bulk_read))) { + block_size))) { break; } } diff --git a/src/system.h b/src/system.h index 27087bde..184e1aae 100644 --- a/src/system.h +++ b/src/system.h @@ -19,7 +19,6 @@ #include "attributes.h" -#include #include #define SERD_PAGE_SIZE 4096 @@ -47,20 +46,4 @@ serd_allocate_buffer(size_t size); void serd_free_aligned(void* ptr); -/// Wrapper for getc that is compatible with SerdReadFunc -static inline size_t -serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream) -{ - (void)size; - (void)nmemb; - - const int c = getc((FILE*)stream); - if (c == EOF) { - *((uint8_t*)buf) = 0; - return 0; - } - *((uint8_t*)buf) = (uint8_t)c; - return 1; -} - #endif // SERD_SYSTEM_H diff --git a/test/meson.build b/test/meson.build index 67907c25..898062f0 100644 --- a/test/meson.build +++ b/test/meson.build @@ -86,6 +86,10 @@ if get_option('utils') ['-G', '?s ?p ?o . 
?q ?r ?s .', '-s', ''], ['-G', 'bad_pattern', '-s', ''], ['-I'], + ['-b'], + ['-b', '-1'], + ['-b', '9223372036854775807'], + ['-b', '1024junk'], ['-c'], ['-i', 'unknown'], ['-i', 'turtle'], @@ -201,12 +205,6 @@ if get_option('utils') # IO errors test('read_dir', serdi, - args: ['-e', meson.source_root()], - env: test_env, - should_fail: true, - suite: 'io_errors') - - test('bulk_read_dir', serdi, args: [meson.source_root()], env: test_env, should_fail: true, diff --git a/test/run_test_suite.py b/test/run_test_suite.py index d6772014..0d865c1f 100755 --- a/test/run_test_suite.py +++ b/test/run_test_suite.py @@ -224,7 +224,7 @@ def test_suite( self.n_failures = 0 def run_tests(test_class, tests, expected_return, results): - thru_flags = [["-e"], ["-f"], ["-b"], ["-r", "http://example.org/"]] + thru_flags = [["-f"], ["-b", "1"], ["-r", "http://example.org/"]] thru_options_iter = _option_combinations(thru_flags) if output_syntax is not None: osyntax = output_syntax -- cgit v1.2.1