Replace serdi -b and -e options with a block size option

This is more powerful, and reduces the number of command line options that almost nobody needs to care about.
author: David Robillard <d@drobilla.net> 2021-08-08 14:24:59 -0400
committer: David Robillard <d@drobilla.net> 2022-01-28 21:57:07 -0500
commit: 5aa146e1ce58d295b5f45446bbbbdbb30c8e557d (patch)
tree: c8b49564b251bf2c3ff63bbfdc932986bea8d0b6
parent: baa2e4e768f542953144cfa6ebe5713ecad389fc (diff)
download: serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.tar.gz
serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.tar.bz2
serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.zip
8 files changed, 59 insertions, 65 deletions
diff --git a/doc/serdi.1 b/doc/serdi.1
index 2a110785..99652738 100644
--- a/doc/serdi.1
+++ b/doc/serdi.1
@@ -6,9 +6,10 @@
 .Nd read, filter, transform, and write RDF data
 .Sh SYNOPSIS
 .Nm serdi
-.Op Fl Cabefhlmqtvx
+.Op Fl Cafhlmqtvx
 .Op Fl F Ar pattern | Fl G Ar pattern
 .Op Fl I Ar base
+.Op Fl b Ar bytes
 .Op Fl c Ar prefix
 .Op Fl i Ar syntax
 .Op Fl k Ar bytes
@@ -77,9 +78,11 @@ or to provide a base URI for input from stdin or a string.
 Write ASCII output.
 If this is enabled, all non-ASCII characters will be escaped, even if the output syntax allows them to be written in UTF-8.
 .Pp
-.It Fl b
-Bulk output writing.
-If this is enabled, output will be written a page at a time, rather than a byte at a time.
+.It Fl b Ar bytes
+I/O block size.
+This is the number of bytes in a file that will be read or written at once.
+The default is 4096, which should perform well in most cases.
+Note that this only applies to files: standard input and output are always processed one byte at a time.
 .Pp
 .It Fl c Ar prefix
 Chop
@@ -88,11 +91,6 @@ from matching blank node IDs.
 This is the inverse of
 .Fl p .
 .Pp
-.It Fl e
-Eat input one character at a time, rather than a page at a time which is the default.
-This is useful when reading from a pipe since output will be generated immediately as input arrives, rather than waiting until an entire page of input has arrived.
-With this option serdi uses one page less memory, but will likely be significantly slower.
-.Pp
 .It Fl f
 Fast and loose mode.
 This disables shortening URIs into prefixed names or relative URI references.
diff --git a/scripts/serd_bench.py b/scripts/serd_bench.py
index 018734c4..8a10dab0 100755
--- a/scripts/serd_bench.py
+++ b/scripts/serd_bench.py
@@ -226,8 +226,8 @@ example:
     args = ap.parse_args(sys.argv[1:])
 
     progs = [
-        "serdi -b -i turtle -o turtle",
-        "serdi -m -b -i turtle -o turtle",
+        "serdi -i turtle -o turtle",
+        "serdi -m -i turtle -o turtle",
     ] + args.run
 
     min_n = int(args.max / 10)
diff --git a/src/console.c b/src/console.c
index 2cc908ef..df1bc2ff 100644
--- a/src/console.c
+++ b/src/console.c
@@ -15,7 +15,6 @@
 */
 
 #include "console.h"
-#include "system.h"
 
 #include "serd/serd.h"
 
@@ -27,6 +26,7 @@
 #  include <io.h>
 #endif
 
+#include <stdint.h>
 #include <string.h>
 
 void
@@ -56,8 +56,24 @@ serd_print_version(const char* const program)
   return 0;
 }
 
+/// Wrapper for getc that is compatible with SerdReadFunc but faster than fread
+static size_t
+serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
+{
+  (void)size;
+  (void)nmemb;
+
+  const int c = getc((FILE*)stream);
+  if (c == EOF) {
+    *((uint8_t*)buf) = 0;
+    return 0;
+  }
+  *((uint8_t*)buf) = (uint8_t)c;
+  return 1;
+}
+
 SerdByteSource*
-serd_open_input(const char* const filename, const size_t page_size)
+serd_open_input(const char* const filename, const size_t block_size)
 {
   SerdByteSource* byte_source = NULL;
   if (!strcmp(filename, "-")) {
@@ -65,31 +81,26 @@ serd_open_input(const char* const filename, const size_t page_size)
 
     SerdNode* name = serd_new_string(SERD_STRING("stdin"));
 
-    byte_source = serd_byte_source_new_function(serd_file_read_byte,
-                                                (SerdStreamErrorFunc)ferror,
-                                                NULL,
-                                                stdin,
-                                                name,
-                                                page_size);
+    byte_source = serd_byte_source_new_function(
+      serd_file_read_byte, (SerdStreamErrorFunc)ferror, NULL, stdin, name, 1);
 
     serd_node_free(name);
   } else {
-    byte_source = serd_byte_source_new_filename(filename, page_size);
+    byte_source = serd_byte_source_new_filename(filename, block_size);
   }
 
   return byte_source;
 }
 
 SerdByteSink*
-serd_open_output(const char* const filename, const size_t page_size)
+serd_open_output(const char* const filename, const size_t block_size)
 {
   if (!filename || !strcmp(filename, "-")) {
     serd_set_stream_utf8_mode(stdout);
-    return serd_byte_sink_new_function(
-      (SerdWriteFunc)fwrite, stdout, page_size);
+    return serd_byte_sink_new_function((SerdWriteFunc)fwrite, stdout, 1);
   }
 
-  return serd_byte_sink_new_filename(filename, page_size);
+  return serd_byte_sink_new_filename(filename, block_size);
 }
 
 SerdStatus
diff --git a/src/console.h b/src/console.h
index 57170a94..31076b24 100644
--- a/src/console.h
+++ b/src/console.h
@@ -25,10 +25,10 @@ int
 serd_print_version(const char* program);
 
 SerdByteSource*
-serd_open_input(const char* filename, size_t page_size);
+serd_open_input(const char* filename, size_t block_size);
 
 SerdByteSink*
-serd_open_output(const char* filename, size_t page_size);
+serd_open_output(const char* filename, size_t block_size);
 
 SerdStatus
 serd_set_base_uri_from_path(SerdEnv* env, const char* path);
diff --git a/src/serdi.c b/src/serdi.c
index 88ccacd9..73a3f05c 100644
--- a/src/serdi.c
+++ b/src/serdi.c
@@ -15,7 +15,6 @@
 */
 
 #include "console.h"
-#include "system.h"
 
 #include "serd/serd.h"
 
@@ -49,9 +48,8 @@ print_usage(const char* const name, const bool error)
   fprintf(os, "  -G PATTERN   Only include statements matching PATTERN.\n");
   fprintf(os, "  -I BASE_URI  Input base URI.\n");
   fprintf(os, "  -a           Write ASCII output if possible.\n");
-  fprintf(os, "  -b           Fast bulk output for large serialisations.\n");
+  fprintf(os, "  -b BYTES     I/O block size.\n");
   fprintf(os, "  -c PREFIX    Chop PREFIX from matching blank node IDs.\n");
-  fprintf(os, "  -e           Eat input one character at a time.\n");
   fprintf(os, "  -f           Fast and loose mode (possibly ugly output).\n");
   fprintf(os, "  -h           Display this help and exit.\n");
   fprintf(os, "  -i SYNTAX    Input syntax: turtle/ntriples/trig/nquads.\n");
@@ -142,13 +140,12 @@ read_file(SerdWorld* const      world,
           const size_t          stack_size,
           const char* const     filename,
           const char* const     add_prefix,
-          const bool            bulk_read)
+          const size_t          block_size)
 {
   syntax = syntax ? syntax : serd_guess_syntax(filename);
   syntax = syntax ? syntax : SERD_TRIG;
 
-  SerdByteSource* byte_source =
-    serd_open_input(filename, bulk_read ? SERD_PAGE_SIZE : 1u);
+  SerdByteSource* byte_source = serd_open_input(filename, block_size);
 
   if (!byte_source) {
     SERDI_ERRORF(
@@ -185,13 +182,12 @@ main(int argc, char** argv)
   SerdSyntax      output_syntax = SERD_SYNTAX_EMPTY;
   SerdReaderFlags reader_flags  = 0;
   SerdWriterFlags writer_flags  = 0;
-  bool            bulk_read     = true;
-  bool            bulk_write    = false;
   bool            no_inline     = false;
   bool            osyntax_set   = false;
   bool            use_model     = false;
   bool            canonical     = false;
   bool            quiet         = false;
+  size_t          block_size    = 4096u;
   size_t          stack_size    = 4194304;
   const char*     input_string  = NULL;
   const char*     in_pattern    = NULL;
@@ -213,10 +209,6 @@ main(int argc, char** argv)
         canonical = true;
       } else if (opt == 'a') {
         writer_flags |= SERD_WRITE_ASCII;
-      } else if (opt == 'b') {
-        bulk_write = true;
-      } else if (opt == 'e') {
-        bulk_read = false;
       } else if (opt == 'f') {
         no_inline = true;
         writer_flags |= (SERD_WRITE_EXPANDED | SERD_WRITE_VERBATIM);
@@ -256,6 +248,19 @@ main(int argc, char** argv)
 
         base = serd_new_uri(SERD_STRING(argv[a]));
         break;
+      } else if (opt == 'b') {
+        if (argv[a][o + 1] || ++a == argc) {
+          return missing_arg(prog, 'b');
+        }
+
+        char*      endptr = NULL;
+        const long size   = strtol(argv[a], &endptr, 10);
+        if (size < 1 || size == LONG_MAX || *endptr != '\0') {
+          SERDI_ERRORF("invalid block size `%s'\n", argv[a]);
+          return 1;
+        }
+        block_size = (size_t)size;
+        break;
       } else if (opt == 'c') {
         if (argv[a][o + 1] || ++a == argc) {
           return missing_arg(prog, 'c');
@@ -380,8 +385,7 @@ main(int argc, char** argv)
   const SerdDescribeFlags describe_flags =
     no_inline ? SERD_NO_INLINE_OBJECTS : 0u;
 
-  SerdByteSink* const byte_sink =
-    serd_open_output(out_filename, bulk_write ? 4096u : 1u);
+  SerdByteSink* const byte_sink = serd_open_output(out_filename, block_size);
   if (!byte_sink) {
     perror("serdi: error opening output file");
     return 1;
@@ -498,7 +502,7 @@ main(int argc, char** argv)
                         stack_size,
                         inputs[i],
                         n_inputs > 1 ? prefix : add_prefix,
-                        bulk_read))) {
+                        block_size))) {
       break;
     }
   }
diff --git a/src/system.h b/src/system.h
index 27087bde..184e1aae 100644
--- a/src/system.h
+++ b/src/system.h
@@ -19,7 +19,6 @@
 
 #include "attributes.h"
 
-#include <stdint.h>
 #include <stdio.h>
 
 #define SERD_PAGE_SIZE 4096
@@ -47,20 +46,4 @@ serd_allocate_buffer(size_t size);
 void
 serd_free_aligned(void* ptr);
 
-/// Wrapper for getc that is compatible with SerdReadFunc
-static inline size_t
-serd_file_read_byte(void* buf, size_t size, size_t nmemb, void* stream)
-{
-  (void)size;
-  (void)nmemb;
-
-  const int c = getc((FILE*)stream);
-  if (c == EOF) {
-    *((uint8_t*)buf) = 0;
-    return 0;
-  }
-  *((uint8_t*)buf) = (uint8_t)c;
-  return 1;
-}
-
 #endif // SERD_SYSTEM_H
diff --git a/test/meson.build b/test/meson.build
index 67907c25..898062f0 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -86,6 +86,10 @@ if get_option('utils')
     ['-G', '?s ?p ?o . ?q ?r ?s .', '-s', ''],
     ['-G', 'bad_pattern', '-s', ''],
     ['-I'],
+    ['-b'],
+    ['-b', '-1'],
+    ['-b', '9223372036854775807'],
+    ['-b', '1024junk'],
     ['-c'],
     ['-i', 'unknown'],
     ['-i', 'turtle'],
@@ -201,12 +205,6 @@ if get_option('utils')
   # IO errors
 
   test('read_dir', serdi,
-       args: ['-e', meson.source_root()],
-       env: test_env,
-       should_fail: true,
-       suite: 'io_errors')
-
-  test('bulk_read_dir', serdi,
        args: [meson.source_root()],
        env: test_env,
        should_fail: true,
diff --git a/test/run_test_suite.py b/test/run_test_suite.py
index d6772014..0d865c1f 100755
--- a/test/run_test_suite.py
+++ b/test/run_test_suite.py
@@ -224,7 +224,7 @@ def test_suite(
             self.n_failures = 0
 
     def run_tests(test_class, tests, expected_return, results):
-        thru_flags = [["-e"], ["-f"], ["-b"], ["-r", "http://example.org/"]]
+        thru_flags = [["-f"], ["-b", "1"], ["-r", "http://example.org/"]]
         thru_options_iter = _option_combinations(thru_flags)
         if output_syntax is not None:
             osyntax = output_syntax
author	David Robillard <d@drobilla.net>	2021-08-08 14:24:59 -0400
committer	David Robillard <d@drobilla.net>	2022-01-28 21:57:07 -0500
commit	5aa146e1ce58d295b5f45446bbbbdbb30c8e557d (patch)
tree	c8b49564b251bf2c3ff63bbfdc932986bea8d0b6
parent	baa2e4e768f542953144cfa6ebe5713ecad389fc (diff)
download	serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.tar.gz serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.tar.bz2 serd-5aa146e1ce58d295b5f45446bbbbdbb30c8e557d.zip