Add rerex from git@gitlab.com:drobilla/rerex 2420851

author: David Robillard <d@drobilla.net> 2020-12-19 22:07:41 +0100
committer: David Robillard <d@drobilla.net> 2021-03-08 23:34:56 -0500
commit: 00456f395e50b43a9f26ff1479b9ef8b06a4d219 (patch)
tree: 31c22b3bd12d8e1bb6597c53fade859f6ca37c79 /subprojects/rerex
parent: ec0edd399d3ef493c4ddb0e821a56b53a19dbc96 (diff)
download: serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.gz
serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.bz2
serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.zip
17 files changed, 2475 insertions, 0 deletions
diff --git a/subprojects/rerex/.clang-format b/subprojects/rerex/.clang-format
new file mode 100644
index 00000000..fcdbab81
--- /dev/null
+++ b/subprojects/rerex/.clang-format
@@ -0,0 +1,21 @@
+---
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: true
+AlignEscapedNewlinesLeft: true
+BasedOnStyle: Mozilla
+BraceWrapping:
+  AfterEnum: false
+  AfterExternBlock: false
+  AfterFunction: true
+  AfterStruct: false
+  AfterUnion: false
+BreakBeforeBraces: Custom
+Cpp11BracedListStyle: true
+IndentCaseLabels: false
+IndentPPDirectives: AfterHash
+KeepEmptyLinesAtTheStartOfBlocks: false
+SpacesInContainerLiterals: false
+StatementMacros:
+  - REREX_API
+  - REREX_CONST_API
+...
diff --git a/subprojects/rerex/.clang-tidy b/subprojects/rerex/.clang-tidy
new file mode 100644
index 00000000..3a0d4d70
--- /dev/null
+++ b/subprojects/rerex/.clang-tidy
@@ -0,0 +1,6 @@
+Checks: >
+  *,
+  -llvmlibc*,
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+FormatStyle: file
diff --git a/subprojects/rerex/.gitignore b/subprojects/rerex/.gitignore
new file mode 100644
index 00000000..84c048a7
--- /dev/null
+++ b/subprojects/rerex/.gitignore
@@ -0,0 +1 @@
+/build/
diff --git a/subprojects/rerex/.gitlab-ci.yml b/subprojects/rerex/.gitlab-ci.yml
new file mode 100644
index 00000000..5e7cb7bb
--- /dev/null
+++ b/subprojects/rerex/.gitlab-ci.yml
@@ -0,0 +1,165 @@
+stages:
+  - build
+  - deploy
+
+.build_template: &build_definition
+  stage: build
+
+
+arm32_dbg:
+  <<: *build_definition
+  image: lv2plugin/debian-arm32
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/arm-linux-gnueabihf.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+arm32_rel:
+  <<: *build_definition
+  image: lv2plugin/debian-arm32
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/arm-linux-gnueabihf.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+arm64_dbg:
+  <<: *build_definition
+  image: lv2plugin/debian-arm64
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/aarch64-linux-gnu.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+arm64_rel:
+  <<: *build_definition
+  image: lv2plugin/debian-arm64
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/aarch64-linux-gnu.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+x32_dbg:
+  <<: *build_definition
+  image: lv2plugin/debian-x32
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/i686-linux-gnu.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true
+    - ninja -C build test
+
+x32_rel:
+  <<: *build_definition
+  image: lv2plugin/debian-x32
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/i686-linux-gnu.ini -Dbuildtype=release -Dstrict=true -Dwerror=true
+    - ninja -C build test
+
+
+x64_dbg:
+  <<: *build_definition
+  image: lv2plugin/debian-x64
+  script:
+    - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true -Db_coverage=true
+    - ninja -C build test
+    - ninja -C build coverage-html
+  artifacts:
+    paths:
+      - build/meson-logs/coveragereport
+
+x64_rel:
+  <<: *build_definition
+  image: lv2plugin/debian-x64
+  script:
+    - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+x64_static:
+  <<: *build_definition
+  image: lv2plugin/debian-x64
+  script:
+    - meson . build -Ddefault_library=static -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+x64_sanitize:
+  <<: *build_definition
+  image: lv2plugin/debian-x64-clang
+  script:
+    - meson . build -Db_lundef=false -Dbuildtype=plain -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+  variables:
+    CC: "clang"
+    CFLAGS: "-fno-sanitize-recover=all -fsanitize=address -fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=unsigned-integer-overflow -fsanitize=implicit-conversion -fsanitize=local-bounds -fsanitize=nullability"
+    LDFLAGS: "-fno-sanitize-recover=all -fsanitize=address -fsanitize=undefined -fsanitize=float-divide-by-zero -fsanitize=unsigned-integer-overflow -fsanitize=implicit-conversion -fsanitize=local-bounds -fsanitize=nullability"
+
+
+mingw32_dbg:
+  <<: *build_definition
+  image: lv2plugin/debian-mingw32
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/i686-w64-mingw32.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+mingw32_rel:
+  <<: *build_definition
+  image: lv2plugin/debian-mingw32
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/i686-w64-mingw32.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+mingw64_dbg:
+  <<: *build_definition
+  image: lv2plugin/debian-mingw64
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/x86_64-w64-mingw32.ini -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+mingw64_rel:
+  <<: *build_definition
+  image: lv2plugin/debian-mingw64
+  script:
+    - meson . build --cross-file=/usr/share/meson/cross/x86_64-w64-mingw32.ini -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+mac_dbg:
+  <<: *build_definition
+  tags: [macos]
+  script:
+    - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+mac_rel:
+  <<: *build_definition
+  tags: [macos]
+  script:
+    - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+win_dbg:
+  <<: *build_definition
+  tags: [windows,meson]
+  script:
+    - meson . build -Dbuildtype=debug -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+win_rel:
+  <<: *build_definition
+  tags: [windows,meson]
+  script:
+    - meson . build -Dbuildtype=release -Dstrict=true -Dwerror=true -Dtests=true
+    - ninja -C build test
+
+
+pages:
+  stage: deploy
+  script:
+    - mkdir -p .public
+    - mv build/meson-logs/coveragereport/ .public/coverage
+    - mv .public public
+  dependencies:
+    - x64_dbg
+  artifacts:
+    paths:
+      - public
+  only:
+    - master
diff --git a/subprojects/rerex/CONTRIBUTING.md b/subprojects/rerex/CONTRIBUTING.md
new file mode 100644
index 00000000..c942308a
--- /dev/null
+++ b/subprojects/rerex/CONTRIBUTING.md
@@ -0,0 +1,32 @@
+Contributing to Rerex
+=====================
+
+Feel free to tinker with Rerex, I tried to make the code as easy to follow as I
+could while preserving the minimalist spirit of it.  Implementing character
+sets and escapes and so on have made things a little bit hairier than a
+textbook example, but hopefully it is still easy to digest.
+
+If you want to make a contribution, feel free to send patches, but note that I
+would like to preserve the cleanliness avoid bloating things too much, so I may
+not accept patches for features that are too complicated.  Bug fixes or other
+improvements to the code are, of course, most welcome.
+
+That said, this is, of course, free software, so you're free to take it and do
+as you please.  I'd be happy to chat about it, these are just guidelines for
+what I feel would be appropriate for inclusion in the project.
+
+Happy hacking.
+
+Tooling
+-------
+
+All of the code is machine formatted, the style is defined by the included
+`clang-format` configuration.  You can use `ninja clang-format` to format
+everything.
+
+Configuring with `-Dstrict=true` will enable the ultra-strict warnings that are
+run on CI (nearly all of them).  Configuring with `-Dwerror=true` will make
+them errors.
+
+A `clang-tidy` configuration is included for more intensive static checking,
+you can run all the checks with `ninja clang-tidy`.
diff --git a/subprojects/rerex/COPYING b/subprojects/rerex/COPYING
new file mode 100644
index 00000000..0ba77822
--- /dev/null
+++ b/subprojects/rerex/COPYING
@@ -0,0 +1,13 @@
+Copyright 2020 David Robillard <d@drobilla.net>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+\ No newline at end of file
diff --git a/subprojects/rerex/NEWS b/subprojects/rerex/NEWS
new file mode 100644
index 00000000..41f780a0
--- /dev/null
+++ b/subprojects/rerex/NEWS
@@ -0,0 +1,5 @@
+rerex (0.0.1) unstable;
+
+  * Initial release
+
+ -- David Robillard <d@drobilla.net>  Wed, 09 Dec 2020 18:01:50 +0000
diff --git a/subprojects/rerex/README.md b/subprojects/rerex/README.md
new file mode 100644
index 00000000..a2571cee
--- /dev/null
+++ b/subprojects/rerex/README.md
@@ -0,0 +1,158 @@
+Rerex
+=====
+
+Rerex (REally REgular eXpressions) is a simple and efficient NFA-based regular
+expression implementation in C99.
+
+Rerex parses a regular expression into an NFA, then simulates execution using
+Thompson's algorithm, so matching is linear in the size of the input.  For
+details on the theory behind this approach, see [Programming Techniques:
+Regular expression search algorithm], the classic paper by Ken Thompson.
+[Regular Expression Matching Can Be Simple And Fast] is also a good read, which
+compares this approach with those used in most modern regular expression
+implementations.  The matching algorithm in Rerex is effectively the same as
+the one described there, though with a slightly different implementation.
+Parsing and construction are completely different, using a recursive-descent
+parser which constructs the automata on the fly.
+
+This is a minimalist implementation with a limited feature set which is mainly
+suitable for basic applications like checking markup or programming language
+syntax.  It does, however, strive to be clear and high-quality code which is a
+bit more fully-featured than a simple example, and suitable for real-world use.
+For example, character set expressions are supported, and meaningful errors are
+reported with the offset of the problematic character.
+
+The generated automata are not always optimal, though there are a few simple
+optimizations that avoid generating redundant states.  Though there is room for
+improvement there (at the cost of additional complexity), matching performs
+relatively well.
+
+Implementation
+--------------
+
+The implementation is clean and warning-free C99, tested and known to work with
+at least clang, GCC, and MSVC on POSIX, MacOS, and Windows.  It weighs in at
+under 600 [SLOC][], which can be compiled to about 5k on x86.
+
+Additionally, a simple test suite is included which covers 100% of the code.
+It is run continuously on all of the above mentioned platforms on x64, as well
+as in Linux on 32 and 64 bit ARM via emulation.
+
+Supported Grammar
+-----------------
+
+Rerex supports a basic subset of classic regular expression syntax that is
+supported by almost all implementations.  The grammar is written here in [XML
+EBNF notation][] with terminals in uppercase:
+
+    DOT       ::= '.'
+    OPERATOR  ::= '*' | '+' | '?'
+    SPECIAL   ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}'
+    ESCAPABLE ::= SPECIAL | '-'
+    ESCAPE    ::= '\' SPECIAL
+    CHAR      ::= ESCAPE | [#x20-#x7E] - SPECIAL
+    ELEMENT   ::= ([#x20-#x7E] - ']') | ('\' ']')
+    Range     ::= ELEMENT | ELEMENT '-' ELEMENT
+    Set       ::= Range | Range Set
+    Atom      ::= CHAR | DOT | '(' Expr ')' | '[' Set ']'
+    Factor    ::= Atom | Atom OPERATOR
+    Term      ::= Factor | Factor Term
+    Expr      ::= Term | Term '|' Expr
+
+Notable Limitations
+-------------------
+
+Rerex is not meant to replace fully-featured regular expression libraries, so
+the feature set is somewhat limited.  The most glaring omissions include:
+
+  - Only supports printable ASCII input
+  - Only supports anchored matching (no back references or group extraction)
+  - Only reads from a null-terminated string (not, for example, files)
+  - No support for counted replication with `{}`
+
+Should I Use This?
+------------------
+
+If you just want to understand how regular expressions work, and have some
+clean code that's easy to understand and tinker with, sure.
+
+If you need a minimal regular expression matching implementation in portable C
+for basic ASCII tasks, maybe.
+
+If you need a fully-featured regular expression implementation for
+international text with group extraction and other more advanced features,
+probably not.
+
+In any case, if you do, letting me know or linking to this page would be
+appreciated.
+
+Dependencies
+------------
+
+None, except the C standard library.
+
+Building
+--------
+
+Rerex consists only of a single header and configuration-free source file, so
+building it with any compiler manually should be straightforward.  It should,
+in theory, work just about anywhere.
+
+A [Meson][] build definition is included which can be used to do a proper
+system installation with a `pkg-config` file, generate IDE projects, run the
+tests, and so on.  For example, the library and tests can be built and run like
+so:
+
+    meson setup -Dtests=true build
+    cd build
+    ninja test
+
+See the [Meson documentation][] for more details on using Meson.
+
+Usage
+-----
+
+Rerex has a small API based around two objects: a _pattern_, which is an
+immutable compiled representation of a regular expression, and a _matcher_
+which can be used to match strings against a pattern.  See [the
+header](include/rerex/rerex.h) for details, there is no external documentation
+aside from this README.
+
+This simple example function uses everything in the API to provide a high-level
+string-based match function:
+
+```c
+bool matches(const char* regexp, const char* string)
+{
+    RerexPattern* pattern = NULL;
+    size_t        end     = 0;
+    RerexStatus   st      = rerex_compile(regexp, &end, &pattern);
+
+    if (st) {
+        fprintf(stderr, "(string):%zu: error: %s\n", end, rerex_strerror(st));
+        return false;
+    }
+
+    RerexMatcher* matcher = rerex_new_matcher(pattern);
+    bool          matches = rerex_match(matcher, string);
+
+    rerex_free_matcher(matcher);
+    rerex_free_pattern(pattern);
+
+    return matches;
+}
+```
+
+ -- David Robillard <d@drobilla.net>
+
+[XML EBNF notation]: http://www.w3.org/TR/REC-xml/#sec-notation
+
+[SLOC]: https://en.wikipedia.org/wiki/Source_lines_of_code
+
+[Meson]: https://mesonbuild.com/
+
+[Meson documentation]: https://mesonbuild.com/Quick-guide.html
+
+[Programming Techniques: Regular expression search algorithm]: https://dl.acm.org/doi/10.1145/363347.363387
+
+[Regular Expression Matching Can Be Simple And Fast]: https://swtch.com/~rsc/regexp/regexp1.html
diff --git a/subprojects/rerex/include/rerex/rerex.h b/subprojects/rerex/include/rerex/rerex.h
new file mode 100644
index 00000000..507a3a96
--- /dev/null
+++ b/subprojects/rerex/include/rerex/rerex.h
@@ -0,0 +1,112 @@
+/*
+  Copyright 2020 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef REREX_REREX_H
+#define REREX_REREX_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#if defined(_WIN32) && !defined(REREX_STATIC) && defined(REREX_INTERNAL)
+#  define REREX_API __declspec(dllexport)
+#elif defined(_WIN32) && !defined(REREX_STATIC)
+#  define REREX_API __declspec(dllimport)
+#elif defined(__GNUC__)
+#  define REREX_API __attribute__((visibility("default")))
+#else
+#  define REREX_API
+#endif
+
+#if defined(__GNUC__)
+#  define REREX_CONST_API __attribute__((const)) REREX_API
+#else
+#  define REREX_CONST_API REREX_API
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// Return status code
+typedef enum {
+  REREX_SUCCESS,
+  REREX_EXPECTED_CHAR,
+  REREX_EXPECTED_ELEMENT,
+  REREX_EXPECTED_RBRACKET,
+  REREX_EXPECTED_RPAREN,
+  REREX_EXPECTED_SPECIAL,
+  REREX_UNEXPECTED_SPECIAL,
+  REREX_UNEXPECTED_END,
+  REREX_UNORDERED_RANGE,
+} RerexStatus;
+
+/// Pattern that represents a compiled valid regular expression
+typedef struct RerexPatternImpl RerexPattern;
+
+/// Matcher that can be used to match strings against a pattern
+typedef struct RerexMatcherImpl RerexMatcher;
+
+/// Return a human-readable description of `status`
+REREX_CONST_API
+const char*
+rerex_strerror(RerexStatus status);
+
+/**
+   Build a regular expression from a pattern string.
+
+   The `pattern` must be a null-terminated string in supported regular
+   expression syntax.  Upon return, `end` is set to the offset of the last
+   character processed.  On success, `REREX_SUCCESS` is returned, and `out` is
+   pointed to a newly allocated pattern which must be freed with
+   rerex_free_pattern().  On error, `out` is unchanged, an error code is
+   returned, and `end` can be used to determine the position of the error in
+   the pattern.
+*/
+REREX_API
+RerexStatus
+rerex_compile(const char* pattern, size_t* end, RerexPattern** out);
+
+/**
+   Allocate a new matcher for matching against a pattern.
+
+   The returned matcher can be used to match several strings against a single
+   pattern using rerex_match().  It is newly allocated and must be freed with
+   rerex_free_matcher().
+*/
+REREX_API
+RerexMatcher*
+rerex_new_matcher(const RerexPattern* regexp);
+
+/// Return true if `string` matches the pattern of `matcher`
+REREX_API
+bool
+rerex_match(RerexMatcher* matcher, const char* string);
+
+/// Free a matcher allocated with rerex_new_matcher()
+REREX_API
+void
+rerex_free_matcher(RerexMatcher* matcher);
+
+/// Free a pattern allocated with rerex_compile()
+REREX_API
+void
+rerex_free_pattern(RerexPattern* expression);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // REREX_REREX_H
diff --git a/subprojects/rerex/meson.build b/subprojects/rerex/meson.build
new file mode 100644
index 00000000..af64fac9
--- /dev/null
+++ b/subprojects/rerex/meson.build
@@ -0,0 +1,121 @@
+project('rerex', ['c'],
+        version: '0.0.1',
+        license: 'ISC',
+        meson_version: '>= 0.49.0',
+        default_options: [
+          'b_ndebug=if-release',
+          'buildtype=release',
+          'c_std=c99',
+          'default_library=shared',
+          'warning_level=3',
+        ])
+
+major_version = meson.project_version().split('.')[0]
+version_suffix = '-@0@'.format(major_version)
+versioned_name = 'rerex' + version_suffix
+
+# Load build tools
+pkg = import('pkgconfig')
+cc = meson.get_compiler('c')
+
+# Set ultra strict warnings for developers, if requested
+c_warnings = []
+c_suppressions = []
+if get_option('strict')
+  if not meson.is_subproject()
+    subdir('meson')
+    add_project_arguments(all_c_warnings, language: ['c'])
+  endif
+  
+  if cc.get_id() == 'gcc'
+    c_suppressions = [
+      '-Wno-aggregate-return',
+      '-Wno-switch-default',
+    ]
+  elif cc.get_id() == 'msvc'
+    c_suppressions = [
+      '/wd4706',  # assignment within conditional expression
+      '/wd4711',  # function selected for automatic inline expansion
+      '/wd5045',  # will insert Spectre mitigation
+    ]
+  else
+    c_suppressions = []
+  endif
+endif
+
+rerex_c_args = cc.get_supported_arguments(c_suppressions)
+
+if cc.get_id() == 'msvc'
+  # Build as C++
+  add_project_arguments(['/TP'], language: ['c'])
+
+  # Suppress warnings in system headers
+  add_project_arguments(['/experimental:external',
+                         '/external:W0',
+                         '/external:anglebrackets'],
+                        language: ['c'])
+endif
+
+# Determine library type and the flags needed to build it
+if get_option('default_library') == 'both'
+  if host_machine.system() == 'windows'
+    error('default_library=both is not supported on Windows')
+  endif
+
+  library_type = 'both_libraries'
+  library_args = ['-DREREX_INTERNAL']
+elif get_option('default_library') == 'shared'
+  library_type = 'shared_library'
+  library_args = ['-DREREX_INTERNAL']
+else
+  library_type = 'static_library'
+  library_args = ['-DREREX_INTERNAL', '-DREREX_STATIC']
+endif
+
+# Build shared and/or static library/libraries
+librerex = build_target(
+  versioned_name,
+  ['src/rerex.c'],
+  version: meson.project_version(),
+  include_directories: include_directories(['include']),
+  c_args: rerex_c_args + library_args,
+  gnu_symbol_visibility: 'hidden',
+  install: true,
+  target_type: library_type)
+
+# Generage pkg-config file
+pkg.generate(
+  librerex,
+  name: 'Rerex',
+  filebase: versioned_name,
+  subdirs: [versioned_name],
+  version: meson.project_version(),
+  description: 'A simple and efficient regular expression implementation')
+
+# Install header to a versioned include directory
+install_headers(['include/rerex/rerex.h'], subdir: versioned_name / 'rerex')
+
+rerex_dep = declare_dependency(
+  link_with: librerex,
+  include_directories: include_directories(['include']))
+
+# Build and run tests
+if get_option('tests')
+  foreach name : ['syntax', 'match', 'xsd']
+    full_name = 'test_@0@'.format(name)
+    test(full_name,
+         executable(full_name,
+                    'test/@0@.c'.format(full_name),
+                    c_args: rerex_c_args,
+                    include_directories: include_directories(['include']),
+                    dependencies: rerex_dep))
+  endforeach
+endif
+
+if meson.version().version_compare('>=0.53.0')
+  summary('Tests', get_option('tests'), bool_yn: true)
+
+  summary('Install prefix', get_option('prefix'))
+  summary('Headers', get_option('prefix') / get_option('includedir'))
+  summary('Libraries', get_option('prefix') / get_option('libdir'))
+endif
diff --git a/subprojects/rerex/meson/meson.build b/subprojects/rerex/meson/meson.build
new file mode 100644
index 00000000..5dd0de38
--- /dev/null
+++ b/subprojects/rerex/meson/meson.build
@@ -0,0 +1,204 @@
+# General code to enable approximately all warnings.
+#
+# This is trivial for clang and MSVC, but GCC does not have such an option, and
+# has several esoteric warnings, so we need to enable everything we want
+# explicitly.  We enable everything that does not require a value argument,
+# except for warnings that are only relevant for very old languages (earlier
+# than C99 or C++11) or non-standard extensions.
+#
+# Omitted common warnings:
+#
+# Wabi=
+# Waggregate-return
+# Walloc-size-larger-than=BYTES
+# Walloca-larger-than=BYTES
+# Wframe-larger-than=BYTES
+# Wlarger-than=BYTES
+# Wstack-usage=BYTES
+# Wsystem-headers
+# Wtraditional
+# Wtraditional-conversion
+# Wtrampolines
+# Wvla-larger-than=BYTES
+#
+# Omitted C warnings:
+#
+# Wc90-c99-compat
+# Wdeclaration-after-statement
+# Wtraditional
+# Wtraditional-conversion
+#
+# Omitted C++ warnings:
+#
+# Wnamespaces
+# Wtemplates
+
+gcc_common_warnings = [
+  '-Walloc-zero',
+  '-Walloca',
+  '-Wanalyzer-too-complex',
+  '-Warith-conversion',
+  '-Warray-bounds=2',
+  '-Wattribute-alias=2',
+  '-Wcast-align=strict',
+  '-Wcast-qual',
+  '-Wconversion',
+  '-Wdate-time',
+  '-Wdisabled-optimization',
+  '-Wdouble-promotion',
+  '-Wduplicated-branches',
+  '-Wduplicated-cond',
+  '-Wfloat-equal',
+  '-Wformat-overflow=2',
+  '-Wformat-signedness',
+  '-Wformat-truncation=2',
+  '-Wformat=2',
+  '-Wimplicit-fallthrough=2',
+  '-Winit-self',
+  '-Winline',
+  '-Winvalid-pch',
+  '-Wlogical-op',
+  '-Wmissing-declarations',
+  '-Wmissing-include-dirs',
+  '-Wmultichar',
+  '-Wnormalized=nfc',
+  '-Wnull-dereference',
+  '-Wpacked',
+  '-Wpadded',
+  '-Wredundant-decls',
+  '-Wscalar-storage-order',
+  '-Wshadow',
+  '-Wshift-overflow=2',
+  '-Wsizeof-array-argument',
+  '-Wstack-protector',
+  '-Wstrict-aliasing=3',
+  '-Wstrict-overflow=5',
+  '-Wstringop-overflow=3',
+  '-Wsuggest-attribute=cold',
+  '-Wsuggest-attribute=const',
+  '-Wsuggest-attribute=format',
+  '-Wsuggest-attribute=malloc',
+  '-Wsuggest-attribute=noreturn',
+  '-Wsuggest-attribute=pure',
+  '-Wswitch-default',
+  '-Wswitch-enum',
+  '-Wsync-nand',
+  '-Wundef',
+  '-Wunused-const-variable=2',
+  '-Wunused-macros',
+  '-Wvarargs',
+  '-Wvector-operation-performance',
+  '-Wvla',
+  '-Wwrite-strings',
+]
+
+gcc_c_warnings = [
+  '-Wbad-function-cast',
+  '-Wc++-compat',
+  '-Wc99-c11-compat',
+  '-Wdesignated-init',
+  '-Wdiscarded-array-qualifiers',
+  '-Wdiscarded-qualifiers',
+  '-Wincompatible-pointer-types',
+  '-Wjump-misses-init',
+  '-Wmissing-prototypes',
+  '-Wnested-externs',
+  '-Wold-style-definition',
+  '-Wstrict-prototypes',
+  '-Wunsuffixed-float-constants',
+]
+
+# Set all_c_warnings for the current C compiler
+if is_variable('cc') and not is_variable('all_c_warnings')
+  if cc.get_id() == 'clang'
+    all_c_warnings = ['-Weverything']
+  elif cc.get_id() == 'gcc'
+    all_c_warnings = gcc_common_warnings + [
+      '-Wbad-function-cast',
+      '-Wc++-compat',
+      '-Wc99-c11-compat',
+      '-Wdesignated-init',
+      '-Wdiscarded-array-qualifiers',
+      '-Wdiscarded-qualifiers',
+      '-Wincompatible-pointer-types',
+      '-Wjump-misses-init',
+      '-Wmissing-prototypes',
+      '-Wnested-externs',
+      '-Wold-style-definition',
+      '-Wstrict-prototypes',
+      '-Wunsuffixed-float-constants',
+    ]
+  elif cc.get_id() == 'msvc'
+    all_c_warnings = ['/Wall']
+  else
+    all_c_warnings = []
+  endif
+
+  all_c_warnings = cc.get_supported_arguments(all_c_warnings)
+  
+endif
+
+# Set all_cpp_warnings for the current C++ compiler
+if is_variable('cpp') and not is_variable('all_cpp_warnings')
+  if cpp.get_id() == 'clang'
+    all_cpp_warnings = [
+      '-Weverything',
+      '-Wno-c++98-compat',
+      '-Wno-c++98-compat-pedantic'
+    ]
+  elif cpp.get_id() == 'gcc'
+    all_cpp_warnings = gcc_common_warnings + [
+      '-Wabi-tag',
+      '-Waligned-new=all',
+      '-Wcatch-value=3',
+      '-Wcomma-subscript',
+      '-Wconditionally-supported',
+      '-Wctor-dtor-privacy',
+      '-Wdeprecated-copy-dtor',
+      '-Weffc++',
+      '-Wextra-semi',
+      '-Wmismatched-tags',
+      '-Wmultiple-inheritance',
+      '-Wnoexcept',
+      '-Wnoexcept-type',
+      '-Wnon-virtual-dtor',
+      '-Wold-style-cast',
+      '-Woverloaded-virtual',
+      '-Wplacement-new=2',
+      '-Wredundant-tags',
+      '-Wregister',
+      '-Wsign-promo',
+      '-Wstrict-null-sentinel',
+      '-Wsuggest-final-methods',
+      '-Wsuggest-final-types',
+      '-Wsuggest-override',
+      '-Wuseless-cast',
+      '-Wvirtual-inheritance',
+      '-Wvolatile',
+      '-Wzero-as-null-pointer-constant',
+    ]
+  elif cpp.get_id() == 'msvc'
+    all_cpp_warnings = ['/Wall']
+  else
+    all_cpp_warnings = []
+  endif
+
+  all_cpp_warnings = cpp.get_supported_arguments(all_cpp_warnings)
+  
+endif
+
+# Set all_objc_warnings for the current Objective C compiler
+if is_variable('objcc') and not is_variable('all_objc_warnings')
+  all_objc_warnings = []
+  if objcc.get_id() == 'clang'
+    all_objc_warnings = ['-Weverything']
+  elif objc.get_id() == 'gcc'
+    all_objc_warnings = gcc_common_warnings + [
+      '-Wno-direct-ivar-access',
+    ]
+  else
+    all_objc_warnings = []
+  endif
+
+  all_objc_warnings = objcc.get_supported_arguments(all_objc_warnings)
+endif
diff --git a/subprojects/rerex/meson_options.txt b/subprojects/rerex/meson_options.txt
new file mode 100644
index 00000000..f2b59781
--- /dev/null
+++ b/subprojects/rerex/meson_options.txt
@@ -0,0 +1,5 @@
+option('strict', type: 'boolean', value: false,
+       description: 'Enable ultra-strict warnings')
+
+option('tests', type: 'boolean', value: true,
+       description: 'Build tests')
diff --git a/subprojects/rerex/src/.clang-tidy b/subprojects/rerex/src/.clang-tidy
new file mode 100644
index 00000000..420ce833
--- /dev/null
+++ b/subprojects/rerex/src/.clang-tidy
@@ -0,0 +1,8 @@
+Checks: >
+  *,
+  -hicpp-multiway-paths-covered,
+  -llvmlibc-*,
+  -misc-no-recursion,
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+FormatStyle: file
diff --git a/subprojects/rerex/src/rerex.c b/subprojects/rerex/src/rerex.c
new file mode 100644
index 00000000..49970a45
--- /dev/null
+++ b/subprojects/rerex/src/rerex.c
@@ -0,0 +1,765 @@
+/*
+  Copyright 2020 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+static const char cmin = 0x20; // Inclusive minimum normal character
+static const char cmax = 0x7E; // Inclusive maximum normal character
+
+const char*
+rerex_strerror(const RerexStatus status)
+{
+  switch (status) {
+  case REREX_SUCCESS:
+    return "Success";
+  case REREX_EXPECTED_CHAR:
+    return "Expected a regular character";
+  case REREX_EXPECTED_ELEMENT:
+    return "Expected a character in a set";
+  case REREX_EXPECTED_RBRACKET:
+    return "Expected ']'";
+  case REREX_EXPECTED_RPAREN:
+    return "Expected ')'";
+  case REREX_EXPECTED_SPECIAL:
+    return "Expected a special character (one of \"()*+-?[]^|\")";
+  case REREX_UNEXPECTED_SPECIAL:
+    return "Unexpected special character";
+  case REREX_UNEXPECTED_END:
+    return "Unexpected end of input";
+  case REREX_UNORDERED_RANGE:
+    return "Range is out of order";
+  }
+
+  return "Unknown error";
+}
+
+/* State */
+
+#define NO_STATE 0
+
+// The ID for a state, which is an index into the state array
+typedef size_t StateIndex;
+
+// A code point (currently only 8-bit ASCII but we use the space anyway)
+typedef int Codepoint;
+
+// Special type for states with only epsilon out arcs
+typedef enum {
+  REREX_MATCH = 0xE000, ///< Matching state, no out arcs
+  REREX_SPLIT = 0xE001, ///< Splitting state, one or two out arcs
+} StateType;
+
+/* A state in an NFA.
+
+   A state in Thompson's NFA can have either a single character-labeled
+   transition to another state, or up to two unlabeled epsilon transitions to
+   other states.  There is both a minimum and maximum label for supporting
+   character ranges.  So, either `min` and `max` are ASCII characters that are
+   the label of an arc to next1 (and next2 is null), or `min` is a special
+   StateType and next1 and/or next2 may be set to successor states.
+*/
+typedef struct {
+  StateIndex next1; ///< Head of first out arc (or NULL)
+  StateIndex next2; ///< Head of second out arc (or NULL)
+  Codepoint  min;   ///< Special type, or inclusive min label for next1
+  Codepoint  max;   ///< Inclusive max label for next2
+} State;
+
+// Create a match (end) state with no successors
+static State
+match_state(void)
+{
+  const State s = {NO_STATE, NO_STATE, REREX_MATCH, 0};
+  return s;
+}
+
+// Create a split state with at most two successors
+static State
+split_state(const StateIndex next1, const StateIndex next2)
+{
+  const State s = {next1, next2, REREX_SPLIT, 0};
+  return s;
+}
+
+// Create a labeled state with one successor reached by a character arc
+static State
+range_state(const char min, const char max, const StateIndex next)
+{
+  const State s = {next, NO_STATE, min, max};
+  return s;
+}
+
+/* Array of states.
+
+   States are stored in a flat array to reduce memory fragmentation, and for
+   easy memory management since the automata graph may be cyclic.  This simple
+   implementation calls realloc() for every state, which isn't terribly
+   efficient, but works well enough.  Note that state addresses therefore change
+   during compilation, so states are generally referred to by their index, and
+   not by pointer.  Conveniently, using indices is also useful during matching
+   for storing auxiliary information about states.
+*/
+typedef struct {
+  State* states;
+  size_t n_states;
+} StateArray;
+
+// Append a new state to the end of the state array
+static StateIndex
+add_state(StateArray* const array, const State state)
+{
+  const size_t new_n_states = array->n_states + 1;
+  const size_t new_size     = new_n_states * sizeof(State);
+
+  array->states                  = (State*)realloc(array->states, new_size);
+  array->states[array->n_states] = state;
+
+  return array->n_states++;
+}
+
+/* Automata.
+ *
+ * This is a lightweiht description of an NFA fragment.  The states are stored
+ * elsewhere, this is just used to provide a facade to conceptually work with
+ * automata for high-level operations.
+ */
+typedef struct {
+  StateIndex start;
+  StateIndex end;
+} Automata;
+
+// Simple utility function for making an automata in an expression
+static Automata
+make_automata(const StateIndex start, const StateIndex end)
+{
+  Automata result = {start, end};
+  return result;
+}
+
+// Return whether `nfa` has only two simple states (used for optimizations)
+static inline bool
+is_trivial(const StateArray* const states, const Automata nfa)
+{
+  return (states->states[nfa.start].min < REREX_MATCH &&
+          states->states[nfa.start].next1 == nfa.end);
+}
+
+// Kleene's Star of an NFA
+static Automata
+star(StateArray* const states, const Automata nfa)
+{
+  const StateIndex end   = add_state(states, match_state());
+  const StateIndex start = add_state(states, split_state(nfa.start, end));
+
+  states->states[nfa.end] = split_state(nfa.start, end);
+
+  return make_automata(start, end);
+}
+
+// Zero-or-one of an NFA
+static Automata
+question(StateArray* const states, const Automata nfa)
+{
+  const StateIndex start = add_state(states, split_state(nfa.start, nfa.end));
+
+  return make_automata(start, nfa.end);
+}
+
+// One-or-more of an NFA
+static Automata
+plus(StateArray* const states, const Automata nfa)
+{
+  const StateIndex end = add_state(states, match_state());
+
+  states->states[nfa.end] = split_state(nfa.start, end);
+
+  return make_automata(nfa.start, end);
+}
+
+// Concatenation of two NFAs
+static Automata
+concatenate(StateArray* const states, const Automata a, const Automata b)
+{
+  if (is_trivial(states, a)) {
+    // Optimization: link a's start directly to b's start (drop a's end)
+    states->states[a.start].next1 = b.start;
+  } else {
+    states->states[a.end] = split_state(b.start, NO_STATE);
+  }
+
+  return make_automata(a.start, b.end);
+}
+
+// Alternation (OR) of two NFAs
+static Automata
+alternate(StateArray* const states, const Automata a, const Automata b)
+{
+  const StateIndex split = add_state(states, split_state(a.start, b.start));
+
+  if (is_trivial(states, a)) {
+    // Optimization: link a's start directly to b's end (drop a's end)
+    states->states[a.start].next1 = b.end;
+    return make_automata(split, b.end);
+  }
+
+  if (is_trivial(states, b)) {
+    // Optimization: link b's start directly to a's end (drop b's end)
+    states->states[b.start].next1 = a.end;
+    return make_automata(split, a.end);
+  }
+
+  const StateIndex end = add_state(states, match_state());
+
+  states->states[a.end] = split_state(end, NO_STATE);
+  states->states[b.end] = split_state(end, NO_STATE);
+
+  return make_automata(split, end);
+}
+
+/* Parser input.
+ *
+ * The parser reads from a string one character at a time, though it would be
+ * simple to change this to read from any stream.  All reading is done by three
+ * operations: peek, peekahead, and eat.
+ */
+typedef struct {
+  const char* const str;
+  size_t            offset;
+} Input;
+
+// Return the next character in the input without consuming it
+static inline char
+peek(Input* const input)
+{
+  return input->str[input->offset];
+}
+
+// Return the next-next character in the input without consuming any
+static inline char
+peekahead(Input* const input)
+{
+  // Unfortunately we need 2-char lookahead for the ambiguity of '-' in sets
+  return input->str[input->offset + 1];
+}
+
+// Consume and return the next character in the input
+static inline char
+eat(Input* const input)
+{
+  return input->str[input->offset++];
+}
+
+// Forward declaration for read_expr because it is called recursively
+static RerexStatus
+read_expr(Input* input, StateArray* states, Automata* out);
+
+// DOT      ::= '.'
+// OPERATOR ::= '*' | '+' | '?'
+// SPECIAL  ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}'
+static bool
+is_special(const char c)
+{
+  switch (c) {
+  case '(':
+  case ')':
+  case '*':
+  case '+':
+  case '.':
+  case '?':
+  case '[':
+  case ']':
+  case '^':
+  case '{':
+  case '|':
+  case '}':
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// DOT ::= '.'
+static RerexStatus
+read_dot(Input* const input, StateArray* const states, Automata* const out)
+{
+  assert(peek(input) == '.');
+  eat(input);
+
+  const StateIndex end   = add_state(states, match_state());
+  const StateIndex start = add_state(states, range_state(cmin, cmax, end));
+
+  *out = make_automata(start, end);
+
+  return REREX_SUCCESS;
+}
+
+// ESCAPE ::= '\' SPECIAL
+static RerexStatus
+read_escape(Input* const input, char* const out)
+{
+  assert(peek(input) == '\\');
+  eat(input);
+
+  const char c = peek(input);
+  if (is_special(c) || c == '-') {
+    *out = eat(input);
+    return REREX_SUCCESS;
+  }
+
+  return REREX_EXPECTED_SPECIAL;
+}
+
+// CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL
+static RerexStatus
+read_char(Input* const input, char* const out)
+{
+  const char c = peek(input);
+
+  switch (c) {
+  case '\0':
+    return REREX_UNEXPECTED_END;
+  case '\\':
+    return read_escape(input, out);
+  default:
+    break;
+  }
+
+  if (is_special(c)) {
+    return REREX_UNEXPECTED_SPECIAL;
+  }
+
+  if (c >= cmin && c <= cmax) {
+    *out = eat(input);
+    return REREX_SUCCESS;
+  }
+
+  return REREX_EXPECTED_CHAR;
+}
+
+// ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']')
+static RerexStatus
+read_element(Input* const input, char* const out)
+{
+  const char c = peek(input);
+
+  switch (c) {
+  case '\0':
+    return REREX_UNEXPECTED_END;
+  case ']':
+    return REREX_UNEXPECTED_SPECIAL;
+  case '\\':
+    eat(input);
+    if (peek(input) != ']') {
+      return REREX_EXPECTED_RBRACKET;
+    }
+
+    *out = eat(input);
+    return REREX_SUCCESS;
+  default:
+    break;
+  }
+
+  if (c >= cmin && c <= cmax) {
+    *out = eat(input);
+    return REREX_SUCCESS;
+  }
+
+  return REREX_EXPECTED_ELEMENT;
+}
+
+// Range ::= ELEMENT | ELEMENT '-' ELEMENT
+static RerexStatus
+read_range(Input* const      input,
+           StateArray* const states,
+           const bool        negated,
+           Automata* const   out)
+{
+  RerexStatus st  = REREX_SUCCESS;
+  char        min = 0;
+  char        max = 0;
+
+  // Read the first (or only) character
+  if ((st = read_element(input, &min))) {
+    return st;
+  }
+
+  // Handle '-' which is a bit hairy because it may or may not be special
+  if (peek(input) == '-') {
+    switch (peekahead(input)) {
+    case ']':
+      // Weird case like [a-] where '-' is a character
+      max = min;
+      break;
+    default:
+      // Normal range like [a-z] (note that '[' isn't special here)
+      eat(input);
+      if ((st = read_element(input, &max))) {
+        return st;
+      }
+      break;
+    }
+  } else {
+    // Single character element
+    max = min;
+  }
+
+  if (max < min) {
+    return REREX_UNORDERED_RANGE;
+  }
+
+  const StateIndex end = add_state(states, match_state());
+  if (negated) {
+    const char       emin = (char)(min - 1);
+    const char       emax = (char)(max + 1);
+    const StateIndex low  = add_state(states, range_state(cmin, emin, end));
+    const StateIndex high = add_state(states, range_state(emax, cmax, end));
+    const StateIndex fork = add_state(states, split_state(low, high));
+
+    *out = make_automata(fork, end);
+  } else {
+    const StateIndex start = add_state(states, range_state(min, max, end));
+
+    *out = make_automata(start, end);
+  }
+
+  return st;
+}
+
+// Set ::= Range | Range Set
+static RerexStatus
+read_set(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st      = REREX_SUCCESS;
+  bool        negated = false;
+
+  if (peek(input) == '^') {
+    eat(input);
+    negated = true;
+  }
+
+  Automata nfa = {NO_STATE, NO_STATE};
+  if ((st = read_range(input, states, negated, &nfa))) {
+    return st;
+  }
+
+  while (peek(input) != ']') {
+    Automata range_nfa = {NO_STATE, NO_STATE};
+    if ((st = read_range(input, states, negated, &range_nfa))) {
+      return st;
+    }
+
+    nfa = alternate(states, nfa, range_nfa);
+  }
+
+  *out = nfa;
+  return st;
+}
+
+// Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']'
+static RerexStatus
+read_atom(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st = REREX_SUCCESS;
+
+  switch (peek(input)) {
+  case '(':
+    eat(input);
+    if ((st = read_expr(input, states, out))) {
+      return st;
+    }
+
+    if (peek(input) != ')') {
+      return REREX_EXPECTED_RPAREN;
+    }
+
+    eat(input);
+    return st;
+
+  case '.':
+    return read_dot(input, states, out);
+
+  case '[':
+    eat(input);
+    if ((st = read_set(input, states, out))) {
+      return st;
+    }
+
+    eat(input);
+    return st;
+
+  default:
+    break;
+  }
+
+  char c = 0;
+  if ((st = read_char(input, &c))) {
+    return st;
+  }
+
+  const StateIndex end   = add_state(states, match_state());
+  const StateIndex start = add_state(states, range_state(c, c, end));
+
+  *out = make_automata(start, end);
+
+  return st;
+}
+
+// OPERATOR ::= '*' | '+' | '?'
+// Factor   ::= Atom | Atom OPERATOR
+static RerexStatus
+read_factor(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st       = REREX_SUCCESS;
+  Automata    atom_nfa = {NO_STATE, NO_STATE};
+
+  if (!(st = read_atom(input, states, &atom_nfa))) {
+    const char c = peek(input);
+    switch (c) {
+    case '*':
+      eat(input);
+      *out = star(states, atom_nfa);
+      break;
+    case '+':
+      eat(input);
+      *out = plus(states, atom_nfa);
+      break;
+    case '?':
+      eat(input);
+      *out = question(states, atom_nfa);
+      break;
+    default:
+      *out = atom_nfa;
+      break;
+    }
+  }
+
+  return st;
+}
+
+// Term ::= Factor | Factor Term
+static RerexStatus
+read_term(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st         = REREX_SUCCESS;
+  Automata    factor_nfa = {NO_STATE, NO_STATE};
+  Automata    term_nfa   = {NO_STATE, NO_STATE};
+
+  if (!(st = read_factor(input, states, &factor_nfa))) {
+    switch (peek(input)) {
+    case '\0':
+    case ')':
+    case '|':
+      *out = factor_nfa;
+      break;
+    default:
+      if (!(st = read_term(input, states, &term_nfa))) {
+        *out = concatenate(states, factor_nfa, term_nfa);
+      }
+      break;
+    }
+  }
+
+  return st;
+}
+
+// Expr ::= Term | Term '|' Expr
+static RerexStatus
+read_expr(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st       = REREX_SUCCESS;
+  Automata    term_nfa = {NO_STATE, NO_STATE};
+  Automata    expr_nfa = {NO_STATE, NO_STATE};
+
+  if ((st = read_term(input, states, &term_nfa))) {
+    return st;
+  }
+
+  if (peek(input) == '|') {
+    eat(input);
+    if (!(st = read_expr(input, states, &expr_nfa))) {
+      *out = alternate(states, term_nfa, expr_nfa);
+    }
+  } else {
+    *out = term_nfa;
+  }
+
+  return st;
+}
+
+/* Pattern.
+ *
+ * A pattern is simply an array of states and an index to the start state.  The
+ * end state(s) are known because they have type REREX_MATCH.  A pattern is
+ * immutable after construction, the matcher does not modify it.
+ */
+struct RerexPatternImpl {
+  StateArray states;
+  StateIndex start;
+};
+
+void
+rerex_free_pattern(RerexPattern* const regexp)
+{
+  free(regexp->states.states);
+  free(regexp);
+}
+
+RerexStatus
+rerex_compile(const char* const    pattern,
+              size_t* const        end,
+              RerexPattern** const out)
+{
+  Input      input  = {pattern, 0};
+  Automata   nfa    = {NO_STATE, NO_STATE};
+  StateArray states = {NULL, 0};
+
+  // Add null state so that no actual state has NO_STATE as an ID
+  add_state(&states, split_state(NO_STATE, NO_STATE));
+
+  const RerexStatus st = read_expr(&input, &states, &nfa);
+  if (st) {
+    free(states.states);
+  } else {
+    *out           = (RerexPattern*)malloc(sizeof(RerexPattern));
+    (*out)->states = states;
+    (*out)->start  = nfa.start;
+  }
+
+  *end = input.offset;
+  return st;
+}
+
+/* Matcher */
+
+typedef struct {
+  StateIndex* indices;   // Array of state indices
+  size_t      n_indices; // Number of elements in indices
+} IndexList;
+
+/* Matcher.
+ *
+ * The matcher tracks active states by keeping two lists of indices: one for
+ * the current iteration, and one for the next.  A separate array, keyed by
+ * state index, stores the number of the last iteration the state was entered
+ * in.  This makes it simple and fast to check if a state has already been
+ * entered in the current iteration, avoiding the need to search the active
+ * list for every entered state.
+ */
+struct RerexMatcherImpl {
+  const RerexPattern* regexp;      // Pattern to match against
+  IndexList           active[2];   // Two lists of active states
+  size_t*             last_active; // Last iteration a state was active
+};
+
+RerexMatcher*
+rerex_new_matcher(const RerexPattern* const regexp)
+{
+  const size_t        n_states = regexp->states.n_states;
+  RerexMatcher* const m        = (RerexMatcher*)calloc(1, sizeof(RerexMatcher));
+
+  m->regexp            = regexp;
+  m->active[0].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+  m->active[1].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+  m->last_active       = (size_t*)calloc(n_states, sizeof(size_t));
+
+  return m;
+}
+
+void
+rerex_free_matcher(RerexMatcher* const matcher)
+{
+  free(matcher->last_active);
+  free(matcher->active[1].indices);
+  free(matcher->active[0].indices);
+  free(matcher);
+}
+
+// Add `s` and any epsilon successors to the active list
+static void
+enter_state(RerexMatcher* const matcher,
+            const size_t        step,
+            IndexList* const    list,
+            const StateIndex    s)
+{
+  const StateArray* const states = &matcher->regexp->states;
+
+  if (s && matcher->last_active[s] != step) {
+    matcher->last_active[s] = step;
+
+    const State* const state = &states->states[s];
+    if (state->min == REREX_SPLIT) {
+      enter_state(matcher, step, list, state->next1);
+      enter_state(matcher, step, list, state->next2);
+    } else {
+      list->indices[list->n_indices++] = s;
+    }
+  }
+}
+
+// Run `matcher` and return true if `string` matches the expression
+bool
+rerex_match(RerexMatcher* const matcher, const char* const string)
+{
+  const StateArray* const states = &matcher->regexp->states;
+
+  // Reset matcher to a consistent initial state
+  matcher->active[0].n_indices = 0;
+  matcher->active[1].n_indices = 0;
+  for (size_t i = 0; i < states->n_states; ++i) {
+    matcher->last_active[i] = SIZE_MAX;
+  }
+
+  // Enter start state
+  enter_state(matcher, 0, &matcher->active[0], matcher->regexp->start);
+
+  // Tick the matcher for every input character
+  bool phase = 0;
+  for (size_t i = 0; string[i]; ++i) {
+    const char       c         = string[i];
+    IndexList* const list      = &matcher->active[phase];
+    IndexList* const next_list = &matcher->active[!phase];
+
+    // Add successor states to the next iteration's list
+    next_list->n_indices = 0;
+    for (size_t j = 0; j < list->n_indices; ++j) {
+      const State* const state = &states->states[list->indices[j]];
+      if (state->min <= c && c <= state->max) {
+        enter_state(matcher, i + 1, next_list, state->next1);
+      }
+    }
+
+    // Flip phase to swap active lists
+    phase = !phase;
+  }
+
+  // Check if match state is entered in the end
+  IndexList* const active = &matcher->active[phase];
+  for (size_t i = 0; i < active->n_indices; ++i) {
+    if (states->states[active->indices[i]].min == REREX_MATCH) {
+      return true;
+    }
+  }
+
+  return false;
+}
diff --git a/subprojects/rerex/test/test_match.c b/subprojects/rerex/test/test_match.c
new file mode 100644
index 00000000..fa270e83
--- /dev/null
+++ b/subprojects/rerex/test/test_match.c
@@ -0,0 +1,238 @@
+/*
+  Copyright 2020 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#undef NDEBUG
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct {
+  uintptr_t   match;   ///< Boolean, true if text should match
+  const char* pattern; ///< Regular expression
+  const char* text;    ///< Text to match against pattern
+} MatchTestCase;
+
+static const MatchTestCase match_tests[] = {
+  {1, "\\(", "("},
+  {1, "\\)", ")"},
+  {1, "\\*", "*"},
+  {1, "\\+", "+"},
+  {1, "\\-", "-"},
+  {1, "\\.", "."},
+  {1, "\\?", "?"},
+  {1, "\\[", "["},
+  {1, "\\]", "]"},
+  {1, "\\^", "^"},
+  {1, "\\|", "|"},
+  {0, ".", ""},
+  {1, ".", "a"},
+  {0, ".", "aa"},
+  {0, "..", ""},
+  {0, "..", "a"},
+  {1, "..", "aa"},
+  {1, ".*", ""},
+  {1, ".*", "a"},
+  {1, ".*", "aa"},
+  {0, ".+", ""},
+  {1, ".+", "a"},
+  {1, ".+", "aa"},
+  {1, ".?", ""},
+  {1, ".?", "a"},
+  {0, ".?", "aa"},
+  {1, "a*", ""},
+  {1, "a*", "a"},
+  {1, "a*", "aa"},
+  {0, "a*", "b"},
+  {0, "a+", ""},
+  {1, "a+", "a"},
+  {1, "a+", "aa"},
+  {0, "a+", "b"},
+  {1, "a?", ""},
+  {1, "a?", "a"},
+  {0, "a?", "aa"},
+  {0, "a?", "b"},
+  {0, "[.]", "a"},
+  {1, "[.]", "."},
+  {0, "[\\]]", "a"},
+  {1, "[\\]]", "]"},
+  {0, "[b]", "a"},
+  {1, "[b]", "b"},
+  {0, "[b]", "c"},
+  {0, "[bc]", "a"},
+  {1, "[bc]", "b"},
+  {1, "[bc]", "c"},
+  {0, "[bc]", "d"},
+  {0, "[bcd]", "a"},
+  {1, "[bcd]", "b"},
+  {1, "[bcd]", "c"},
+  {1, "[bcd]", "d"},
+  {0, "[bcd]", "e"},
+  {0, "[b-d]", "a"},
+  {1, "[b-d]", "b"},
+  {1, "[b-d]", "d"},
+  {0, "[b-d]", "e"},
+  {1, "[^b-d]", "a"},
+  {0, "[^b-d]", "b"},
+  {0, "[^b-d]", "d"},
+  {1, "[^b-d]", "e"},
+  {0, "[^ -/]", "\t"},
+  {1, "[^ -/]", "0"},
+  {1, "[^{-~]", "z"},
+  {0, "[^{-~]", "~"},
+  {0, "[A-Za-z]", "5"},
+  {1, "[A-Za-z]", "m"},
+  {1, "[A-Za-z]", "M"},
+  {0, "[A-Za-z]", "~"},
+  {0, "[+-]", "*"},
+  {1, "[+-]", "+"},
+  {0, "[+-]", ","},
+  {1, "[+-]", "-"},
+  {0, "[+-]", "."},
+  {1, "[b-d]*", ""},
+  {0, "[b-d]*", "a"},
+  {1, "[b-d]*", "b"},
+  {1, "[b-d]*", "c"},
+  {1, "[b-d]*", "cc"},
+  {1, "[b-d]*", "d"},
+  {0, "[b-d]*", "e"},
+  {0, "[b-d]+", ""},
+  {0, "[b-d]+", "a"},
+  {1, "[b-d]+", "b"},
+  {1, "[b-d]+", "c"},
+  {1, "[b-d]+", "cc"},
+  {1, "[b-d]+", "d"},
+  {0, "[b-d]+", "e"},
+  {1, "[b-d]?", ""},
+  {0, "[b-d]?", "a"},
+  {1, "[b-d]?", "b"},
+  {1, "[b-d]?", "c"},
+  {0, "[b-d]?", "cc"},
+  {1, "[b-d]?", "d"},
+  {0, "[b-d]?", "e"},
+  {1, "h(e|a)llo", "hello"},
+  {1, "h(e|a)llo", "hallo"},
+  {1, "h(e|a)+llo", "haello"},
+  {1, "h(e|a)*llo", "hllo"},
+  {1, "h(e|a)?llo", "hllo"},
+  {1, "h(e|a)?llo", "hello"},
+  {1, "h(e|a)*llo*", "haeeeallooo"},
+  {1, "(ab|a)(bc|c)", "abc"},
+  {0, "(ab|a)(bc|c)", "acb"},
+  {1, "(ab)c|abc", "abc"},
+  {0, "(ab)c|abc", "ab"},
+  {1, "(a*)(b?)(b+)", "aaabbbb"},
+  {0, "(a*)(b?)(b+)", "aaaa"},
+  {1, "((a|a)|a)", "a"},
+  {0, "((a|a)|a)", "aa"},
+  {1, "(a*)(a|aa)", "aaaa"},
+  {0, "(a*)(a|aa)", "b"},
+  {1, "a(b)|c(d)|a(e)f", "aef"},
+  {0, "a(b)|c(d)|a(e)f", "adf"},
+  {1, "(a|b)c|a(b|c)", "ac"},
+  {0, "(a|b)c|a(b|c)", "acc"},
+  {1, "(a|b)c|a(b|c)", "ab"},
+  {0, "(a|b)c|a(b|c)", "acb"},
+  {1, "(a|b)*c|(a|ab)*c", "abc"},
+  {0, "(a|b)*c|(a|ab)*c", "bbbcabbbc"},
+  {1, "a?(ab|ba)ab", "abab"},
+  {0, "a?(ab|ba)ab", "aaabab"},
+  {1, "(aa|aaa)*|(a|aaaaa)", "aa"},
+  {1, "(a)(b)(c)", "abc"},
+  {1, "((((((((((x))))))))))", "x"},
+  {1, "((((((((((x))))))))))*", "xx"},
+  {1, "a?(ab|ba)*", "ababababababababababababababababa"},
+  {1, "a*a*a*a*a*b", "aaaaaaaab"},
+  {1, "abc", "abc"},
+  {1, "ab*c", "abc"},
+  {1, "ab*bc", "abbc"},
+  {1, "ab*bc", "abbbbc"},
+  {1, "ab+bc", "abbc"},
+  {1, "ab+bc", "abbbbc"},
+  {1, "ab?bc", "abbc"},
+  {1, "ab?bc", "abc"},
+  {1, "ab|cd", "ab"},
+  {1, "(a)b(c)", "abc"},
+  {1, "a*", "aaa"},
+  {1, "(a+|b)*", "ab"},
+  {1, "(a+|b)+", "ab"},
+  {1, "a|b|c|d|e", "e"},
+  {1, "(a|b|c|d|e)f", "ef"},
+  {1, "abcd*efg", "abcdefg"},
+  {1, "(ab|ab*)bc", "abc"},
+  {1, "(ab|a)b*c", "abc"},
+  {1, "((a)(b)c)(d)", "abcd"},
+  {1, "(a|ab)(c|bcd)", "abcd"},
+  {1, "(a|ab)(bcd|c)", "abcd"},
+  {1, "(ab|a)(c|bcd)", "abcd"},
+  {1, "(ab|a)(bcd|c)", "abcd"},
+  {1, "((a|ab)(c|bcd))(d*)", "abcd"},
+  {1, "((a|ab)(bcd|c))(d*)", "abcd"},
+  {1, "((ab|a)(c|bcd))(d*)", "abcd"},
+  {1, "((ab|a)(bcd|c))(d*)", "abcd"},
+  {1, "(a|ab)((c|bcd)(d*))", "abcd"},
+  {1, "(a|ab)((bcd|c)(d*))", "abcd"},
+  {1, "(ab|a)((c|bcd)(d*))", "abcd"},
+  {1, "(ab|a)((bcd|c)(d*))", "abcd"},
+  {1, "(a*)(b|abc)", "abc"},
+  {1, "(a*)(abc|b)", "abc"},
+  {1, "((a*)(b|abc))(c*)", "abc"},
+  {1, "((a*)(abc|b))(c*)", "abc"},
+  {1, "(a*)((b|abc))(c*)", "abc"},
+  {1, "(a*)((abc|b)(c*))", "abc"},
+  {1, "(a*)(b|abc)", "abc"},
+  {1, "(a*)(abc|b)", "abc"},
+  {1, "((a*)(b|abc))(c*)", "abc"},
+  {1, "((a*)(abc|b))(c*)", "abc"},
+  {1, "(a*)((b|abc)(c*))", "abc"},
+  {1, "(a*)((abc|b)(c*))", "abc"},
+  {1, "(a|ab)", "ab"},
+  {1, "(ab|a)", "ab"},
+  {1, "(a|ab)(b*)", "ab"},
+  {1, "(ab|a)(b*)", "ab"},
+  {1, "(a|b)*c|(a|ab)*c", "abc"},
+};
+
+int
+main(void)
+{
+  const size_t n_tests = sizeof(match_tests) / sizeof(*match_tests);
+
+  for (size_t i = 0; i < n_tests; ++i) {
+    const char* const regexp       = match_tests[i].pattern;
+    const char* const text         = match_tests[i].text;
+    const bool        should_match = match_tests[i].match;
+
+    RerexPattern*     pattern = NULL;
+    size_t            end     = 0;
+    const RerexStatus st      = rerex_compile(regexp, &end, &pattern);
+
+    assert(!st);
+
+    RerexMatcher* const matcher = rerex_new_matcher(pattern);
+    const bool          matches = rerex_match(matcher, text);
+
+    assert(matches == should_match);
+
+    rerex_free_matcher(matcher);
+    rerex_free_pattern(pattern);
+  }
+
+  return 0;
+}
diff --git a/subprojects/rerex/test/test_syntax.c b/subprojects/rerex/test/test_syntax.c
new file mode 100644
index 00000000..9cff8855
--- /dev/null
+++ b/subprojects/rerex/test/test_syntax.c
@@ -0,0 +1,99 @@
+/*
+  Copyright 2020 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+// Tests that regex syntax errors are reported correctly
+
+#undef NDEBUG
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef struct {
+  RerexStatus status;
+  unsigned    offset;
+  const char* pattern;
+} BadSyntaxTestCase;
+
+static const BadSyntaxTestCase syntax_tests[] = {
+  {REREX_EXPECTED_CHAR, 1, "a\b"},
+  {REREX_EXPECTED_CHAR, 1, "a\x7F"},
+  {REREX_EXPECTED_ELEMENT, 1, "[\b]"},
+  {REREX_EXPECTED_ELEMENT, 1, "[\x7F]"},
+  {REREX_EXPECTED_ELEMENT, 2, "[a\b]"},
+  {REREX_EXPECTED_ELEMENT, 2, "[a\x7F]"},
+  {REREX_EXPECTED_ELEMENT, 3, "[a-\b]"},
+  {REREX_EXPECTED_ELEMENT, 3, "[a-\x7F]"},
+  {REREX_EXPECTED_RBRACKET, 2, "[\\n]"},
+  {REREX_EXPECTED_RPAREN, 2, "(a"},
+  {REREX_EXPECTED_SPECIAL, 1, "\\n"},
+  {REREX_UNEXPECTED_END, 1, "("},
+  {REREX_UNEXPECTED_END, 1, "["},
+  {REREX_UNEXPECTED_END, 2, "[a"},
+  {REREX_UNEXPECTED_END, 3, "(a|"},
+  {REREX_UNEXPECTED_END, 3, "[a-"},
+  {REREX_UNEXPECTED_END, 4, "[a-z"},
+  {REREX_UNEXPECTED_END, 4, "[a-z"},
+  {REREX_UNEXPECTED_SPECIAL, 0, "{"},
+  {REREX_UNEXPECTED_SPECIAL, 0, "}"},
+  {REREX_UNEXPECTED_SPECIAL, 0, "?"},
+  {REREX_UNEXPECTED_SPECIAL, 1, "[]]"},
+  {REREX_UNEXPECTED_SPECIAL, 2, "a|?"},
+  {REREX_UNEXPECTED_SPECIAL, 3, "(a|?)"},
+  {REREX_UNEXPECTED_SPECIAL, 3, "[[]]"},
+  {REREX_UNEXPECTED_SPECIAL, 3, "[a]]"},
+  {REREX_UNEXPECTED_SPECIAL, 4, "[A-]]"},
+  {REREX_UNEXPECTED_SPECIAL, 4, "[a[]]"},
+  {REREX_UNEXPECTED_SPECIAL, 5, "[A-[]]"},
+  {REREX_UNORDERED_RANGE, 4, "[z-a]"},
+};
+
+static void
+test_status(void)
+{
+  assert(!strcmp(rerex_strerror((RerexStatus)INT32_MAX), "Unknown error"));
+}
+
+static void
+test_syntax(void)
+{
+  const size_t n_tests = sizeof(syntax_tests) / sizeof(*syntax_tests);
+
+  for (size_t i = 0; i < n_tests; ++i) {
+    const char* const regexp = syntax_tests[i].pattern;
+    const size_t      offset = syntax_tests[i].offset;
+    const RerexStatus status = syntax_tests[i].status;
+
+    RerexPattern*     pattern = NULL;
+    size_t            end     = 0;
+    const RerexStatus st      = rerex_compile(regexp, &end, &pattern);
+
+    assert(st == status);
+    assert(!pattern);
+    assert(strcmp(rerex_strerror(st), rerex_strerror(REREX_SUCCESS)));
+    assert(end == offset);
+  }
+}
+
+int
+main(void)
+{
+  test_status();
+  test_syntax();
+  return 0;
+}
diff --git a/subprojects/rerex/test/test_xsd.c b/subprojects/rerex/test/test_xsd.c
new file mode 100644
index 00000000..81104a99
--- /dev/null
+++ b/subprojects/rerex/test/test_xsd.c
@@ -0,0 +1,522 @@
+/*
+  Copyright 2020 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+/*
+  Tests more realistic patterns and using a matcher multiple times.
+
+  The patterns here were written by myself and may have bugs.  XSD datatypes
+  unfortunately don't have canonical patterns, though sometimes they
+  infuriatingly say that they are constrained by a pattern... which is not
+  given.  XML is terrible, don't use it, but the XSD datatypes are more
+  generally useful.  Some patterns here slightly fuzzy (for example, with
+  dates), so matching does not necessary mean the value itself is valid.
+
+  These don't have much to do with Rerex itself, but it's something I needed it
+  for, and having some more realistic tests is nice.  These also test reusing a
+  matcher, unlike the basic match tests.
+*/
+
+#undef NDEBUG
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stddef.h>
+
+static void
+test_pattern(const char* const regexp,
+             const char* const matching[],
+             const char* const nonmatching[])
+{
+  RerexPattern*     pattern = NULL;
+  size_t            end     = 0;
+  const RerexStatus st      = rerex_compile(regexp, &end, &pattern);
+
+  assert(!st);
+
+  RerexMatcher* const matcher = rerex_new_matcher(pattern);
+
+  for (const char* const* m = matching; *m; ++m) {
+    assert(rerex_match(matcher, *m));
+  }
+
+  for (const char* const* n = nonmatching; *n; ++n) {
+    assert(!rerex_match(matcher, *n));
+  }
+
+  rerex_free_matcher(matcher);
+  rerex_free_pattern(pattern);
+}
+
+static void
+test_base64Binary(void)
+{
+  static const char* const regexp =
+    "(([A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *)*" //
+    "(([A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/] *[A-Za-z0-9+/])|"   //
+    "([A-Za-z0-9+/] *[A-Za-z0-9+/] *[AEIMQUYcgkosw048] *=)|"           //
+    "([A-Za-z0-9+/] *[AQgw] *= *=)))?";
+
+  static const char* const good[] = {
+    "0FB8",
+    "0fb8",
+    "0 FB8 0F+9",
+    "0F+40A8=",
+    "0F+40A==",
+    "",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    " 0FB8",
+    "0FB8 ",
+    " 0FB8 ",
+    "FB8",
+    "==0F",
+    "0F+40A9=",
+    "0F+40B==",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_boolean(void)
+{
+  static const char* const regexp = "(true|false|0|1)";
+  static const char* const good[] = {"true", "false", "0", "1", NULL};
+  static const char* const bad[]  = {"TRUE", "T", "", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_date(void)
+{
+  static const char* const regexp = //
+    "-?[0-9][0-9][0-9][0-9][0-9]*"  //
+    "-(0[1-9]|1[0-2])"              //
+    "-(0[1-9]|[12][0-9]|3[01])"     //
+    "(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "2004-04-12",
+    "-0045-01-01",
+    "12004-04-12",
+    "2004-04-12-05:00",
+    "2004-04-12Z",
+    "2001-10-26",
+    "2001-10-26+02:00",
+    "2001-10-26Z",
+    "2001-10-26+00:00",
+    "-2001-10-26",
+    "-20000-04-01",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "99-04-12",
+    "2004-4-2",
+    "2004/04/02",
+    "04-12-2004",
+    // "2004-04-31", // Not quite that clever...
+    "2001-10",
+    "2001-10-32",
+    "2001-13-26+02:00",
+    "01-10-26",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_decimal(void)
+{
+  static const char* const regexp =
+    "[+-]?(([0-9]+[.]?[0-9]*)|([0-9]*[.]?[0-9]+))";
+
+  static const char* const good[] = {
+    "3.0",
+    "-3.0",
+    "+3.5",
+    "3",
+    ".3",
+    "3.",
+    "0",
+    "-.3",
+    "0003.0",
+    "3.0000",
+    "-456",
+    NULL,
+  };
+
+  static const char* const bad[] = {"3,5", ".", "", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+// Tests both xsd:float and xsd:double, which are lexically identical
+static void
+test_float(void)
+{
+  static const char* const regexp = "-?INF|NaN|[+-]?(([0-9]+[.]?[0-9]*)|([0-"
+                                    "9]*[.]?[0-9]+))([eE][-+]?[0-9]+)?";
+
+  static const char* const good[] = {
+    "-3E2",
+    "4268.22752E11",
+    "+24.3e-3",
+    "12",
+    "+3.5",
+    "INF",
+    "-INF",
+    "-0",
+    "NaN",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "-3E2.4",
+    "12E",
+    "+INF",
+    "NAN",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_gDay(void)
+{
+  static const char* const regexp =
+    "---(0[1-9]|[12][0-9]|3[01])(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "---02",
+    "---01",
+    "---01Z",
+    "---01+02:00",
+    "---01-04:00",
+    "---15",
+    "---31",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "02",
+    "---2",
+    "---32",
+    "--30-",
+    "---35",
+    "---5",
+    "15",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_gMonth(void)
+{
+  static const char* const regexp =
+    "--(0[1-9]|1[0-2])(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "--04",
+    "--04-05:00",
+    "--05",
+    "--11Z",
+    "--11+02:00",
+    "--11-04:00",
+    "--02",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "2004-04",
+    "04",
+    "--4",
+    "--13",
+    "-01-",
+    "--13",
+    "--1",
+    "01",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_gMonthDay(void)
+{
+  static const char* const regexp = //
+    "--(0[1-9]|1[0-2])"             //
+    "-(0[1-9]|[12][0-9]|3[01])"     //
+    "(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "--04-12",
+    "--04-12Z",
+    "--05-01",
+    "--11-01Z",
+    "--11-01+02:00",
+    "--11-01-04:00",
+    "--11-15",
+    "--02-29",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "04-12",
+    // "--04-31", Not quite that clever...
+    "--4-6",
+    "-01-30-",
+    "--01-35",
+    "--1-5",
+    "01-15",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_gYear(void)
+{
+  static const char* const regexp =
+    "-?[0-9][0-9][0-9][0-9][0-9]*(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "2004",
+    "2004-05:00",
+    "12004",
+    "0922",
+    "-0045",
+    "2001+02:00",
+    "2001Z",
+    "2001+00:00",
+    "-2001",
+    "-20000",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "99",
+    "922",
+    "01",
+    "2001-12",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_gYearMonth(void)
+{
+  static const char* const regexp = //
+    "-?[0-9][0-9][0-9][0-9][0-9]*"  //
+    "-(0[1-9]|1[0-2])"              //
+    "(Z|[-+][0-2][0-9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "2001-10",
+    "2001-10+02:00",
+    "2001-10Z",
+    "2001-10+00:00",
+    "-2001-10",
+    "-20000-04",
+    "2004-04-05:00",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "2001",
+    "2001-13",
+    "2001-13-26+02:00",
+    "01-10",
+    "99-04",
+    "2004",
+    "2004-4",
+    "2004-13",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_hexBinary(void)
+{
+  static const char* const regexp = "([0-9A-Fa-f][0-9A-Fa-f])*";
+  static const char* const good[] = {"0FB8", "0fb8", "", NULL};
+  static const char* const bad[]  = {"F", "FB8", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_integer(void)
+{
+  static const char* const regexp = "[-+]?[0-9]+";
+  static const char* const good[] = {"122", "00122", "0", "-3", "+3", NULL};
+  static const char* const bad[]  = {"3.", "3.0", "A", "", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_language(void)
+{
+  static const char* const regexp =                     //
+    "[a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?"               //
+    "[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?"              //
+    "(-[a-zA-Z0-9][a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?" //
+    "[a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?[a-zA-Z0-9]?)*";
+
+  static const char* const good[] = {
+    "en",
+    "en-GB",
+    "en-US",
+    "fr",
+    "fr-FR",
+    "fr-CA",
+    "de",
+    "zh",
+    "ja",
+    "ko",
+    "i-navajo",
+    "x-Newspeak",
+    "any-value-with-short-parts",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "longerThan8",
+    "even-longerThan8",
+    "longererThan8-first",
+    "last-longererThan8",
+    "middle-longererThan8-CA",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_nonNegativeInteger(void)
+{
+  static const char* const regexp = "[+]?[0-9]+";
+  static const char* const good[] = {"+3", "122", "0", "0012", "+123", NULL};
+  static const char* const bad[]  = {"-3", "3.0", "", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_nonPositiveInteger(void)
+{
+  static const char* const regexp = "(0|-[0-9]+)";
+  static const char* const good[] = {"-3", "-0", "-00122", NULL};
+  static const char* const bad[]  = {"122", "+3", "3.", "3.0", "", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_positiveInteger(void)
+{
+  static const char* const regexp = "[+]?[0-9]*[1-9]+[0-9]*";
+  static const char* const good[] = {"122", "+3", "00122", NULL};
+  static const char* const bad[]  = {"0", "-3", "3.0", "", NULL};
+
+  test_pattern(regexp, good, bad);
+}
+
+static void
+test_time(void)
+{
+  static const char* const regexp =
+    "(([0-1][0-9])|(2[0-4])):[0-5][0-9]:[0-5][0-9](.[0-9]+)?(Z|[-+][0-2][0-"
+    "9]:[0-5][0-9])?";
+
+  static const char* const good[] = {
+    "13:20:00",
+    "13:20:30.5555",
+    "13:20:00-05:00",
+    "13:20:00Z",
+    "00:00:00",
+    "24:00:00",
+    "21:32:52",
+    "21:32:52+02:00",
+    "19:32:52Z",
+    "19:32:52+00:00",
+    "21:32:52.12679",
+    NULL,
+  };
+
+  static const char* const bad[] = {
+    "5:20:00",
+    "13:20",
+    "13:20.5:00",
+    "13:65:00",
+    "21:32",
+    "25:25:10",
+    "-10:00:00",
+    "1:20:10",
+    "",
+    NULL,
+  };
+
+  test_pattern(regexp, good, bad);
+}
+
+int
+main(void)
+{
+  test_base64Binary();
+  test_boolean();
+  test_date();
+  test_decimal();
+  test_float();
+  test_gDay();
+  test_gMonth();
+  test_gMonthDay();
+  test_gYear();
+  test_gYearMonth();
+  test_hexBinary();
+  test_integer();
+  test_language();
+  test_nonNegativeInteger();
+  test_nonPositiveInteger();
+  test_positiveInteger();
+  test_time();
+
+  return 0;
+}
author	David Robillard <d@drobilla.net>	2020-12-19 22:07:41 +0100
committer	David Robillard <d@drobilla.net>	2021-03-08 23:34:56 -0500
commit	00456f395e50b43a9f26ff1479b9ef8b06a4d219 (patch)
tree	31c22b3bd12d8e1bb6597c53fade859f6ca37c79 /subprojects/rerex
parent	ec0edd399d3ef493c4ddb0e821a56b53a19dbc96 (diff)
download	serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.gz serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.bz2 serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.zip