Add rerex from git@gitlab.com:drobilla/rerex 2420851

author: David Robillard <d@drobilla.net> 2020-12-19 22:07:41 +0100
committer: David Robillard <d@drobilla.net> 2021-03-08 23:34:56 -0500
commit: 00456f395e50b43a9f26ff1479b9ef8b06a4d219 (patch)
tree: 31c22b3bd12d8e1bb6597c53fade859f6ca37c79 /subprojects/rerex/src
parent: ec0edd399d3ef493c4ddb0e821a56b53a19dbc96 (diff)
download: serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.gz
serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.bz2
serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.zip
2 files changed, 773 insertions, 0 deletions
diff --git a/subprojects/rerex/src/.clang-tidy b/subprojects/rerex/src/.clang-tidy
new file mode 100644
index 00000000..420ce833
--- /dev/null
+++ b/subprojects/rerex/src/.clang-tidy
@@ -0,0 +1,8 @@
+Checks: >
+  *,
+  -hicpp-multiway-paths-covered,
+  -llvmlibc-*,
+  -misc-no-recursion,
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+FormatStyle: file
diff --git a/subprojects/rerex/src/rerex.c b/subprojects/rerex/src/rerex.c
new file mode 100644
index 00000000..49970a45
--- /dev/null
+++ b/subprojects/rerex/src/rerex.c
@@ -0,0 +1,765 @@
+/*
+  Copyright 2020 David Robillard <d@drobilla.net>
+
+  Permission to use, copy, modify, and/or distribute this software for any
+  purpose with or without fee is hereby granted, provided that the above
+  copyright notice and this permission notice appear in all copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "rerex/rerex.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+static const char cmin = 0x20; // Inclusive minimum normal character
+static const char cmax = 0x7E; // Inclusive maximum normal character
+
+const char*
+rerex_strerror(const RerexStatus status)
+{
+  switch (status) {
+  case REREX_SUCCESS:
+    return "Success";
+  case REREX_EXPECTED_CHAR:
+    return "Expected a regular character";
+  case REREX_EXPECTED_ELEMENT:
+    return "Expected a character in a set";
+  case REREX_EXPECTED_RBRACKET:
+    return "Expected ']'";
+  case REREX_EXPECTED_RPAREN:
+    return "Expected ')'";
+  case REREX_EXPECTED_SPECIAL:
+    return "Expected a special character (one of \"()*+-?[]^|\")";
+  case REREX_UNEXPECTED_SPECIAL:
+    return "Unexpected special character";
+  case REREX_UNEXPECTED_END:
+    return "Unexpected end of input";
+  case REREX_UNORDERED_RANGE:
+    return "Range is out of order";
+  }
+
+  return "Unknown error";
+}
+
+/* State */
+
+#define NO_STATE 0
+
+// The ID for a state, which is an index into the state array
+typedef size_t StateIndex;
+
+// A code point (currently only 8-bit ASCII but we use the space anyway)
+typedef int Codepoint;
+
+// Special type for states with only epsilon out arcs
+typedef enum {
+  REREX_MATCH = 0xE000, ///< Matching state, no out arcs
+  REREX_SPLIT = 0xE001, ///< Splitting state, one or two out arcs
+} StateType;
+
+/* A state in an NFA.
+
+   A state in Thompson's NFA can have either a single character-labeled
+   transition to another state, or up to two unlabeled epsilon transitions to
+   other states.  There is both a minimum and maximum label for supporting
+   character ranges.  So, either `min` and `max` are ASCII characters that are
+   the label of an arc to next1 (and next2 is null), or `min` is a special
+   StateType and next1 and/or next2 may be set to successor states.
+*/
+typedef struct {
+  StateIndex next1; ///< Head of first out arc (or NULL)
+  StateIndex next2; ///< Head of second out arc (or NULL)
+  Codepoint  min;   ///< Special type, or inclusive min label for next1
+  Codepoint  max;   ///< Inclusive max label for next2
+} State;
+
+// Create a match (end) state with no successors
+static State
+match_state(void)
+{
+  const State s = {NO_STATE, NO_STATE, REREX_MATCH, 0};
+  return s;
+}
+
+// Create a split state with at most two successors
+static State
+split_state(const StateIndex next1, const StateIndex next2)
+{
+  const State s = {next1, next2, REREX_SPLIT, 0};
+  return s;
+}
+
+// Create a labeled state with one successor reached by a character arc
+static State
+range_state(const char min, const char max, const StateIndex next)
+{
+  const State s = {next, NO_STATE, min, max};
+  return s;
+}
+
+/* Array of states.
+
+   States are stored in a flat array to reduce memory fragmentation, and for
+   easy memory management since the automata graph may be cyclic.  This simple
+   implementation calls realloc() for every state, which isn't terribly
+   efficient, but works well enough.  Note that state addresses therefore change
+   during compilation, so states are generally referred to by their index, and
+   not by pointer.  Conveniently, using indices is also useful during matching
+   for storing auxiliary information about states.
+*/
+typedef struct {
+  State* states;
+  size_t n_states;
+} StateArray;
+
+// Append a new state to the end of the state array
+static StateIndex
+add_state(StateArray* const array, const State state)
+{
+  const size_t new_n_states = array->n_states + 1;
+  const size_t new_size     = new_n_states * sizeof(State);
+
+  array->states                  = (State*)realloc(array->states, new_size);
+  array->states[array->n_states] = state;
+
+  return array->n_states++;
+}
+
+/* Automata.
+ *
+ * This is a lightweiht description of an NFA fragment.  The states are stored
+ * elsewhere, this is just used to provide a facade to conceptually work with
+ * automata for high-level operations.
+ */
+typedef struct {
+  StateIndex start;
+  StateIndex end;
+} Automata;
+
+// Simple utility function for making an automata in an expression
+static Automata
+make_automata(const StateIndex start, const StateIndex end)
+{
+  Automata result = {start, end};
+  return result;
+}
+
+// Return whether `nfa` has only two simple states (used for optimizations)
+static inline bool
+is_trivial(const StateArray* const states, const Automata nfa)
+{
+  return (states->states[nfa.start].min < REREX_MATCH &&
+          states->states[nfa.start].next1 == nfa.end);
+}
+
+// Kleene's Star of an NFA
+static Automata
+star(StateArray* const states, const Automata nfa)
+{
+  const StateIndex end   = add_state(states, match_state());
+  const StateIndex start = add_state(states, split_state(nfa.start, end));
+
+  states->states[nfa.end] = split_state(nfa.start, end);
+
+  return make_automata(start, end);
+}
+
+// Zero-or-one of an NFA
+static Automata
+question(StateArray* const states, const Automata nfa)
+{
+  const StateIndex start = add_state(states, split_state(nfa.start, nfa.end));
+
+  return make_automata(start, nfa.end);
+}
+
+// One-or-more of an NFA
+static Automata
+plus(StateArray* const states, const Automata nfa)
+{
+  const StateIndex end = add_state(states, match_state());
+
+  states->states[nfa.end] = split_state(nfa.start, end);
+
+  return make_automata(nfa.start, end);
+}
+
+// Concatenation of two NFAs
+static Automata
+concatenate(StateArray* const states, const Automata a, const Automata b)
+{
+  if (is_trivial(states, a)) {
+    // Optimization: link a's start directly to b's start (drop a's end)
+    states->states[a.start].next1 = b.start;
+  } else {
+    states->states[a.end] = split_state(b.start, NO_STATE);
+  }
+
+  return make_automata(a.start, b.end);
+}
+
+// Alternation (OR) of two NFAs
+static Automata
+alternate(StateArray* const states, const Automata a, const Automata b)
+{
+  const StateIndex split = add_state(states, split_state(a.start, b.start));
+
+  if (is_trivial(states, a)) {
+    // Optimization: link a's start directly to b's end (drop a's end)
+    states->states[a.start].next1 = b.end;
+    return make_automata(split, b.end);
+  }
+
+  if (is_trivial(states, b)) {
+    // Optimization: link b's start directly to a's end (drop b's end)
+    states->states[b.start].next1 = a.end;
+    return make_automata(split, a.end);
+  }
+
+  const StateIndex end = add_state(states, match_state());
+
+  states->states[a.end] = split_state(end, NO_STATE);
+  states->states[b.end] = split_state(end, NO_STATE);
+
+  return make_automata(split, end);
+}
+
+/* Parser input.
+ *
+ * The parser reads from a string one character at a time, though it would be
+ * simple to change this to read from any stream.  All reading is done by three
+ * operations: peek, peekahead, and eat.
+ */
+typedef struct {
+  const char* const str;
+  size_t            offset;
+} Input;
+
+// Return the next character in the input without consuming it
+static inline char
+peek(Input* const input)
+{
+  return input->str[input->offset];
+}
+
+// Return the next-next character in the input without consuming any
+static inline char
+peekahead(Input* const input)
+{
+  // Unfortunately we need 2-char lookahead for the ambiguity of '-' in sets
+  return input->str[input->offset + 1];
+}
+
+// Consume and return the next character in the input
+static inline char
+eat(Input* const input)
+{
+  return input->str[input->offset++];
+}
+
+// Forward declaration for read_expr because it is called recursively
+static RerexStatus
+read_expr(Input* input, StateArray* states, Automata* out);
+
+// DOT      ::= '.'
+// OPERATOR ::= '*' | '+' | '?'
+// SPECIAL  ::= DOT | OPERATOR | '(' | ')' | '[' | ']' | '^' | '{' | '|' | '}'
+static bool
+is_special(const char c)
+{
+  switch (c) {
+  case '(':
+  case ')':
+  case '*':
+  case '+':
+  case '.':
+  case '?':
+  case '[':
+  case ']':
+  case '^':
+  case '{':
+  case '|':
+  case '}':
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// DOT ::= '.'
+static RerexStatus
+read_dot(Input* const input, StateArray* const states, Automata* const out)
+{
+  assert(peek(input) == '.');
+  eat(input);
+
+  const StateIndex end   = add_state(states, match_state());
+  const StateIndex start = add_state(states, range_state(cmin, cmax, end));
+
+  *out = make_automata(start, end);
+
+  return REREX_SUCCESS;
+}
+
+// ESCAPE ::= '\' SPECIAL
+static RerexStatus
+read_escape(Input* const input, char* const out)
+{
+  assert(peek(input) == '\\');
+  eat(input);
+
+  const char c = peek(input);
+  if (is_special(c) || c == '-') {
+    *out = eat(input);
+    return REREX_SUCCESS;
+  }
+
+  return REREX_EXPECTED_SPECIAL;
+}
+
+// CHAR ::= ESCAPE | [#x20-#x7E] - SPECIAL
+static RerexStatus
+read_char(Input* const input, char* const out)
+{
+  const char c = peek(input);
+
+  switch (c) {
+  case '\0':
+    return REREX_UNEXPECTED_END;
+  case '\\':
+    return read_escape(input, out);
+  default:
+    break;
+  }
+
+  if (is_special(c)) {
+    return REREX_UNEXPECTED_SPECIAL;
+  }
+
+  if (c >= cmin && c <= cmax) {
+    *out = eat(input);
+    return REREX_SUCCESS;
+  }
+
+  return REREX_EXPECTED_CHAR;
+}
+
+// ELEMENT ::= ([#x20-#x7E] - ']') | ('\' ']')
+static RerexStatus
+read_element(Input* const input, char* const out)
+{
+  const char c = peek(input);
+
+  switch (c) {
+  case '\0':
+    return REREX_UNEXPECTED_END;
+  case ']':
+    return REREX_UNEXPECTED_SPECIAL;
+  case '\\':
+    eat(input);
+    if (peek(input) != ']') {
+      return REREX_EXPECTED_RBRACKET;
+    }
+
+    *out = eat(input);
+    return REREX_SUCCESS;
+  default:
+    break;
+  }
+
+  if (c >= cmin && c <= cmax) {
+    *out = eat(input);
+    return REREX_SUCCESS;
+  }
+
+  return REREX_EXPECTED_ELEMENT;
+}
+
+// Range ::= ELEMENT | ELEMENT '-' ELEMENT
+static RerexStatus
+read_range(Input* const      input,
+           StateArray* const states,
+           const bool        negated,
+           Automata* const   out)
+{
+  RerexStatus st  = REREX_SUCCESS;
+  char        min = 0;
+  char        max = 0;
+
+  // Read the first (or only) character
+  if ((st = read_element(input, &min))) {
+    return st;
+  }
+
+  // Handle '-' which is a bit hairy because it may or may not be special
+  if (peek(input) == '-') {
+    switch (peekahead(input)) {
+    case ']':
+      // Weird case like [a-] where '-' is a character
+      max = min;
+      break;
+    default:
+      // Normal range like [a-z] (note that '[' isn't special here)
+      eat(input);
+      if ((st = read_element(input, &max))) {
+        return st;
+      }
+      break;
+    }
+  } else {
+    // Single character element
+    max = min;
+  }
+
+  if (max < min) {
+    return REREX_UNORDERED_RANGE;
+  }
+
+  const StateIndex end = add_state(states, match_state());
+  if (negated) {
+    const char       emin = (char)(min - 1);
+    const char       emax = (char)(max + 1);
+    const StateIndex low  = add_state(states, range_state(cmin, emin, end));
+    const StateIndex high = add_state(states, range_state(emax, cmax, end));
+    const StateIndex fork = add_state(states, split_state(low, high));
+
+    *out = make_automata(fork, end);
+  } else {
+    const StateIndex start = add_state(states, range_state(min, max, end));
+
+    *out = make_automata(start, end);
+  }
+
+  return st;
+}
+
+// Set ::= Range | Range Set
+static RerexStatus
+read_set(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st      = REREX_SUCCESS;
+  bool        negated = false;
+
+  if (peek(input) == '^') {
+    eat(input);
+    negated = true;
+  }
+
+  Automata nfa = {NO_STATE, NO_STATE};
+  if ((st = read_range(input, states, negated, &nfa))) {
+    return st;
+  }
+
+  while (peek(input) != ']') {
+    Automata range_nfa = {NO_STATE, NO_STATE};
+    if ((st = read_range(input, states, negated, &range_nfa))) {
+      return st;
+    }
+
+    nfa = alternate(states, nfa, range_nfa);
+  }
+
+  *out = nfa;
+  return st;
+}
+
+// Atom ::= CHAR | DOT | '(' Expr ')' | '[' Set ']'
+static RerexStatus
+read_atom(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st = REREX_SUCCESS;
+
+  switch (peek(input)) {
+  case '(':
+    eat(input);
+    if ((st = read_expr(input, states, out))) {
+      return st;
+    }
+
+    if (peek(input) != ')') {
+      return REREX_EXPECTED_RPAREN;
+    }
+
+    eat(input);
+    return st;
+
+  case '.':
+    return read_dot(input, states, out);
+
+  case '[':
+    eat(input);
+    if ((st = read_set(input, states, out))) {
+      return st;
+    }
+
+    eat(input);
+    return st;
+
+  default:
+    break;
+  }
+
+  char c = 0;
+  if ((st = read_char(input, &c))) {
+    return st;
+  }
+
+  const StateIndex end   = add_state(states, match_state());
+  const StateIndex start = add_state(states, range_state(c, c, end));
+
+  *out = make_automata(start, end);
+
+  return st;
+}
+
+// OPERATOR ::= '*' | '+' | '?'
+// Factor   ::= Atom | Atom OPERATOR
+static RerexStatus
+read_factor(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st       = REREX_SUCCESS;
+  Automata    atom_nfa = {NO_STATE, NO_STATE};
+
+  if (!(st = read_atom(input, states, &atom_nfa))) {
+    const char c = peek(input);
+    switch (c) {
+    case '*':
+      eat(input);
+      *out = star(states, atom_nfa);
+      break;
+    case '+':
+      eat(input);
+      *out = plus(states, atom_nfa);
+      break;
+    case '?':
+      eat(input);
+      *out = question(states, atom_nfa);
+      break;
+    default:
+      *out = atom_nfa;
+      break;
+    }
+  }
+
+  return st;
+}
+
+// Term ::= Factor | Factor Term
+static RerexStatus
+read_term(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st         = REREX_SUCCESS;
+  Automata    factor_nfa = {NO_STATE, NO_STATE};
+  Automata    term_nfa   = {NO_STATE, NO_STATE};
+
+  if (!(st = read_factor(input, states, &factor_nfa))) {
+    switch (peek(input)) {
+    case '\0':
+    case ')':
+    case '|':
+      *out = factor_nfa;
+      break;
+    default:
+      if (!(st = read_term(input, states, &term_nfa))) {
+        *out = concatenate(states, factor_nfa, term_nfa);
+      }
+      break;
+    }
+  }
+
+  return st;
+}
+
+// Expr ::= Term | Term '|' Expr
+static RerexStatus
+read_expr(Input* const input, StateArray* const states, Automata* const out)
+{
+  RerexStatus st       = REREX_SUCCESS;
+  Automata    term_nfa = {NO_STATE, NO_STATE};
+  Automata    expr_nfa = {NO_STATE, NO_STATE};
+
+  if ((st = read_term(input, states, &term_nfa))) {
+    return st;
+  }
+
+  if (peek(input) == '|') {
+    eat(input);
+    if (!(st = read_expr(input, states, &expr_nfa))) {
+      *out = alternate(states, term_nfa, expr_nfa);
+    }
+  } else {
+    *out = term_nfa;
+  }
+
+  return st;
+}
+
+/* Pattern.
+ *
+ * A pattern is simply an array of states and an index to the start state.  The
+ * end state(s) are known because they have type REREX_MATCH.  A pattern is
+ * immutable after construction, the matcher does not modify it.
+ */
+struct RerexPatternImpl {
+  StateArray states;
+  StateIndex start;
+};
+
+void
+rerex_free_pattern(RerexPattern* const regexp)
+{
+  free(regexp->states.states);
+  free(regexp);
+}
+
+RerexStatus
+rerex_compile(const char* const    pattern,
+              size_t* const        end,
+              RerexPattern** const out)
+{
+  Input      input  = {pattern, 0};
+  Automata   nfa    = {NO_STATE, NO_STATE};
+  StateArray states = {NULL, 0};
+
+  // Add null state so that no actual state has NO_STATE as an ID
+  add_state(&states, split_state(NO_STATE, NO_STATE));
+
+  const RerexStatus st = read_expr(&input, &states, &nfa);
+  if (st) {
+    free(states.states);
+  } else {
+    *out           = (RerexPattern*)malloc(sizeof(RerexPattern));
+    (*out)->states = states;
+    (*out)->start  = nfa.start;
+  }
+
+  *end = input.offset;
+  return st;
+}
+
+/* Matcher */
+
+typedef struct {
+  StateIndex* indices;   // Array of state indices
+  size_t      n_indices; // Number of elements in indices
+} IndexList;
+
+/* Matcher.
+ *
+ * The matcher tracks active states by keeping two lists of indices: one for
+ * the current iteration, and one for the next.  A separate array, keyed by
+ * state index, stores the number of the last iteration the state was entered
+ * in.  This makes it simple and fast to check if a state has already been
+ * entered in the current iteration, avoiding the need to search the active
+ * list for every entered state.
+ */
+struct RerexMatcherImpl {
+  const RerexPattern* regexp;      // Pattern to match against
+  IndexList           active[2];   // Two lists of active states
+  size_t*             last_active; // Last iteration a state was active
+};
+
+RerexMatcher*
+rerex_new_matcher(const RerexPattern* const regexp)
+{
+  const size_t        n_states = regexp->states.n_states;
+  RerexMatcher* const m        = (RerexMatcher*)calloc(1, sizeof(RerexMatcher));
+
+  m->regexp            = regexp;
+  m->active[0].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+  m->active[1].indices = (StateIndex*)calloc(n_states, sizeof(StateIndex));
+  m->last_active       = (size_t*)calloc(n_states, sizeof(size_t));
+
+  return m;
+}
+
+void
+rerex_free_matcher(RerexMatcher* const matcher)
+{
+  free(matcher->last_active);
+  free(matcher->active[1].indices);
+  free(matcher->active[0].indices);
+  free(matcher);
+}
+
+// Add `s` and any epsilon successors to the active list
+static void
+enter_state(RerexMatcher* const matcher,
+            const size_t        step,
+            IndexList* const    list,
+            const StateIndex    s)
+{
+  const StateArray* const states = &matcher->regexp->states;
+
+  if (s && matcher->last_active[s] != step) {
+    matcher->last_active[s] = step;
+
+    const State* const state = &states->states[s];
+    if (state->min == REREX_SPLIT) {
+      enter_state(matcher, step, list, state->next1);
+      enter_state(matcher, step, list, state->next2);
+    } else {
+      list->indices[list->n_indices++] = s;
+    }
+  }
+}
+
+// Run `matcher` and return true if `string` matches the expression
+bool
+rerex_match(RerexMatcher* const matcher, const char* const string)
+{
+  const StateArray* const states = &matcher->regexp->states;
+
+  // Reset matcher to a consistent initial state
+  matcher->active[0].n_indices = 0;
+  matcher->active[1].n_indices = 0;
+  for (size_t i = 0; i < states->n_states; ++i) {
+    matcher->last_active[i] = SIZE_MAX;
+  }
+
+  // Enter start state
+  enter_state(matcher, 0, &matcher->active[0], matcher->regexp->start);
+
+  // Tick the matcher for every input character
+  bool phase = 0;
+  for (size_t i = 0; string[i]; ++i) {
+    const char       c         = string[i];
+    IndexList* const list      = &matcher->active[phase];
+    IndexList* const next_list = &matcher->active[!phase];
+
+    // Add successor states to the next iteration's list
+    next_list->n_indices = 0;
+    for (size_t j = 0; j < list->n_indices; ++j) {
+      const State* const state = &states->states[list->indices[j]];
+      if (state->min <= c && c <= state->max) {
+        enter_state(matcher, i + 1, next_list, state->next1);
+      }
+    }
+
+    // Flip phase to swap active lists
+    phase = !phase;
+  }
+
+  // Check if match state is entered in the end
+  IndexList* const active = &matcher->active[phase];
+  for (size_t i = 0; i < active->n_indices; ++i) {
+    if (states->states[active->indices[i]].min == REREX_MATCH) {
+      return true;
+    }
+  }
+
+  return false;
+}
author	David Robillard <d@drobilla.net>	2020-12-19 22:07:41 +0100
committer	David Robillard <d@drobilla.net>	2021-03-08 23:34:56 -0500
commit	00456f395e50b43a9f26ff1479b9ef8b06a4d219 (patch)
tree	31c22b3bd12d8e1bb6597c53fade859f6ca37c79 /subprojects/rerex/src
parent	ec0edd399d3ef493c4ddb0e821a56b53a19dbc96 (diff)
download	serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.gz serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.tar.bz2 serd-00456f395e50b43a9f26ff1479b9ef8b06a4d219.zip