Reduce complexity of URI parsing code

author: David Robillard <d@drobilla.net> 2023-02-06 07:43:36 -0500
committer: David Robillard <d@drobilla.net> 2023-12-02 18:49:08 -0500
commit: b992fe3ef83e102a999084070214b8295f824f6a (patch)
tree: 20f702125b4a257200bbcffbe178118b8d577352
parent: 4cf33db925fbd8bea0defeb34e1ed6575349e644 (diff)
download: serd-b992fe3ef83e102a999084070214b8295f824f6a.tar.gz
          serd-b992fe3ef83e102a999084070214b8295f824f6a.tar.bz2
          serd-b992fe3ef83e102a999084070214b8295f824f6a.zip
3 files changed, 98 insertions, 101 deletions
diff --git a/.clang-tidy b/.clang-tidy
index 0c240213..a283cc08 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -9,6 +9,7 @@ Checks: >
   -clang-diagnostic-unused-macros,
   -llvmlibc-*,
   -modernize-macro-to-enum,
+  -readability-function-cognitive-complexity,
   -readability-identifier-length,
 CheckOptions:
   - key:   hicpp-uppercase-literal-suffix.NewSuffixes
diff --git a/src/.clang-tidy b/src/.clang-tidy
index c2df3e44..d02d9891 100644
--- a/src/.clang-tidy
+++ b/src/.clang-tidy
@@ -14,5 +14,4 @@ Checks: >
   -llvm-header-guard,
   -misc-no-recursion,
   -modernize-macro-to-enum,
-  -readability-function-cognitive-complexity,
 InheritParentConfig: true
diff --git a/src/uri.c b/src/uri.c
index e7445377..6fc1f17c 100644
--- a/src/uri.c
+++ b/src/uri.c
@@ -5,6 +5,7 @@
 #include "uri_utils.h"
 
 #include "serd/buffer.h"
+#include "serd/status.h"
 #include "serd/stream.h"
 #include "serd/string_view.h"
 #include "serd/uri.h"
@@ -16,11 +17,40 @@
 #include <stdlib.h>
 #include <string.h>
 
+static SerdStatus
+write_file_uri_char(const char c, void* const stream)
+{
+  return (serd_buffer_write(&c, 1, 1, stream) == 1) ? SERD_SUCCESS
+                                                    : SERD_BAD_ALLOC;
+}
+
+static char*
+parse_hostname(const char* const authority, char** const hostname)
+{
+  char* const path = strchr(authority, '/');
+  if (!path) {
+    return NULL;
+  }
+
+  if (hostname) {
+    const size_t len = (size_t)(path - authority);
+    if (!(*hostname = (char*)calloc(len + 1, 1))) {
+      return NULL;
+    }
+
+    memcpy(*hostname, authority, len);
+  }
+
+  return path;
+}
+
 char*
 serd_parse_file_uri(const char* const uri, char** const hostname)
 {
   assert(uri);
 
+  SerdStatus st = SERD_SUCCESS;
+
   const char* path = uri;
   if (hostname) {
     *hostname = NULL;
@@ -30,16 +60,8 @@ serd_parse_file_uri(const char* const uri, char** const hostname)
     const char* auth = uri + 7;
     if (*auth == '/') { // No hostname
       path = auth;
-    } else { // Has hostname
-      if (!(path = strchr(auth, '/'))) {
-        return NULL;
-      }
-
-      if (hostname) {
-        const size_t len = (size_t)(path - auth);
-        *hostname        = (char*)calloc(len + 1, 1);
-        memcpy(*hostname, auth, len);
-      }
+    } else if (!(path = parse_hostname(auth, hostname))) {
+      return NULL;
     }
   }
 
@@ -48,26 +70,30 @@ serd_parse_file_uri(const char* const uri, char** const hostname)
   }
 
   SerdBuffer buffer = {NULL, 0};
-  for (const char* s = path; *s; ++s) {
-    if (*s == '%') {
-      if (*(s + 1) == '%') {
-        serd_buffer_write("%", 1, 1, &buffer);
+  for (const char* s = path; !st && *s; ++s) {
+    if (*s != '%') {
+      st = write_file_uri_char(*s, &buffer);
+    } else if (*(s + 1) == '%') {
+      if (!(st = write_file_uri_char('%', &buffer))) {
         ++s;
-      } else if (is_hexdig(*(s + 1)) && is_hexdig(*(s + 2))) {
-        const uint8_t hi = hex_digit_value((const uint8_t)s[1]);
-        const uint8_t lo = hex_digit_value((const uint8_t)s[2]);
-        const char    c  = (char)((hi << 4U) | lo);
-        serd_buffer_write(&c, 1, 1, &buffer);
+      }
+    } else if (is_hexdig(*(s + 1)) && is_hexdig(*(s + 2))) {
+      const uint8_t hi = hex_digit_value((const uint8_t)s[1]);
+      const uint8_t lo = hex_digit_value((const uint8_t)s[2]);
+      const char    c  = (char)((hi << 4U) | lo);
+      if (!(st = write_file_uri_char(c, &buffer))) {
         s += 2;
-      } else {
-        s += 2; // Junk escape, ignore
       }
     } else {
-      serd_buffer_write(s, 1, 1, &buffer);
+      s += 2; // Junk escape, ignore
     }
   }
 
-  serd_buffer_close(&buffer);
+  if (st || serd_buffer_close(&buffer)) {
+    free(buffer.buf);
+    return NULL;
+  }
+
   return (char*)buffer.buf;
 }
 
@@ -92,6 +118,24 @@ serd_uri_string_has_scheme(const char* const string)
   return false;
 }
 
+static inline bool
+is_uri_authority_char(const char c)
+{
+  return c && c != '/' && c != '?' && c != '#';
+}
+
+static inline bool
+is_uri_path_char(const char c)
+{
+  return c && c != '?' && c != '#';
+}
+
+static inline bool
+is_uri_query_char(const char c)
+{
+  return c && c != '#';
+}
+
 SerdURIView
 serd_parse_uri(const char* const string)
 {
@@ -101,112 +145,65 @@ serd_parse_uri(const char* const string)
   const char* ptr    = string;
 
   /* See http://tools.ietf.org/html/rfc3986#section-3
-     URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
-  */
+     URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] */
 
   /* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
   if (is_alpha(*ptr)) {
     for (char c = *++ptr; true; c = *++ptr) {
-      switch (c) {
-      case '\0':
-      case '/':
-      case '?':
-      case '#':
-        ptr = string;
-        goto path; // Relative URI (starts with path by definition)
-      case ':':
+      if (c == ':') {
         result.scheme.data   = string;
-        result.scheme.length = (size_t)((ptr++) - string);
-        goto maybe_authority; // URI with scheme
-      case '+':
-      case '-':
-      case '.':
-        continue;
-      default:
-        if (is_alpha(c) || is_digit(c)) {
-          continue;
-        }
+        result.scheme.length = (size_t)(ptr++ - string);
+        break;
+      }
+
+      if (!is_uri_scheme_char(c)) {
+        ptr = string;
+        break;
       }
     }
   }
 
-  /* S3.2: The authority component is preceded by a double slash ("//")
-     and is terminated by the next slash ("/"), question mark ("?"),
-     or number sign ("#") character, or by the end of the URI.
-  */
-maybe_authority:
+  /* S3.2: The authority component is preceded by "//" and is terminated by the
+     next '/', '?', or '#', or by the end of the URI. */
   if (*ptr == '/' && *(ptr + 1) == '/') {
     ptr += 2;
     result.authority.data = ptr;
-    for (char c = 0; (c = *ptr) != '\0'; ++ptr) {
-      switch (c) {
-      case '/':
-        goto path;
-      case '?':
-        goto query;
-      case '#':
-        goto fragment;
-      default:
-        ++result.authority.length;
-      }
+    while (is_uri_authority_char(*ptr)) {
+      ++result.authority.length;
+      ++ptr;
     }
   }
 
-  /* RFC3986 S3.3: The path is terminated by the first question mark ("?")
-     or number sign ("#") character, or by the end of the URI.
-  */
-path:
-  switch (*ptr) {
-  case '?':
-    goto query;
-  case '#':
-    goto fragment;
-  case '\0':
-    goto end;
-  default:
-    break;
-  }
-  result.path.data   = ptr;
-  result.path.length = 0;
-  for (char c = 0; (c = *ptr) != '\0'; ++ptr) {
-    switch (c) {
-    case '?':
-      goto query;
-    case '#':
-      goto fragment;
-    default:
+  /* S3.3: The path is terminated by the first '?' or '#', or by the end of the
+     URI. */
+  if (is_uri_path_char(*ptr)) {
+    result.path.data   = ptr++;
+    result.path.length = 1U;
+    while (is_uri_path_char(*ptr)) {
       ++result.path.length;
+      ++ptr;
     }
   }
 
-  /* RFC3986 S3.4: The query component is indicated by the first question
-     mark ("?") character and terminated by a number sign ("#") character
-     or by the end of the URI.
-  */
-query:
+  /* S3.4: The query component is indicated by the first '?' and terminated by
+     a '#' or by the end of the URI. */
   if (*ptr == '?') {
     result.query.data = ++ptr;
-    for (char c = 0; (c = *ptr) != '\0'; ++ptr) {
-      if (c == '#') {
-        goto fragment;
-      }
+    while (is_uri_query_char(*ptr)) {
       ++result.query.length;
+      ++ptr;
     }
   }
 
-  /* RFC3986 S3.5: A fragment identifier component is indicated by the
-     presence of a number sign ("#") character and terminated by the end
-     of the URI.
-  */
-fragment:
+  /* S3.5: A fragment identifier component is indicated by the presence of a
+     '#' and terminated by the end of the URI. */
   if (*ptr == '#') {
     result.fragment.data = ptr;
-    while (*ptr++ != '\0') {
+    while (*ptr++) {
       ++result.fragment.length;
     }
   }
 
-end:
   return result;
 }
 
@@ -495,7 +492,7 @@ serd_write_uri(const SerdURIView   uri,
 }
 
 static bool
-is_uri_path_char(const char c)
+is_unescaped_uri_path_char(const char c)
 {
   if (is_alpha(c) || is_digit(c)) {
     return true;
@@ -563,7 +560,7 @@ serd_write_file_uri(const SerdStringView path,
   for (size_t i = 0; i < path.length; ++i) {
     if (path.data[i] == '%') {
       len += sink("%%", 1, 2, stream);
-    } else if (is_uri_path_char(path.data[i])) {
+    } else if (is_unescaped_uri_path_char(path.data[i])) {
       len += sink(path.data + i, 1, 1, stream);
 #ifdef _WIN32
     } else if (path.data[i] == '\\') {
author	David Robillard <d@drobilla.net>	2023-02-06 07:43:36 -0500
committer	David Robillard <d@drobilla.net>	2023-12-02 18:49:08 -0500
commit	b992fe3ef83e102a999084070214b8295f824f6a (patch)
tree	20f702125b4a257200bbcffbe178118b8d577352
parent	4cf33db925fbd8bea0defeb34e1ed6575349e644 (diff)
download	serd-b992fe3ef83e102a999084070214b8295f824f6a.tar.gz serd-b992fe3ef83e102a999084070214b8295f824f6a.tar.bz2 serd-b992fe3ef83e102a999084070214b8295f824f6a.zip