Diffstat (limited to 'src/digest.c')
-rw-r--r--  src/digest.c  272
1 file changed, 172 insertions, 100 deletions
diff --git a/src/digest.c b/src/digest.c
index c14a3f5..32cae63 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -1,5 +1,5 @@
/*
- Copyright 2012-2020 David Robillard <d@drobilla.net>
+ Copyright 2012-2021 David Robillard <d@drobilla.net>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
@@ -16,150 +16,222 @@
#include "zix/digest.h"
-#ifdef __SSE4_2__
-# include <smmintrin.h>
-#endif
-
#include <assert.h>
#include <stdint.h>
+#include <string.h>
-#ifdef __SSE4_2__
-
-// SSE 4.2 CRC32
+#if defined(__clang__) && __clang_major__ >= 12
+# define FALLTHROUGH() __attribute__((fallthrough))
+#elif defined(__GNUC__) && !defined(__clang__)
+# define FALLTHROUGH() __attribute__((fallthrough))
+#else
+# define FALLTHROUGH()
+#endif
-uint32_t
-zix_digest_start(void)
-{
- return 1;
-}
+/*
+ 64-bit hash: Essentially fasthash64, implemented here in an aligned/padded
+ and a general UB-free variant.
+*/
static inline uint64_t
-load64(const void* const mem)
+mix64(uint64_t h)
{
- uint64_t r = 0u;
- memcpy(&r, mem, sizeof(r));
- return r;
+ h ^= h >> 23u;
+ h *= 0x2127599BF4325C37ull;
+ h ^= h >> 47u;
+ return h;
}
-static inline uint32_t
-load32(const void* const mem)
+uint64_t
+zix_digest64(const uint64_t seed, const void* const key, const size_t len)
{
- uint32_t r = 0u;
- memcpy(&r, mem, sizeof(r));
- return r;
-}
+ static const uint64_t m = 0x880355F21E6D1965ull;
-static inline uint16_t
-load16(const void* const mem)
-{
- uint16_t r = 0u;
- memcpy(&r, mem, sizeof(r));
- return r;
-}
+ const size_t n_blocks = len / sizeof(uint64_t);
+ const uint8_t* const data = (const uint8_t*)key;
+ const uint8_t* const blocks_end = data + (n_blocks * sizeof(uint64_t));
-uint32_t
-zix_digest_add(uint32_t hash, const void* const buf, const size_t len)
-{
- const uint8_t* str = (const uint8_t*)buf;
+ uint64_t h = seed ^ (len * m);
+ for (size_t i = 0u; i < n_blocks; ++i) {
+ uint64_t k = 0u;
+ memcpy(&k, data, sizeof(uint64_t));
-# ifdef __x86_64__
- for (size_t i = 0; i < (len / sizeof(uint64_t)); ++i) {
- hash = (uint32_t)_mm_crc32_u64(hash, load64(str));
- str += sizeof(uint64_t);
- }
- if (len & sizeof(uint32_t)) {
- hash = _mm_crc32_u32(hash, load32(str));
- str += sizeof(uint32_t);
- }
-# else
- for (size_t i = 0; i < (len / sizeof(uint32_t)); ++i) {
- hash = _mm_crc32_u32(hash, load32(str));
- str += sizeof(uint32_t);
+ h ^= mix64(k);
+ h *= m;
}
-# endif
- if (len & sizeof(uint16_t)) {
- hash = _mm_crc32_u16(hash, load16(str));
- str += sizeof(uint16_t);
- }
- if (len & sizeof(uint8_t)) {
- hash = _mm_crc32_u8(hash, *str);
+
+ const uint8_t* const tail = blocks_end;
+ uint64_t v = 0u;
+
+ switch (len & 7u) {
+ case 7:
+ v |= (uint64_t)tail[6] << 48u;
+ FALLTHROUGH();
+ case 6:
+ v |= (uint64_t)tail[5] << 40u;
+ FALLTHROUGH();
+ case 5:
+ v |= (uint64_t)tail[4] << 32u;
+ FALLTHROUGH();
+ case 4:
+ v |= (uint64_t)tail[3] << 24u;
+ FALLTHROUGH();
+ case 3:
+ v |= (uint64_t)tail[2] << 16u;
+ FALLTHROUGH();
+ case 2:
+ v |= (uint64_t)tail[1] << 8u;
+ FALLTHROUGH();
+ case 1:
+ v |= (uint64_t)tail[0];
+
+ h ^= mix64(v);
+ h *= m;
}
- return hash;
+ return mix64(h);
}
-uint32_t
-zix_digest_add_64(uint32_t hash, const void* const buf, const size_t len)
+uint64_t
+zix_digest64_aligned(const uint64_t seed, const void* const key, size_t len)
{
- assert((uintptr_t)buf % sizeof(uint64_t) == 0);
- assert(len % sizeof(uint64_t) == 0);
+ static const uint64_t m = 0x880355F21E6D1965ull;
-# ifdef __x86_64__
- const uint64_t* ptr = (const uint64_t*)buf;
+ assert((uintptr_t)key % sizeof(uint64_t) == 0u);
+ assert(len % sizeof(uint64_t) == 0u);
- for (size_t i = 0; i < (len / sizeof(uint64_t)); ++i) {
- hash = (uint32_t)_mm_crc32_u64(hash, *ptr);
- ++ptr;
- }
-
- return hash;
-# else
- const uint32_t* ptr = (const uint32_t*)buf;
+ const uint64_t* const blocks = (const uint64_t*)key;
+ const size_t n_blocks = len / sizeof(uint64_t);
+ uint64_t h = seed ^ (len * m);
- for (size_t i = 0; i < (len / sizeof(uint32_t)); ++i) {
- hash = _mm_crc32_u32(hash, *ptr);
- ++ptr;
+ for (size_t i = 0u; i < n_blocks; ++i) {
+ h ^= mix64(blocks[i]);
+ h *= m;
}
- return hash;
-# endif
+ return mix64(h);
}
-uint32_t
-zix_digest_add_ptr(const uint32_t hash, const void* const ptr)
-{
-# ifdef __x86_64__
- return (uint32_t)_mm_crc32_u64(hash, (uintptr_t)ptr);
-# else
- return _mm_crc32_u32(hash, (uintptr_t)ptr);
-# endif
-}
+/*
+ 32-bit hash: Essentially murmur3, reimplemented here in an aligned/padded and
+ a general UB-free variant.
+*/
-#else
+/**
+ Rotate left by some count of bits.
-// Classic DJB hash
+ This is recognized by any halfway decent compiler and compiled to a single
+ instruction on architectures that have one.
+*/
+static inline uint32_t
+rotl32(const uint32_t val, const uint32_t bits)
+{
+ return ((val << bits) | (val >> (32 - bits)));
+}
-uint32_t
-zix_digest_start(void)
+static inline uint32_t
+mix32(uint32_t h)
{
- return 5381;
+ h ^= h >> 16u;
+ h *= 0x85EBCA6Bu;
+ h ^= h >> 13u;
+ h *= 0xC2B2AE35u;
+ h ^= h >> 16u;
+ return h;
}
uint32_t
-zix_digest_add(uint32_t hash, const void* const buf, const size_t len)
+zix_digest32(const uint32_t seed, const void* const key, const size_t len)
{
- const uint8_t* str = (const uint8_t*)buf;
+ static const uint32_t c1 = 0xCC9E2D51u;
+ static const uint32_t c2 = 0x1B873593u;
+
+ // Process as many 32-bit blocks as possible
+ const size_t n_blocks = len / sizeof(uint32_t);
+ const uint8_t* data = (const uint8_t*)key;
+ const uint8_t* const blocks_end = data + (n_blocks * sizeof(uint32_t));
+ uint32_t h = seed;
+ for (; data != blocks_end; data += sizeof(uint32_t)) {
+ uint32_t k = 0u;
+ memcpy(&k, data, sizeof(uint32_t));
+
+ k *= c1;
+ k = rotl32(k, 15);
+ k *= c2;
+
+ h ^= k;
+ h = rotl32(h, 13);
+ h = h * 5u + 0xE6546B64u;
+ }
- for (size_t i = 0; i < len; ++i) {
- hash = (hash << 5u) + hash + str[i];
+ // Process any trailing bytes
+ uint32_t k = 0u;
+ switch (len & 3u) {
+ case 3u:
+ k ^= (uint32_t)data[2u] << 16u;
+ FALLTHROUGH();
+ case 2u:
+ k ^= (uint32_t)data[1u] << 8u;
+ FALLTHROUGH();
+ case 1u:
+ k ^= (uint32_t)data[0u];
+
+ k *= c1;
+ k = rotl32(k, 15u);
+ k *= c2;
+ h ^= k;
}
- return hash;
+ return mix32(h ^ (uint32_t)len);
}
uint32_t
-zix_digest_add_64(uint32_t hash, const void* const buf, const size_t len)
+zix_digest32_aligned(const uint32_t seed,
+ const void* const key,
+ const size_t len)
{
- assert((uintptr_t)buf % sizeof(uint64_t) == 0);
- assert(len % sizeof(uint64_t) == 0);
+ static const uint32_t c1 = 0xCC9E2D51u;
+ static const uint32_t c2 = 0x1B873593u;
- return zix_digest_add(hash, buf, len);
+ assert((uintptr_t)key % sizeof(uint32_t) == 0u);
+ assert(len % sizeof(uint32_t) == 0u);
+
+ const uint32_t* const blocks = (const uint32_t*)key;
+ const size_t n_blocks = len / sizeof(uint32_t);
+ uint32_t h = seed;
+ for (size_t i = 0u; i < n_blocks; ++i) {
+ uint32_t k = blocks[i];
+
+ k *= c1;
+ k = rotl32(k, 15);
+ k *= c2;
+
+ h ^= k;
+ h = rotl32(h, 13);
+ h = h * 5u + 0xE6546B64u;
+ }
+
+ return mix32(h ^ (uint32_t)len);
}
-uint32_t
-zix_digest_add_ptr(const uint32_t hash, const void* const ptr)
+// Native word size wrapper
+
+size_t
+zix_digest(const size_t seed, const void* const buf, const size_t len)
{
- return zix_digest_add(hash, &ptr, sizeof(ptr));
+#if UINTPTR_MAX >= UINT64_MAX
+ return zix_digest64(seed, buf, len);
+#else
+ return zix_digest32(seed, buf, len);
+#endif
}
+size_t
+zix_digest_aligned(const size_t seed, const void* const buf, const size_t len)
+{
+#if UINTPTR_MAX >= UINT64_MAX
+ return zix_digest64_aligned(seed, buf, len);
+#else
+ return zix_digest32_aligned(seed, buf, len);
#endif
+}
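
For reference, a minimal caller of the new API might look like the sketch below. It is not part of this change; it assumes the declarations in zix/digest.h match the definitions above, and the zero seeds and sample data are arbitrary.

#include "zix/digest.h"

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
  const char* const text = "hello, world";
  const size_t      len  = strlen(text);

  // Fixed-width variants accept any length and alignment
  const uint32_t h32 = zix_digest32(0u, text, len);
  const uint64_t h64 = zix_digest64(0u, text, len);

  // Native word size wrapper dispatches to one of the above
  const size_t h = zix_digest(0u, text, len);

  // The *_aligned variants require word-aligned, word-multiple input
  const uint32_t words[2] = {1u, 2u};
  const uint32_t ha       = zix_digest32_aligned(0u, words, sizeof(words));

  printf("%08" PRIx32 " %016" PRIx64 " %zx %08" PRIx32 "\n", h32, h64, h, ha);
  return 0;
}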