From 3a5742ffd2e9a34b10af9a87748a7f1a7e94e62d Mon Sep 17 00:00:00 2001
From: Robert O'Callahan
Date: Tue, 19 Aug 2025 03:21:54 +0000
Subject: [PATCH] Improve commutative hashing.

The simple XOR `commutative_eat()` implementation produces a lot of collisions.
https://www.preprints.org/manuscript/201710.0192/v1/download is a useful reference on this topic.

Running the included `hashTest.cc` without the hashlib changes, I get 49,580,349 collisions.
The 49,995,000 (i,j) pairs (0 <= i < 10000, i < j < 10000) hash into only 414,651 unique hash values.
We get simple collisions like (0,1) colliding with (2,3).

With the hashlib changes, we get only 707,099 collisions and 49,287,901 unique hash values. Much better!
The `commutative_hash` implementation corresponds to `Sum(4)` in the paper mentioned above.
---
 kernel/hashlib.h              | 39 ++++++++++++++++++++++++------
 kernel/yosys_common.h         |  1 +
 tests/unit/kernel/hashTest.cc | 45 +++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 7 deletions(-)
 create mode 100644 tests/unit/kernel/hashTest.cc

diff --git a/kernel/hashlib.h b/kernel/hashlib.h
index 7a5650fa3..b43b7302e 100644
--- a/kernel/hashlib.h
+++ b/kernel/hashlib.h
@@ -12,6 +12,7 @@
 #ifndef HASHLIB_H
 #define HASHLIB_H

+#include <array>
 #include
 #include
 #include
@@ -100,7 +101,7 @@ private:
 		uint32_t hash = ((a << 5) + a) ^ b;
 		return hash;
 	}
-	public:
+public:
 	void hash32(uint32_t i) {
 		state = djb2_xor(i, state);
 		state = mkhash_xorshift(fudge ^ state);
@@ -127,6 +128,7 @@ private:
 		*this = hash_ops::hash_into(t, *this);
 	}

+	[[deprecated]]
 	void commutative_eat(hash_t t) {
 		state ^= t;
 	}
@@ -356,6 +358,29 @@ template> class idict;
 template> class pool;
 template> class mfp;

+// Computes the hash value of an unordered set of elements.
+// See https://www.preprints.org/manuscript/201710.0192/v1/download.
+// This is the Sum(4) algorithm from that paper, which has good collision resistance,
+// much better than Sum(1) or Xor(1) (and somewhat better than Xor(4)).
+class commutative_hash {
+public:
+	commutative_hash() {
+		buckets.fill(0); // start every bucket sum at zero
+	}
+	void eat(Hasher h) { // folds one element's hash into the running bucket sums
+		Hasher::hash_t v = h.yield();
+		size_t index = v & (buckets.size() - 1); // low bits select the bucket; mask needs a power-of-two size
+		buckets[index] += v; // addition commutes, so the order of eat() calls cannot change the sums
+	}
+	[[nodiscard]] Hasher hash_into(Hasher h) const { // feeds each bucket sum into h and returns the result
+		for (auto b : buckets)
+			h.eat(b);
+		return h;
+	}
+private:
+	std::array<Hasher::hash_t, 4> buckets; // Sum(4): four running sums of element hashes
+};
+
 template
 class dict {
 	struct entry_t
@@ -801,14 +826,14 @@ public:
 	}

 	[[nodiscard]] Hasher hash_into(Hasher h) const {
+		commutative_hash comm;
 		for (auto &it : entries) {
 			Hasher entry_hash;
 			entry_hash.eat(it.udata.first);
 			entry_hash.eat(it.udata.second);
-			h.commutative_eat(entry_hash.yield());
+			comm.eat(entry_hash);
 		}
-		h.eat(entries.size());
-		return h;
+		return comm.hash_into(h);
 	}

 	void reserve(size_t n) { entries.reserve(n); }
@@ -1184,11 +1209,11 @@ public:
 	}

 	[[nodiscard]] Hasher hash_into(Hasher h) const {
+		commutative_hash comm;
 		for (auto &it : entries) {
-			h.commutative_eat(ops.hash(it.udata).yield());
+			comm.eat(ops.hash(it.udata));
 		}
-		h.eat(entries.size());
-		return h;
+		return comm.hash_into(h);
 	}

 	void reserve(size_t n) { entries.reserve(n); }
diff --git a/kernel/yosys_common.h b/kernel/yosys_common.h
index ecc8ce623..bc92e7869 100644
--- a/kernel/yosys_common.h
+++ b/kernel/yosys_common.h
@@ -20,6 +20,7 @@
 #ifndef YOSYS_COMMON_H
 #define YOSYS_COMMON_H

+#include
 #include
 #include
 #include
diff --git a/tests/unit/kernel/hashTest.cc b/tests/unit/kernel/hashTest.cc
new file mode 100644
index 000000000..6e4610ec8
--- /dev/null
+++ b/tests/unit/kernel/hashTest.cc
@@ -0,0 +1,45 @@
+#include <gtest/gtest.h>
+#include "kernel/yosys_common.h"
+
+#include <unordered_set>
+
+YOSYS_NAMESPACE_BEGIN
+
+static Hasher hash(int x)
+{
+	Hasher h;
+	h.eat(x);
+	return h;
+}
+
+TEST(CommutativeTest, basic)
+{
+	hashlib::commutative_hash comm1;
+	comm1.eat(hash(1));
+	comm1.eat(hash(2));
+	hashlib::commutative_hash comm2;
+	comm2.eat(hash(2));
+	comm2.eat(hash(1));
+	EXPECT_EQ(comm1.hash_into(Hasher()).yield(), comm2.hash_into(Hasher()).yield());
+}
+
+TEST(PoolHashTest, collisions)
+{
+	uint64_t collisions = 0;
+	std::unordered_set<Hasher::hash_t> hashes; // distinct pool hashes seen so far
+	for (int i = 0; i < 10000; ++i) {
+		for (int j = i + 1; j < 10000; ++j) { // j > i, so each two-element set is built once
+			pool<int> p1;
+			p1.insert(i);
+			p1.insert(j);
+			auto h = p1.hash_into(Hasher()).yield();
+			if (!hashes.insert(h).second) { // .second is false when h was already in the set
+				++collisions;
+			}
+		}
+	}
+	std::cout << "pool collisions: " << collisions << std::endl;
+	EXPECT_LT(collisions, 1000000);
+}
+
+YOSYS_NAMESPACE_END