From d0276169d32f93a201c24dc774d1f8babfaf39d2 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Wed, 28 Jan 2026 18:59:35 +0000 Subject: [PATCH] Add `ShardedHashSet` We'll use this later in the PR. --- kernel/threading.h | 171 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/kernel/threading.h b/kernel/threading.h index 32143f59e..c1897de10 100644 --- a/kernel/threading.h +++ b/kernel/threading.h @@ -391,6 +391,177 @@ private: Buckets buckets; }; +template +struct DefaultCollisionHandler { + void operator()(typename V::Accumulated &, typename V::Accumulated &) const {} +}; + +// A hashtable that can be efficiently built in parallel and then looked up concurrently. +// `V` is the type of elements that will be added to the hashtable. It must have a +// member type `Accumulated` representing the combination of multiple `V` elements. This +// can be the same as `V`, but for example `V` could contain a Wire* and `V::Accumulated` +// could contain a `pool`. `KeyEquality` is a class containing an `operator()` that +// returns true of two `V` elements have equal keys. +// `CollisionHandler` is used to reduce two `V::Accumulated` values into a single value. +// +// To use this, first construct a `Builder` and fill it in (in parallel), then construct +// a `ShardedHashSet` from the `Builder`. +template > +class ShardedHashSet { +public: + // A combination of a `V` and its hash value. + struct Value { + Value(V value, unsigned int hash) : value(std::move(value)), hash(hash) {} + Value(Value &&) = default; + Value(const Value &) = delete; + Value &operator=(const Value &) = delete; + V value; + unsigned int hash; + }; + // A combination of a `V::Accumulated` and its hash value. + struct AccumulatedValue { + AccumulatedValue(typename V::Accumulated value, unsigned int hash) : value(std::move(value)), hash(hash) {} + AccumulatedValue(AccumulatedValue &&) = default; +#if defined(_MSC_VER) + AccumulatedValue(const AccumulatedValue &) { + log_error("Copy constructor called on AccumulatedValue"); + } + AccumulatedValue &operator=(const AccumulatedValue &) { + log_error("Copy assignment called on AccumulatedValue"); + return *this; + } +#else + AccumulatedValue(const AccumulatedValue &) = delete; + AccumulatedValue &operator=(const AccumulatedValue &) = delete; +#endif + typename V::Accumulated value; + unsigned int hash; + }; + // A class containing an `operator()` that returns true of two `AccumulatedValue` + // elements have equal keys. + // Required to insert `AccumulatedValue`s into an `std::unordered_set`. + struct AccumulatedValueEquality { + KeyEquality inner; + AccumulatedValueEquality(const KeyEquality &inner) : inner(inner) {} + bool operator()(const AccumulatedValue &v1, const AccumulatedValue &v2) const { + return inner(v1.value, v2.value); + } + }; + // A class containing an `operator()` that returns the hash value of an `AccumulatedValue`. + // Required to insert `AccumulatedValue`s into an `std::unordered_set`. + struct AccumulatedValueHashOp { + size_t operator()(const AccumulatedValue &v) const { + return static_cast(v.hash); + } + }; + using Shard = std::unordered_set; + + // First construct one of these. Then populate it in parallel by calling `insert()` from many threads. + // Then do another parallel phase calling `process()` from many threads. + class Builder { + public: + Builder(const ParallelDispatchThreadPool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler()) + : collision_handler(std::move(collision_handler)) { + init(thread_pool.num_threads(), std::move(equality)); + } + Builder(const ParallelDispatchThreadPool::Subpool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler()) + : collision_handler(std::move(collision_handler)) { + init(thread_pool.num_threads(), std::move(equality)); + } + // First call `insert` to insert all elements. All inserts must finish + // before calling any `process()`. + void insert(const ThreadIndex &thread, Value v) { + // You might think that for the single-threaded case, we can optimize by + // inserting directly into the `std::unordered_set` here. But that slows things down + // a lot and I never got around to figuring out why. + std::vector> &buckets = all_buckets[thread.thread_num]; + size_t bucket = static_cast(v.hash) % buckets.size(); + buckets[bucket].emplace_back(std::move(v)); + } + // Then call `process` for each thread. All `process()`s must finish before using + // the `Builder` to construct a `ShardedHashSet`. + void process(const ThreadIndex &thread) { + int size = 0; + for (std::vector> &buckets : all_buckets) + size += GetSize(buckets[thread.thread_num]); + Shard &shard = shards[thread.thread_num]; + shard.reserve(size); + for (std::vector> &buckets : all_buckets) { + for (Value &value : buckets[thread.thread_num]) + accumulate(value, shard); + // Free as much memory as we can during the parallel phase. + std::vector().swap(buckets[thread.thread_num]); + } + } + private: + friend class ShardedHashSet; + void accumulate(Value &value, Shard &shard) { + // With C++20 we could make this more efficient using heterogenous lookup + AccumulatedValue accumulated_value{std::move(value.value), value.hash}; + auto [it, inserted] = shard.insert(std::move(accumulated_value)); + if (!inserted) + collision_handler(const_cast(it->value), accumulated_value.value); + } + void init(int num_threads, KeyEquality equality) { + all_buckets.resize(num_threads); + for (std::vector> &buckets : all_buckets) + buckets.resize(num_threads); + for (int i = 0; i < num_threads; ++i) + shards.emplace_back(0, AccumulatedValueHashOp(), AccumulatedValueEquality(equality)); + } + const CollisionHandler collision_handler; + std::vector>> all_buckets; + std::vector shards; + }; + + // Then finally construct the hashtable: + ShardedHashSet(Builder &builder) : shards(std::move(builder.shards)) { + // Check that all necessary 'process()' calls were made. + for (std::vector> &buckets : builder.all_buckets) + for (std::vector &bucket : buckets) + log_assert(bucket.empty()); + // Free memory. + std::vector>>().swap(builder.all_buckets); + } + ShardedHashSet(ShardedHashSet &&other) = default; + ShardedHashSet() {} + + ShardedHashSet &operator=(ShardedHashSet &&other) = default; + + // Look up by `AccumulatedValue`. If we switch to C++20 then we could use + // heterogenous lookup to support looking up by `Value` here. Returns nullptr + // if the key is not found. + const typename V::Accumulated *find(const AccumulatedValue &v) const { + size_t num_shards = shards.size(); + if (num_shards == 0) + return nullptr; + size_t shard = static_cast(v.hash) % num_shards; + auto it = shards[shard].find(v); + if (it == shards[shard].end()) + return nullptr; + return &it->value; + } + + // Insert an element into the table. The caller is responsible for ensuring this does not + // happen concurrently with any other method calls. + void insert(AccumulatedValue v) { + size_t num_shards = shards.size(); + if (num_shards == 0) + return; + size_t shard = static_cast(v.hash) % num_shards; + shards[shard].insert(v); + } + + // Call this for each shard to implement parallel destruction. For very large `ShardedHashSet`s, + // deleting all elements of all shards on a single thread can be a performance bottleneck. + void clear(const ThreadIndex &shard) { + AccumulatedValueEquality equality = shards[0].key_eq(); + shards[shard.thread_num] = Shard(0, AccumulatedValueHashOp(), equality); + } +private: + std::vector shards; +}; + YOSYS_NAMESPACE_END #endif // YOSYS_THREADING_H