From 81fd03c6e4fca1f334a0716a700bfeac6dfbc36e Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:13:12 +0000
Subject: [PATCH 01/26] Add `IdString::unescape()` method

We've already talked about adding this as an alternative to `log_id()`, and we'll
need it later in this PR.
---
 kernel/rtlil.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index fea53081e..6a026352e 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -275,6 +275,17 @@ struct RTLIL::IdString
 		*out += std::to_string(-index_);
 	}
 
+	std::string unescape() const {
+		if (index_ < 0) {
+			// Must start with "$auto$" so no unescaping required.
+			return str();
+		}
+		std::string_view str = global_id_storage_.at(index_).str_view();
+		if (str.size() < 2 || str[0] != '\\' || str[1] == '$' || str[1] == '\\' || (str[1] >= '0' && str[1] <= '9'))
+			return std::string(str);
+		return std::string(str.substr(1));
+	}
+
 	class Substrings {
 		std::string_view first_;
 		int suffix_number;
@@ -758,7 +769,7 @@ namespace RTLIL {
 	}
 
 	static inline std::string unescape_id(RTLIL::IdString str) {
-		return unescape_id(str.str());
+		return str.unescape();
 	}
 
 	static inline const char *id2cstr(RTLIL::IdString str) {

From 3b563b877e0e867567b1bd54507bd4b07b81d197 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:14:46 +0000
Subject: [PATCH 02/26] Make `log_error()` work in a `Multithreaded` context.

`log_error()` causes an exit so we don't have to try too hard here. The main
thing is to ensure that we normally are able to exit without causing a stack
overflow due to recursive asserts about not being in a `Multithreaded` context.
---
 kernel/log.cc | 14 ++++++++++++--
 kernel/log.h  |  7 +------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/kernel/log.cc b/kernel/log.cc
index 018a19081..b114f1eaf 100644
--- a/kernel/log.cc
+++ b/kernel/log.cc
@@ -324,6 +324,14 @@ void log_formatted_file_info(std::string_view filename, int lineno, std::string
 	log("%s:%d: Info: %s", filename, lineno, str);
 }
 
+void log_suppressed() {
+	if (log_debug_suppressed && !log_make_debug) {
+		constexpr const char* format = "<suppressed ~%d debug messages>\n";
+		logv_string(format, stringf(format, log_debug_suppressed));
+		log_debug_suppressed = 0;
+	}
+}
+
 [[noreturn]]
 static void log_error_with_prefix(std::string_view prefix, std::string str)
 {
@@ -345,7 +353,9 @@ static void log_error_with_prefix(std::string_view prefix, std::string str)
 	}
 
 	log_last_error = std::move(str);
-	log("%s%s", prefix, log_last_error);
+	std::string message(prefix);
+	message += log_last_error;
+	logv_string("%s%s", message);
 	log_flush();
 
 	log_make_debug = bak_log_make_debug;
@@ -355,7 +365,7 @@ static void log_error_with_prefix(std::string_view prefix, std::string str)
 			item.current_count++;
 
 	for (auto &[_, item] : log_expect_prefix_error)
-		if (std::regex_search(string(prefix) + string(log_last_error), item.pattern))
+		if (std::regex_search(message, item.pattern))
 			item.current_count++;
 
 	log_check_expected();
diff --git a/kernel/log.h b/kernel/log.h
index 63faf7091..d132ba1a0 100644
--- a/kernel/log.h
+++ b/kernel/log.h
@@ -206,12 +206,7 @@ template <typename... Args>
 	log_formatted_cmd_error(fmt.format(args...));
 }
 
-static inline void log_suppressed() {
-	if (log_debug_suppressed && !log_make_debug) {
-		log("<suppressed ~%d debug messages>\n", log_debug_suppressed);
-		log_debug_suppressed = 0;
-	}
-}
+void log_suppressed();
 
 struct LogMakeDebugHdl {
 	bool status = false;

From 9e523e2fd7817b21fb9582fc82f311d6d8889667 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:16:24 +0000
Subject: [PATCH 03/26] Work around `std::reverse` miscompilation with empty
 range

This causes problems when compiling with fuzzing instrumentation enabled.
---
 frontends/rtlil/rtlil_frontend.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/frontends/rtlil/rtlil_frontend.cc b/frontends/rtlil/rtlil_frontend.cc
index a1412d983..7e2ec5460 100644
--- a/frontends/rtlil/rtlil_frontend.cc
+++ b/frontends/rtlil/rtlil_frontend.cc
@@ -286,6 +286,7 @@ struct RTLILFrontendWorker {
 		if (width > MAX_CONST_WIDTH)
 			error("Constant width %lld out of range before `%s`.", width, error_token());
 		bits.reserve(width);
+		int start_idx = idx;
 		while (true) {
 			RTLIL::State bit;
 			switch (line[idx]) {
@@ -300,8 +301,9 @@ struct RTLILFrontendWorker {
 			bits.push_back(bit);
 			++idx;
 		}
-		done:
-		std::reverse(bits.begin(), bits.end());
+	done:
+		if (start_idx < idx)
+			std::reverse(bits.begin(), bits.end());
 
 		if (GetSize(bits) > width)
 			bits.resize(width);

From fb24763a15e6be34782e147f04d03e6030f36b4f Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:50:23 +0000
Subject: [PATCH 04/26] Add `work_pool_size`, `IntRange`,
 `item_range_for_worker`, and `ThreadIndex`

We'll use these later in this PR.
---
 kernel/threading.cc | 35 +++++++++++++++++++++++++++++++++++
 kernel/threading.h  | 29 +++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/kernel/threading.cc b/kernel/threading.cc
index dcc044c89..8c9bfb390 100644
--- a/kernel/threading.cc
+++ b/kernel/threading.cc
@@ -17,6 +17,20 @@ static int get_max_threads()
 	return max_threads;
 }
 
+static int init_work_units_per_thread_override()
+{
+	const char *v = getenv("YOSYS_WORK_UNITS_PER_THREAD");
+	if (v == nullptr)
+		return 0;
+	return atoi(v);
+}
+
+static int get_work_units_per_thread_override()
+{
+	static int work_units_per_thread = init_work_units_per_thread_override();
+	return work_units_per_thread;
+}
+
 void DeferredLogs::flush()
 {
 	for (auto &m : logs)
@@ -37,6 +51,14 @@ int ThreadPool::pool_size(int reserved_cores, int max_worker_threads)
 #endif
 }
 
+int ThreadPool::work_pool_size(int reserved_cores, int work_units, int work_units_per_thread)
+{
+	int override_value = get_work_units_per_thread_override();
+	int divisor = override_value > 0 ? override_value : work_units_per_thread;
+	// Clamp a non-positive divisor to 1 so the division below is always defined.
+	return pool_size(reserved_cores, work_units / (divisor > 0 ? divisor : 1));
+}
+
 ThreadPool::ThreadPool(int pool_size, std::function<void(int)> b)
 	: body(std::move(b))
 {
@@ -57,4 +79,17 @@ ThreadPool::~ThreadPool()
 #endif
 }
 
+IntRange item_range_for_worker(int num_items, int thread_num, int num_threads)
+{
+	if (num_threads <= 1) {
+		return {0, num_items};
+	}
+	int items_per_thread = num_items / num_threads;
+	int extra_items = num_items % num_threads;
+	// The first `extra_items` threads get one extra item.
+	int start = thread_num * items_per_thread + std::min(thread_num, extra_items);
+	int end = (thread_num + 1) * items_per_thread + std::min(thread_num + 1, extra_items);
+	return {start, end};
+}
+
 YOSYS_NAMESPACE_END
diff --git a/kernel/threading.h b/kernel/threading.h
index b8cd62f87..eb068bb20 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -131,6 +131,11 @@ public:
 	// The result may be 0.
 	static int pool_size(int reserved_cores, int max_worker_threads);
 
+	// Computes the number of worker threads to use, by dividing work_units among threads.
+	// For testing purposes you can set YOSYS_WORK_UNITS_PER_THREAD to override `work_units_per_thread`.
+	// The result may be 0.
+	static int work_pool_size(int reserved_cores, int work_units, int work_units_per_thread);
+
 	// Create a pool of threads running the given closure (parameterized by thread number).
 	// `pool_size` must be the result of a `pool_size()` call.
 	ThreadPool(int pool_size, std::function<void(int)> b);
@@ -154,6 +159,30 @@ private:
 #endif
 };
 
+// A range of integers [start_, end_) that can be iterated over with a
+// C++ range-based for loop.
+struct IntRange {
+	int start_;
+	int end_;
+	struct Int {
+		int v;
+		int operator*() const { return v; }
+		Int &operator++() { ++v; return *this; }
+		bool operator!=(const Int &other) const { return v != other.v; }
+	};
+	Int begin() const { return {start_}; }
+	Int end() const { return {end_}; }
+};
+// Divides some number of items into `num_threads` subranges and returns the
+// `thread_num`'th subrange. If `num_threads` is at most one, returns the whole range.
+IntRange item_range_for_worker(int num_items, int thread_num, int num_threads);
+
+// A type that encapsulates the index of a thread in some list of threads. Useful for
+// stronger typechecking and code readability.
+struct ThreadIndex {
+	int thread_num;
+};
+
 template <class T>
 class ConcurrentStack
 {

From 000470817732a3a8a3999ccb1e3804ca78503327 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:52:17 +0000
Subject: [PATCH 05/26] Add `ParallelDispatchThreadPool`

We'll use this later in the PR.
---
 kernel/threading.cc |  55 +++++++++++++++++++++++
 kernel/threading.h  | 107 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)

diff --git a/kernel/threading.cc b/kernel/threading.cc
index 8c9bfb390..3766c4ddf 100644
--- a/kernel/threading.cc
+++ b/kernel/threading.cc
@@ -92,4 +92,59 @@ IntRange item_range_for_worker(int num_items, int thread_num, int num_threads)
 	return {start, end};
 }
 
+ParallelDispatchThreadPool::ParallelDispatchThreadPool(int pool_size)
+		: num_worker_threads_(std::max(1, pool_size) - 1)
+{
+#ifdef YOSYS_ENABLE_THREADS
+	main_to_workers_signal.resize(num_worker_threads_, 0);
+#endif
+	// Don't start the threads until we've constructed all our data members.
+	thread_pool = std::make_unique<ThreadPool>(num_worker_threads_, [this](int thread_num){
+		run_worker(thread_num);
+	});
+}
+
+ParallelDispatchThreadPool::~ParallelDispatchThreadPool()
+{
+#ifdef YOSYS_ENABLE_THREADS
+	if (num_worker_threads_ == 0)
+		return;
+	current_work = nullptr;
+	num_active_worker_threads_ = num_worker_threads_;
+	signal_workers_start();
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run(std::function<void(const RunCtx &)> work, int max_threads)
+{
+	Multithreading multithreading;
+	num_active_worker_threads_ = num_threads(max_threads) - 1;
+	if (num_active_worker_threads_ == 0) {
+		work({{0}, 1});
+		return;
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	current_work = &work;
+	signal_workers_start();
+	work({{0}, num_active_worker_threads_ + 1});
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run_worker(int thread_num)
+{
+#ifdef YOSYS_ENABLE_THREADS
+	while (true)
+	{
+		worker_wait_for_start(thread_num);
+		if (current_work == nullptr)
+			break;
+		(*current_work)({{thread_num + 1}, num_active_worker_threads_ + 1});
+		signal_worker_done();
+	}
+	signal_worker_done();
+#endif
+}
+
 YOSYS_NAMESPACE_END
diff --git a/kernel/threading.h b/kernel/threading.h
index eb068bb20..da21a274e 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -183,6 +183,113 @@ struct ThreadIndex {
 	int thread_num;
 };
 
+// A set of threads with a `run()` API that runs a closure on all of the threads
+// and waits for all those closures to complete. This is a convenient way to implement
+// parallel algorithms that use barrier synchronization.
+class ParallelDispatchThreadPool
+{
+public:
+	// Create a pool that runs closures passed to `run()` on the calling thread plus worker threads.
+	// `pool_size` must be the result of a `pool_size()` call.
+	// `pool_size` can be zero, which we treat as 1.
+	ParallelDispatchThreadPool(int pool_size);
+	~ParallelDispatchThreadPool();
+
+	// For each thread running a closure, a `RunCtx` is passed to the closure. Currently
+	// it contains the thread index and the total number of threads. It can be passed
+	// directly to any APIs requiring a `ThreadIndex`.
+	struct RunCtx : public ThreadIndex {
+		int num_threads;
+		IntRange item_range(int num_items) const {
+			return item_range_for_worker(num_items, thread_num, num_threads);
+		}
+	};
+	// Sometimes we only want to activate a subset of the threads in the pool. This
+	// class provides a way to do that. It provides the same `num_threads()`
+	// and `run()` APIs as a `ParallelDispatchThreadPool`.
+	class Subpool {
+	public:
+		Subpool(ParallelDispatchThreadPool &parent, int max_threads)
+				: parent(parent), max_threads(max_threads) {}
+		// Returns the number of threads that will be used when calling `run()`.
+		int num_threads() const {
+			return parent.num_threads(max_threads);
+		}
+		void run(std::function<void(const RunCtx &)> work) {
+			parent.run(std::move(work), max_threads);
+		}
+		ParallelDispatchThreadPool &thread_pool() { return parent; }
+	private:
+		ParallelDispatchThreadPool &parent;
+		int max_threads;
+	};
+
+	// Run the `work` function in parallel on each thread in the pool (parameterized by
+	// thread number). Waits for all work functions to complete. Only one `run()` can be
+	// active at a time.
+	// Uses every thread in the pool; use a `Subpool` to limit the number of threads.
+	void run(std::function<void(const RunCtx &)> work) {
+		run(std::move(work), INT_MAX);
+	}
+
+	// Returns the number of threads that will be used when calling `run()`.
+	int num_threads() const {
+		return num_threads(INT_MAX);
+	}
+private:
+	friend class Subpool;
+
+	void run(std::function<void(const RunCtx &)> work, int max_threads);
+	int num_threads(int max_threads) const {
+		return std::min(num_worker_threads_ + 1, std::max(1, max_threads));
+	}
+	void run_worker(int thread_num);
+
+	std::unique_ptr<ThreadPool> thread_pool;
+	std::function<void(const RunCtx &)> *current_work = nullptr;
+	// Keeps a correct count even when threads are exiting.
+	int num_worker_threads_;
+	// The count of active worker threads for the current `run()`.
+	int num_active_worker_threads_ = 0;
+
+#ifdef YOSYS_ENABLE_THREADS
+	// Not especially efficient for large numbers of threads. Worker wakeup could scale
+	// better by conceptually organising workers into a tree and having workers wake
+	// up their children.
+	std::mutex main_to_workers_signal_mutex;
+	std::condition_variable main_to_workers_signal_cv;
+	std::vector<uint8_t> main_to_workers_signal;
+	void signal_workers_start() {
+		std::unique_lock lock(main_to_workers_signal_mutex);
+		std::fill(main_to_workers_signal.begin(), main_to_workers_signal.begin() + num_active_worker_threads_, 1);
+		// When `num_active_worker_threads_` is small compared to `num_worker_threads_`, we have a "thundering herd"
+		// problem here. Fixing that would add complexity so don't worry about it for now.
+		main_to_workers_signal_cv.notify_all();
+	}
+	void worker_wait_for_start(int thread_num) {
+		std::unique_lock lock(main_to_workers_signal_mutex);
+		main_to_workers_signal_cv.wait(lock, [this, thread_num] { return main_to_workers_signal[thread_num] > 0; });
+		main_to_workers_signal[thread_num] = 0;
+	}
+
+	std::atomic<int> done_workers = 0;
+	std::mutex workers_to_main_signal_mutex;
+	std::condition_variable workers_to_main_signal_cv;
+	void signal_worker_done() {
+		int d = done_workers.fetch_add(1, std::memory_order_release);
+		if (d + 1 == num_active_worker_threads_) {
+			std::unique_lock lock(workers_to_main_signal_mutex);
+			workers_to_main_signal_cv.notify_all();
+		}
+	}
+	void wait_for_workers_done() {
+		std::unique_lock lock(workers_to_main_signal_mutex);
+		workers_to_main_signal_cv.wait(lock, [this] { return done_workers.load(std::memory_order_acquire) == num_active_worker_threads_; });
+		done_workers.store(0, std::memory_order_relaxed);
+	}
+#endif
+};
+
 template <class T>
 class ConcurrentStack
 {

From b06e903906683761fe019b860ff0abb7ace158bd Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:58:09 +0000
Subject: [PATCH 06/26] Add `ShardedVector`

We'll use this later in the PR.
---
 kernel/threading.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index da21a274e..32143f59e 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -317,6 +317,80 @@ private:
 	std::vector<T> contents;
 };
 
+// A vector that is sharded into buckets, one per thread. This lets multiple threads write
+// efficiently to the vector without synchronization overhead. After all writers have
+// finished writing, the vector can be iterated over. The iteration order is deterministic:
+// all the elements written by thread 0 in the order it inserted them, followed by all elements
+// written by thread 1, etc.
+template <typename T>
+class ShardedVector {
+public:
+	ShardedVector(const ParallelDispatchThreadPool &thread_pool) {
+		init(thread_pool.num_threads());
+	}
+	ShardedVector(const ParallelDispatchThreadPool::Subpool &thread_pool) {
+		init(thread_pool.num_threads());
+	}
+
+	// Insert a value, passing the `ThreadIndex` of the writer thread.
+	// Parallel inserts with different `ThreadIndex` values are fine.
+	// Inserts must not run concurrently with any other methods (e.g.
+	// iteration or `empty()`.)
+	void insert(const ThreadIndex &thread, T value) {
+		buckets[thread.thread_num].emplace_back(std::move(value));
+	}
+
+	bool empty() const {
+		for (const std::vector<T> &bucket : buckets)
+			if (!bucket.empty())
+				return false;
+		return true;
+	}
+
+	using Buckets = std::vector<std::vector<T>>;
+	class iterator {
+	public:
+		iterator(typename Buckets::iterator bucket_it, typename Buckets::iterator bucket_end)
+			: bucket_it(std::move(bucket_it)), bucket_end(std::move(bucket_end)) {
+			if (this->bucket_it != this->bucket_end) // read the members: the parameters were moved from
+				inner_it = this->bucket_it->begin();
+			normalize();
+		}
+		T& operator*() const { return *inner_it.value(); }
+		iterator &operator++() {
+			++*inner_it;
+			normalize();
+			return *this;
+		}
+		bool operator!=(const iterator &other) const {
+			return bucket_it != other.bucket_it || inner_it != other.inner_it;
+		}
+	private:
+		void normalize() {
+			if (bucket_it == bucket_end)
+				return;
+			while (inner_it == bucket_it->end()) {
+				++bucket_it;
+				if (bucket_it == bucket_end) {
+					inner_it.reset();
+					return;
+				}
+				inner_it = bucket_it->begin();
+			}
+		}
+		std::optional<typename std::vector<T>::iterator> inner_it;
+		typename Buckets::iterator bucket_it;
+		typename Buckets::iterator bucket_end;
+	};
+	iterator begin() { return iterator(buckets.begin(), buckets.end()); }
+	iterator end() { return iterator(buckets.end(), buckets.end()); }
+private:
+	void init(int num_threads) {
+		buckets.resize(num_threads);
+	}
+	Buckets buckets;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From d0276169d32f93a201c24dc774d1f8babfaf39d2 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:59:35 +0000
Subject: [PATCH 07/26] Add `ShardedHashSet`

We'll use this later in the PR.
---
 kernel/threading.h | 171 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index 32143f59e..c1897de10 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -391,6 +391,177 @@ private:
 	Buckets buckets;
 };
 
+template <typename V>
+struct DefaultCollisionHandler {
+	void operator()(typename V::Accumulated &, typename V::Accumulated &) const {}
+};
+
+// A hashtable that can be efficiently built in parallel and then looked up concurrently.
+// `V` is the type of elements that will be added to the hashtable. It must have a
+// member type `Accumulated` representing the combination of multiple `V` elements. This
+// can be the same as `V`, but for example `V` could contain a Wire* and `V::Accumulated`
+// could contain a `pool<Wire*>`. `KeyEquality` is a class containing an `operator()` that
+// returns true if two `V` elements have equal keys.
+// `CollisionHandler` is used to reduce two `V::Accumulated` values into a single value.
+//
+// To use this, first construct a `Builder` and fill it in (in parallel), then construct
+// a `ShardedHashSet` from the `Builder`.
+template <typename V, typename KeyEquality, typename CollisionHandler = DefaultCollisionHandler<V>>
+class ShardedHashSet {
+public:
+	// A combination of a `V` and its hash value.
+	struct Value {
+		Value(V value, unsigned int hash) : value(std::move(value)), hash(hash) {}
+		Value(Value &&) = default;
+		Value(const Value &) = delete;
+		Value &operator=(const Value &) = delete;
+		V value;
+		unsigned int hash;
+	};
+	// A combination of a `V::Accumulated` and its hash value.
+	struct AccumulatedValue {
+		AccumulatedValue(typename V::Accumulated value, unsigned int hash) : value(std::move(value)), hash(hash) {}
+		AccumulatedValue(AccumulatedValue &&) = default;
+#if defined(_MSC_VER)
+		AccumulatedValue(const AccumulatedValue &) {
+			log_error("Copy constructor called on AccumulatedValue");
+		}
+		AccumulatedValue &operator=(const AccumulatedValue &) {
+			log_error("Copy assignment called on AccumulatedValue");
+			return *this;
+		}
+#else
+		AccumulatedValue(const AccumulatedValue &) = delete;
+		AccumulatedValue &operator=(const AccumulatedValue &) = delete;
+#endif
+		typename V::Accumulated value;
+		unsigned int hash;
+	};
+	// A class containing an `operator()` that returns true if two `AccumulatedValue`
+	// elements have equal keys.
+	// Required to insert `AccumulatedValue`s into an `std::unordered_set`.
+	struct AccumulatedValueEquality {
+		KeyEquality inner;
+		AccumulatedValueEquality(const KeyEquality &inner) : inner(inner) {}
+		bool operator()(const AccumulatedValue &v1, const AccumulatedValue &v2) const {
+			return inner(v1.value, v2.value);
+		}
+	};
+	// A class containing an `operator()` that returns the hash value of an `AccumulatedValue`.
+	// Required to insert `AccumulatedValue`s into an `std::unordered_set`.
+	struct AccumulatedValueHashOp {
+		size_t operator()(const AccumulatedValue &v) const {
+			return static_cast<size_t>(v.hash);
+		}
+	};
+	using Shard = std::unordered_set<AccumulatedValue, AccumulatedValueHashOp, AccumulatedValueEquality>;
+
+	// First construct one of these. Then populate it in parallel by calling `insert()` from many threads.
+	// Then do another parallel phase calling `process()` from many threads.
+	class Builder {
+	public:
+		Builder(const ParallelDispatchThreadPool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler())
+				: collision_handler(std::move(collision_handler)) {
+			init(thread_pool.num_threads(), std::move(equality));
+		}
+		Builder(const ParallelDispatchThreadPool::Subpool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler())
+				: collision_handler(std::move(collision_handler)) {
+			init(thread_pool.num_threads(), std::move(equality));
+		}
+		// First call `insert` to insert all elements. All inserts must finish
+		// before calling any `process()`.
+		void insert(const ThreadIndex &thread, Value v) {
+			// You might think that for the single-threaded case, we can optimize by
+			// inserting directly into the `std::unordered_set` here. But that slows things down
+			// a lot and I never got around to figuring out why.
+			std::vector<std::vector<Value>> &buckets = all_buckets[thread.thread_num];
+			size_t bucket = static_cast<size_t>(v.hash) % buckets.size();
+			buckets[bucket].emplace_back(std::move(v));
+		}
+		// Then call `process` for each thread. All `process()`s must finish before using
+		// the `Builder` to construct a `ShardedHashSet`.
+		void process(const ThreadIndex &thread) {
+			int size = 0;
+			for (std::vector<std::vector<Value>> &buckets : all_buckets)
+				size += GetSize(buckets[thread.thread_num]);
+			Shard &shard = shards[thread.thread_num];
+			shard.reserve(size);
+			for (std::vector<std::vector<Value>> &buckets : all_buckets) {
+				for (Value &value : buckets[thread.thread_num])
+					accumulate(value, shard);
+				// Free as much memory as we can during the parallel phase.
+				std::vector<Value>().swap(buckets[thread.thread_num]);
+			}
+		}
+	private:
+		friend class ShardedHashSet<V, KeyEquality, CollisionHandler>;
+		void accumulate(Value &value, Shard &shard) {
+			// With C++20 we could make this more efficient using heterogenous lookup
+			AccumulatedValue accumulated_value{std::move(value.value), value.hash};
+			auto [it, inserted] = shard.insert(std::move(accumulated_value));
+			if (!inserted)
+				collision_handler(const_cast<typename V::Accumulated &>(it->value), accumulated_value.value);
+		}
+		void init(int num_threads, KeyEquality equality) {
+			all_buckets.resize(num_threads);
+			for (std::vector<std::vector<Value>> &buckets : all_buckets)
+				buckets.resize(num_threads);
+			for (int i = 0; i < num_threads; ++i)
+				shards.emplace_back(0, AccumulatedValueHashOp(), AccumulatedValueEquality(equality));
+		}
+		const CollisionHandler collision_handler;
+		std::vector<std::vector<std::vector<Value>>> all_buckets;
+		std::vector<Shard> shards;
+	};
+
+	// Then finally construct the hashtable:
+	ShardedHashSet(Builder &builder) : shards(std::move(builder.shards)) {
+		// Check that all necessary 'process()' calls were made.
+		for (std::vector<std::vector<Value>> &buckets : builder.all_buckets)
+			for (std::vector<Value> &bucket : buckets)
+				log_assert(bucket.empty());
+		// Free memory.
+		std::vector<std::vector<std::vector<Value>>>().swap(builder.all_buckets);
+	}
+	ShardedHashSet(ShardedHashSet &&other) = default;
+	ShardedHashSet() {}
+
+	ShardedHashSet &operator=(ShardedHashSet &&other) = default;
+
+	// Look up by `AccumulatedValue`. If we switch to C++20 then we could use
+	// heterogenous lookup to support looking up by `Value` here. Returns nullptr
+	// if the key is not found.
+	const typename V::Accumulated *find(const AccumulatedValue &v) const {
+		size_t num_shards = shards.size();
+		if (num_shards == 0)
+			return nullptr;
+		size_t shard = static_cast<size_t>(v.hash) % num_shards;
+		auto it = shards[shard].find(v);
+		if (it == shards[shard].end())
+			return nullptr;
+		return &it->value;
+	}
+
+	// Insert an element into the table (a no-op if the table has no shards). `v` is moved into its
+	// shard. The caller must ensure this does not happen concurrently with any other method calls.
+	void insert(AccumulatedValue v) {
+		size_t num_shards = shards.size();
+		if (num_shards == 0)
+			return;
+		size_t shard = static_cast<size_t>(v.hash) % num_shards;
+		shards[shard].insert(std::move(v));
+	}
+
+	// Call this for each shard to implement parallel destruction. For very large `ShardedHashSet`s,
+	// deleting all elements of all shards on a single thread can be a performance bottleneck.
+	void clear(const ThreadIndex &shard) {
+		AccumulatedValueEquality equality = shards[0].key_eq();
+		shards[shard.thread_num] = Shard(0, AccumulatedValueHashOp(), equality);
+	}
+private:
+	std::vector<Shard> shards;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From b18ca8710eabcdbca527e19f52899ef361c25b97 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:00:47 +0000
Subject: [PATCH 08/26] Add `ConcurrentWorkQueue`

We'll use this later in the PR.
---
 kernel/threading.h | 106 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index c1897de10..3a5f5e820 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -562,6 +562,112 @@ private:
 	std::vector<Shard> shards;
 };
 
+// A concurrent work-queue that can share batches of work across threads.
+// Uses a naive implementation of work-stealing.
+template <typename T>
+class ConcurrentWorkQueue {
+public:
+	// Create a queue that supports the given number of threads and
+	// groups work into `batch_size` units.
+	ConcurrentWorkQueue(int num_threads, int batch_size = 100)
+		: batch_size(batch_size), thread_states(num_threads) {}
+	int num_threads() const { return GetSize(thread_states); }
+	// Push some work to do. Pushes and pops with the same `thread` must
+	// not happen concurrently.
+	void push(const ThreadIndex &thread, T work) {
+		ThreadState &thread_state = thread_states[thread.thread_num];
+		thread_state.next_batch.emplace_back(std::move(work));
+		if (GetSize(thread_state.next_batch) < batch_size)
+			return;
+		bool was_empty;
+		{
+			std::unique_lock lock(thread_state.batches_lock);
+			was_empty = thread_state.batches.empty();
+			thread_state.batches.push_back(std::move(thread_state.next_batch));
+		}
+		if (was_empty) {
+			std::unique_lock lock(waiters_lock);
+			if (num_waiters > 0) {
+				waiters_cv.notify_one();
+			}
+		}
+	}
+	// Grab some work to do: the caller's partial batch first, then its own queued
+	// batches (newest first), then batches stolen from other threads (oldest first).
+	// Returns no work only when all threads are inside `pop_batch()` with nothing
+	// left to do; in that case every one of them returns empty instead of deadlocking.
+	std::vector<T> pop_batch(const ThreadIndex &thread) {
+		ThreadState &thread_state = thread_states[thread.thread_num];
+		if (!thread_state.next_batch.empty())
+			return std::move(thread_state.next_batch);
+		// Empty our own work queue first.
+		{
+			std::unique_lock lock(thread_state.batches_lock);
+			if (!thread_state.batches.empty()) {
+				std::vector<T> batch = std::move(thread_state.batches.back());
+				thread_state.batches.pop_back();
+				return batch;
+			}
+		}
+		// From here on in this function, our work queue is empty.
+		while (true) {
+			std::vector<T> batch = try_steal(thread);
+			if (!batch.empty()) {
+				return batch;
+			}
+			// Termination: if all threads run out of work, then all of
+			// them will eventually enter this loop and there will be no further
+			// notifications on waiters_cv, so all will eventually increment
+			// num_waiters and wait, so num_waiters == num_threads()
+			// will become true.
+			std::unique_lock lock(waiters_lock);
+			++num_waiters;
+			if (num_waiters == num_threads()) {
+				waiters_cv.notify_all();
+				return {};
+			}
+			// As above, it's possible that we'll wait here even when there
+			// are work batches posted by other threads. That's OK.
+			waiters_cv.wait(lock);
+			if (num_waiters == num_threads())
+				return {};
+			--num_waiters;
+		}
+	}
+private:
+	std::vector<T> try_steal(const ThreadIndex &thread) {
+		for (int i = 1; i < num_threads(); i++) {
+			int other_thread_num = (thread.thread_num + i) % num_threads();
+			ThreadState &other_thread_state = thread_states[other_thread_num];
+			std::unique_lock lock(other_thread_state.batches_lock);
+			if (!other_thread_state.batches.empty()) {
+				std::vector<T> batch = std::move(other_thread_state.batches.front());
+				other_thread_state.batches.pop_front();
+				return batch;
+			}
+		}
+		return {};
+	}
+
+	int batch_size;
+
+	struct ThreadState {
+		// Entirely thread-local.
+		std::vector<T> next_batch;
+
+		std::mutex batches_lock;
+		// Only the associated thread ever adds to this, and only at the back.
+		// Other threads can remove elements from the front.
+		std::deque<std::vector<T>> batches;
+	};
+	std::vector<ThreadState> thread_states;
+
+	std::mutex waiters_lock;
+	std::condition_variable waiters_cv;
+	// Number of threads waiting for work. Their queues are empty.
+	int num_waiters = 0;
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From d5950a6c035546b65daf78cf5177f052fec7fe10 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:01:43 +0000
Subject: [PATCH 09/26] Add `MonotonicFlag`

We'll use this later in the PR.
---
 kernel/threading.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/kernel/threading.h b/kernel/threading.h
index 3a5f5e820..82a65676d 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -668,6 +668,22 @@ private:
 	int num_waiters = 0;
 };
 
+// A one-way boolean latch: it begins false and `set()` flips it to true, safely
+// from any thread. After `load()` has returned true once, it never returns false.
+// All accesses are relaxed atomics, which give no memory ordering guarantees, so
+// this must not be used to guard access to shared memory.
+class MonotonicFlag {
+public:
+	MonotonicFlag() = default;
+	void set() { value.store(true, std::memory_order_relaxed); }
+	bool load() const { return value.load(std::memory_order_relaxed); }
+	bool set_and_return_old() {
+		return value.exchange(true, std::memory_order_relaxed);
+	}
+private:
+	std::atomic<bool> value{false};
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H

From e551e30fa3146c15461a620ceec91ae718461016 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 18:20:13 +0000
Subject: [PATCH 10/26] Add `FfInitVals::set_parallel()` method

We'll use this later in the PR.
---
 kernel/ffinit.h | 64 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/kernel/ffinit.h b/kernel/ffinit.h
index 920fba307..8b4758f60 100644
--- a/kernel/ffinit.h
+++ b/kernel/ffinit.h
@@ -22,6 +22,7 @@
 
 #include "kernel/yosys.h"
 #include "kernel/sigtools.h"
+#include "kernel/threading.h"
 
 YOSYS_NAMESPACE_BEGIN
 
@@ -35,34 +36,55 @@ struct FfInitVals
 		sigmap = sigmap_;
 		initbits.clear();
 		for (auto wire : module->wires())
+			if (wire->attributes.count(ID::init))
+				process_wire(wire);
+	}
+
+	void process_wire(RTLIL::Wire *wire)
+	{
+		SigSpec wirebits = (*sigmap)(wire);
+		Const initval = wire->attributes.at(ID::init);
+
+		for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++)
 		{
-			if (wire->attributes.count(ID::init) == 0)
+			SigBit bit = wirebits[i];
+			State val = initval[i];
+
+			if (val != State::S0 && val != State::S1 && bit.wire != nullptr)
 				continue;
 
-			SigSpec wirebits = (*sigmap)(wire);
-			Const initval = wire->attributes.at(ID::init);
-
-			for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++)
-			{
-				SigBit bit = wirebits[i];
-				State val = initval[i];
-
-				if (val != State::S0 && val != State::S1 && bit.wire != nullptr)
-					continue;
-
-				if (initbits.count(bit)) {
-					if (initbits.at(bit).first != val)
-						log_error("Conflicting init values for signal %s (%s = %s != %s).\n",
-								log_signal(bit), log_signal(SigBit(wire, i)),
-								log_signal(val), log_signal(initbits.at(bit).first));
-					continue;
-				}
-
-				initbits[bit] = std::make_pair(val,SigBit(wire,i));
+			if (initbits.count(bit)) {
+				if (initbits.at(bit).first != val)
+					log_error("Conflicting init values for signal %s (%s = %s != %s).\n",
+							log_signal(bit), log_signal(SigBit(wire, i)),
+							log_signal(val), log_signal(initbits.at(bit).first));
+				continue;
 			}
+
+			initbits[bit] = std::make_pair(val,SigBit(wire,i));
 		}
 	}
 
+	// Rebuilds the init-value table for `module`: wires that have an `init` attribute are collected
+	// by a scan run on a subpool of `thread_pool`, then passed to process_wire() on the calling thread.
+	void set_parallel(const SigMapView *sigmap_, ParallelDispatchThreadPool &thread_pool, RTLIL::Module *module)
+	{
+		sigmap = sigmap_;
+		initbits.clear();
+		const RTLIL::Module *const_module = module;
+		const int pool_size = ThreadPool::work_pool_size(0, module->wires_size(), 1000);
+		ParallelDispatchThreadPool::Subpool subpool(thread_pool, pool_size);
+		ShardedVector<RTLIL::Wire*> init_wires(subpool);
+		subpool.run([&init_wires, const_module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			for (int index : ctx.item_range(const_module->wires_size()))
+				if (RTLIL::Wire *candidate = const_module->wire_at(index); candidate->attributes.count(ID::init) != 0)
+					init_wires.insert(ctx, candidate);
+		});
+		// process_wire() writes to `initbits`, so the collected wires are handled one at a time here.
+		for (RTLIL::Wire *init_wire : init_wires)
+			process_wire(init_wire);
+	}
+
 	RTLIL::State operator()(RTLIL::SigBit bit) const
 	{
 		auto it = initbits.find((*sigmap)(bit));

From 63aa31172cd25cce5c2223353ab0d4d39d64b796 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 29 Jan 2026 22:16:46 +0000
Subject: [PATCH 11/26] Parallelize `collect_garbage()`

---
 kernel/rtlil.cc | 95 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 29 deletions(-)

diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index eef1c319d..f4385478e 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -22,6 +22,7 @@
 #include "kernel/celltypes.h"
 #include "kernel/binding.h"
 #include "kernel/sigtools.h"
+#include "kernel/threading.h"
 #include "frontends/verilog/verilog_frontend.h"
 #include "frontends/verilog/preproc.h"
 #include "backends/rtlil/rtlil_backend.h"
@@ -142,9 +143,17 @@ static constexpr bool check_well_known_id_order()
 // and in sorted ascii order, as required by the ID macro.
 static_assert(check_well_known_id_order());
 
+constexpr int STATIC_ID_END = static_cast<int>(RTLIL::StaticId::STATIC_ID_END);
+
 struct IdStringCollector {
+	IdStringCollector(std::vector<MonotonicFlag> &live_ids)
+			: live_ids(live_ids) {}
+
 	void trace(IdString id) {
-		live.insert(id.index_);
+		if (id.index_ >= STATIC_ID_END)
+			live_ids[id.index_ - STATIC_ID_END].set();
+		else if (id.index_ < 0)
+			live_autoidx_ids.push_back(id.index_);
 	}
 	template <typename T> void trace(const T* v) {
 		trace(*v);
@@ -178,10 +187,6 @@ struct IdStringCollector {
 			trace(element);
 	}
 
-	void trace(const RTLIL::Design &design) {
-		trace_values(design.modules_);
-		trace(design.selection_vars);
-	}
 	void trace(const RTLIL::Selection &selection_var) {
 		trace(selection_var.selected_modules);
 		trace(selection_var.selected_members);
@@ -190,15 +195,6 @@ struct IdStringCollector {
 		trace_keys(named.attributes);
 		trace(named.name);
 	}
-	void trace(const RTLIL::Module &module) {
-		trace_named(module);
-		trace_values(module.wires_);
-		trace_values(module.cells_);
-		trace(module.avail_parameters);
-		trace_keys(module.parameter_default_values);
-		trace_values(module.memories);
-		trace_values(module.processes);
-	}
 	void trace(const RTLIL::Wire &wire) {
 		trace_named(wire);
 		if (wire.known_driver())
@@ -234,7 +230,8 @@ struct IdStringCollector {
 		trace(action.memid);
 	}
 
-	std::unordered_set<int> live;
+	std::vector<MonotonicFlag> &live_ids;
+	std::vector<int> live_autoidx_ids;
 };
 
 int64_t RTLIL::OwningIdString::gc_ns;
@@ -243,20 +240,55 @@ int RTLIL::OwningIdString::gc_count;
 void RTLIL::OwningIdString::collect_garbage()
 {
 	int64_t start = PerformanceTimer::query();
-	IdStringCollector collector;
-	for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) {
-		collector.trace(*design);
-	}
-	int size = GetSize(global_id_storage_);
-	for (int i = static_cast<int>(StaticId::STATIC_ID_END); i < size; ++i) {
-		RTLIL::IdString::Storage &storage = global_id_storage_.at(i);
-		if (storage.buf == nullptr)
-			continue;
-		if (collector.live.find(i) != collector.live.end())
-			continue;
-		if (global_refcount_storage_.find(i) != global_refcount_storage_.end())
-			continue;
 
+	int pool_size = 0;
+	for (auto &[idx, design] : *RTLIL::Design::get_all_designs())
+		for (RTLIL::Module *module : design->modules())
+			pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	ParallelDispatchThreadPool thread_pool(pool_size);
+
+	int size = GetSize(global_id_storage_);
+	std::vector<MonotonicFlag> live_ids(size - STATIC_ID_END);
+	std::vector<IdStringCollector> collectors;
+	int num_threads = thread_pool.num_threads();
+	collectors.reserve(num_threads);
+	for (int i = 0; i < num_threads; ++i)
+		collectors.emplace_back(live_ids);
+
+	for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) {
+		for (RTLIL::Module *module : design->modules()) {
+			collectors[0].trace_named(*module);
+			ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+			subpool.run([&collectors, module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+				for (int i : ctx.item_range(module->cells_size()))
+					collectors[ctx.thread_num].trace(module->cell_at(i));
+				for (int i : ctx.item_range(module->wires_size()))
+					collectors[ctx.thread_num].trace(module->wire_at(i));
+			});
+			collectors[0].trace(module->avail_parameters);
+			collectors[0].trace_keys(module->parameter_default_values);
+			collectors[0].trace_values(module->memories);
+			collectors[0].trace_values(module->processes);
+		}
+		collectors[0].trace(design->selection_vars);
+	}
+
+	ShardedVector<int> free_ids(thread_pool);
+	thread_pool.run([&live_ids, size, &free_ids](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(size - STATIC_ID_END)) {
+			int index = i + STATIC_ID_END;
+			RTLIL::IdString::Storage &storage = global_id_storage_.at(index);
+			if (storage.buf == nullptr)
+				continue;
+			if (live_ids[i].load())
+				continue;
+			if (global_refcount_storage_.find(index) != global_refcount_storage_.end())
+				continue;
+			free_ids.insert(ctx, index);
+		}
+	});
+	for (int i : free_ids) {
+		RTLIL::IdString::Storage &storage = global_id_storage_.at(i);
 		if (yosys_xtrace) {
 			log("#X# Removed IdString '%s' with index %d.\n", storage.buf, i);
 			log_backtrace("-X- ", yosys_xtrace-1);
@@ -268,8 +300,13 @@ void RTLIL::OwningIdString::collect_garbage()
 		global_free_idx_list_.push_back(i);
 	}
 
+	std::unordered_set<int> live_autoidx_ids;
+	for (IdStringCollector &collector : collectors)
+		for (int id : collector.live_autoidx_ids)
+			live_autoidx_ids.insert(id);
+
 	for (auto it = global_autoidx_id_storage_.begin(); it != global_autoidx_id_storage_.end();) {
-		if (collector.live.find(it->first) != collector.live.end()) {
+		if (live_autoidx_ids.find(it->first) != live_autoidx_ids.end()) {
 			++it;
 			continue;
 		}

From c8298aae0285fd46096be2d8c95052465bd9b03d Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 29 Jan 2026 22:45:10 +0000
Subject: [PATCH 12/26] Parallelize `Design::check()`

---
 kernel/rtlil.cc | 183 +++++++++++++++++++++++++++---------------------
 kernel/rtlil.h  |   4 +-
 2 files changed, 104 insertions(+), 83 deletions(-)

diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index f4385478e..54696e000 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -1503,15 +1503,21 @@ void RTLIL::Design::sort_modules()
 	modules_.sort(sort_by_id_str());
 }
 
+void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool);
+
 void RTLIL::Design::check()
 {
 #ifndef NDEBUG
 	log_assert(!selection_stack.empty());
+	int pool_size = 0;
+	for (auto &it : modules_)
+		pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, it.second->cells_size(), 1000));
+	ParallelDispatchThreadPool thread_pool(pool_size);
 	for (auto &it : modules_) {
 		log_assert(this == it.second->design);
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
-		it.second->check();
+		check_module(it.second, thread_pool);
 	}
 #endif
 }
@@ -1747,11 +1753,11 @@ size_t RTLIL::Module::count_id(RTLIL::IdString id)
 namespace {
 	struct InternalCellChecker
 	{
-		RTLIL::Module *module;
+		const RTLIL::Module *module;
 		RTLIL::Cell *cell;
 		pool<RTLIL::IdString> expected_params, expected_ports;
 
-		InternalCellChecker(RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { }
+		InternalCellChecker(const RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { }
 
 		void error(int linenr)
 		{
@@ -2727,88 +2733,96 @@ void RTLIL::Module::sort()
 		it.second->attributes.sort(sort_by_id_str());
 }
 
-void RTLIL::Module::check()
+void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool)
 {
 #ifndef NDEBUG
-	std::vector<bool> ports_declared;
-	for (auto &it : wires_) {
-		log_assert(this == it.second->module);
-		log_assert(it.first == it.second->name);
-		log_assert(!it.first.empty());
-		log_assert(it.second->width >= 0);
-		log_assert(it.second->port_id >= 0);
-		for (auto &it2 : it.second->attributes)
-			log_assert(!it2.first.empty());
-		if (it.second->port_id) {
-			log_assert(GetSize(ports) >= it.second->port_id);
-			log_assert(ports.at(it.second->port_id-1) == it.first);
-			log_assert(it.second->port_input || it.second->port_output);
-			if (GetSize(ports_declared) < it.second->port_id)
-				ports_declared.resize(it.second->port_id);
-			log_assert(ports_declared[it.second->port_id-1] == false);
-			ports_declared[it.second->port_id-1] = true;
-		} else
-			log_assert(!it.second->port_input && !it.second->port_output);
-	}
-	for (auto port_declared : ports_declared)
-		log_assert(port_declared == true);
-	log_assert(GetSize(ports) == GetSize(ports_declared));
+	ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	const RTLIL::Module *const_module = module;
 
-	for (auto &it : memories) {
+	pool<std::string> memory_strings;
+	for (auto &it : module->memories) {
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
 		log_assert(it.second->width >= 0);
 		log_assert(it.second->size >= 0);
 		for (auto &it2 : it.second->attributes)
 			log_assert(!it2.first.empty());
+		memory_strings.insert(it.second->name.str());
 	}
 
-	pool<IdString> packed_memids;
+	std::vector<MonotonicFlag> ports_declared(GetSize(module->ports));
+	ShardedVector<std::string> memids(subpool);
+	subpool.run([const_module, &ports_declared, &memory_strings, &memids](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			auto it = *const_module->cells_.element(i);
+			log_assert(const_module == it.second->module);
+			log_assert(it.first == it.second->name);
+			log_assert(!it.first.empty());
+			log_assert(!it.second->type.empty());
+			for (auto &it2 : it.second->connections()) {
+				log_assert(!it2.first.empty());
+				it2.second.check(const_module);
+			}
+			for (auto &it2 : it.second->attributes)
+				log_assert(!it2.first.empty());
+			for (auto &it2 : it.second->parameters)
+				log_assert(!it2.first.empty());
+			InternalCellChecker checker(const_module, it.second);
+			checker.check();
+			if (it.second->has_memid()) {
+				log_assert(memory_strings.count(it.second->parameters.at(ID::MEMID).decode_string()));
+			} else if (it.second->is_mem_cell()) {
+				std::string memid = it.second->parameters.at(ID::MEMID).decode_string();
+				log_assert(!memory_strings.count(memid));
+				memids.insert(ctx, std::move(memid));
+			}
+			auto cell_mod = const_module->design->module(it.first);
+			if (cell_mod != nullptr) {
+				// assertion check below to make sure that there are no
+				// cases where a cell has a blackbox attribute since
+				// that is deprecated
+				#ifdef __GNUC__
+				#pragma GCC diagnostic push
+				#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+				#endif
+				log_assert(!it.second->get_blackbox_attribute());
+				#ifdef __GNUC__
+				#pragma GCC diagnostic pop
+				#endif
+			}
+		}
 
-	for (auto &it : cells_) {
-		log_assert(this == it.second->module);
-		log_assert(it.first == it.second->name);
-		log_assert(!it.first.empty());
-		log_assert(!it.second->type.empty());
-		for (auto &it2 : it.second->connections()) {
-			log_assert(!it2.first.empty());
-			it2.second.check(this);
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			auto it = *const_module->wires_.element(i);
+			log_assert(const_module == it.second->module);
+			log_assert(it.first == it.second->name);
+			log_assert(!it.first.empty());
+			log_assert(it.second->width >= 0);
+			log_assert(it.second->port_id >= 0);
+			for (auto &it2 : it.second->attributes)
+				log_assert(!it2.first.empty());
+			if (it.second->port_id) {
+				log_assert(GetSize(const_module->ports) >= it.second->port_id);
+				log_assert(const_module->ports.at(it.second->port_id-1) == it.first);
+				log_assert(it.second->port_input || it.second->port_output);
+				log_assert(it.second->port_id <= GetSize(ports_declared));
+				bool previously_declared = ports_declared[it.second->port_id-1].set_and_return_old();
+				log_assert(previously_declared == false);
+			} else
+				log_assert(!it.second->port_input && !it.second->port_output);
 		}
-		for (auto &it2 : it.second->attributes)
-			log_assert(!it2.first.empty());
-		for (auto &it2 : it.second->parameters)
-			log_assert(!it2.first.empty());
-		InternalCellChecker checker(this, it.second);
-		checker.check();
-		if (it.second->has_memid()) {
-			log_assert(memories.count(it.second->parameters.at(ID::MEMID).decode_string()));
-		} else if (it.second->is_mem_cell()) {
-			IdString memid = it.second->parameters.at(ID::MEMID).decode_string();
-			log_assert(!memories.count(memid));
-			log_assert(!packed_memids.count(memid));
-			packed_memids.insert(memid);
-		}
-		auto cell_mod = design->module(it.first);
-		if (cell_mod != nullptr) {
-			// assertion check below to make sure that there are no
-			// cases where a cell has a blackbox attribute since
-			// that is deprecated
-			#ifdef __GNUC__
-			#pragma GCC diagnostic push
-			#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-			#endif
-			log_assert(!it.second->get_blackbox_attribute());
-			#ifdef __GNUC__
-			#pragma GCC diagnostic pop
-			#endif
-		}
-	}
+	});
+	for (const MonotonicFlag &port_declared : ports_declared)
+		log_assert(port_declared.load() == true);
+	pool<std::string> memids_pool;
+	for (std::string &memid : memids)
+		log_assert(memids_pool.insert(memid).second);
 
-	for (auto &it : processes) {
+	for (auto &it : module->processes) {
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
 		log_assert(it.second->root_case.compare.empty());
-		std::vector<CaseRule*> all_cases = {&it.second->root_case};
+		std::vector<RTLIL::CaseRule*> all_cases = {&it.second->root_case};
 		for (size_t i = 0; i < all_cases.size(); i++) {
 			for (auto &switch_it : all_cases[i]->switches) {
 				for (auto &case_it : switch_it->cases) {
@@ -2821,34 +2835,41 @@ void RTLIL::Module::check()
 		}
 		for (auto &sync_it : it.second->syncs) {
 			switch (sync_it->type) {
-				case SyncType::ST0:
-				case SyncType::ST1:
-				case SyncType::STp:
-				case SyncType::STn:
-				case SyncType::STe:
+				case RTLIL::SyncType::ST0:
+				case RTLIL::SyncType::ST1:
+				case RTLIL::SyncType::STp:
+				case RTLIL::SyncType::STn:
+				case RTLIL::SyncType::STe:
 					log_assert(!sync_it->signal.empty());
 					break;
-				case SyncType::STa:
-				case SyncType::STg:
-				case SyncType::STi:
+				case RTLIL::SyncType::STa:
+				case RTLIL::SyncType::STg:
+				case RTLIL::SyncType::STi:
 					log_assert(sync_it->signal.empty());
 					break;
 			}
 		}
 	}
 
-	for (auto &it : connections_) {
+	for (auto &it : module->connections_) {
 		log_assert(it.first.size() == it.second.size());
 		log_assert(!it.first.has_const());
-		it.first.check(this);
-		it.second.check(this);
+		it.first.check(module);
+		it.second.check(module);
 	}
 
-	for (auto &it : attributes)
+	for (auto &it : module->attributes)
 		log_assert(!it.first.empty());
 #endif
 }
 
+void RTLIL::Module::check()
+{
+	int pool_size = ThreadPool::work_pool_size(0, cells_size(), 1000);
+	ParallelDispatchThreadPool thread_pool(pool_size);
+	check_module(this, thread_pool);
+}
+
 void RTLIL::Module::optimize()
 {
 }
@@ -5507,7 +5528,7 @@ RTLIL::SigSpec RTLIL::SigSpec::repeat(int num) const
 }
 
 #ifndef NDEBUG
-void RTLIL::SigSpec::check(Module *mod) const
+void RTLIL::SigSpec::check(const Module *mod) const
 {
 	if (rep_ == CHUNK)
 	{
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index 6a026352e..9ecee8942 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -1759,9 +1759,9 @@ public:
 	}
 
 #ifndef NDEBUG
-	void check(Module *mod = nullptr) const;
+	void check(const Module *mod = nullptr) const;
 #else
-	void check(Module *mod = nullptr) const { (void)mod; }
+	void check(const Module *mod = nullptr) const { (void)mod; }
 #endif
 };
 

From cba49642aa42959cb4448f74f7816c723c6f5378 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:14:09 +0000
Subject: [PATCH 13/26] Make `keep_cache_t` process all modules up-front
 instead of on-demand

We will want to query `keep_cache` from parallel threads. If we compute
the results on-demand, that means we need synchronization for cache
access in those queries, which adds complexity and overhead. Instead, prefill
the cache with the status of all relevant modules. Note that this doesn't
actually do more work --- we always consult `keep_cache` for all cells of
all selected modules, so scanning all those cells and determining the kept
status of all dependency modules is always required.

Later in this PR we're going to parallelize `scan_module` itself, and that's also
much easier to do when no other parallel threads are running.
---
 passes/opt/opt_clean.cc | 162 +++++++++++++++++++++++++---------------
 1 file changed, 103 insertions(+), 59 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index f1d21435c..a8e2edd16 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -33,47 +33,95 @@ using RTLIL::id2cstr;
 
 struct keep_cache_t
 {
-	Design *design;
-	dict<Module*, bool> cache;
-	bool purge_mode = false;
+	dict<Module*, bool> keep_modules;
+	bool purge_mode;
 
-	void reset(Design *design = nullptr, bool purge_mode = false)
-	{
-		this->design = design;
-		this->purge_mode = purge_mode;
-		cache.clear();
-	}
-
-	bool query(Module *module)
-	{
-		log_assert(design != nullptr);
-
-		if (module == nullptr)
-			return false;
-
-		if (cache.count(module))
-			return cache.at(module);
-
-		cache[module] = true;
-		if (!module->get_bool_attribute(ID::keep)) {
-		    bool found_keep = false;
-		    for (auto cell : module->cells())
-			if (query(cell, true /* ignore_specify */)) {
-			    found_keep = true;
-			    break;
-			}
-		    for (auto wire : module->wires())
-			if (wire->get_bool_attribute(ID::keep)) {
-			    found_keep = true;
-			    break;
-			}
-		    cache[module] = found_keep;
+	keep_cache_t(bool purge_mode, const std::vector<RTLIL::Module *> &selected_modules)
+			: purge_mode(purge_mode) {
+		std::vector<RTLIL::Module *> scan_modules_worklist;
+		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
+		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
+		for (RTLIL::Module *module : selected_modules) {
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, dependents, true, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
 		}
 
-		return cache[module];
+		while (!scan_modules_worklist.empty()) {
+			RTLIL::Module *module = scan_modules_worklist.back();
+			scan_modules_worklist.pop_back();
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, dependents, false, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
+		}
+
+		while (!propagate_kept_modules_worklist.empty()) {
+			RTLIL::Module *module = propagate_kept_modules_worklist.back();
+			propagate_kept_modules_worklist.pop_back();
+			for (RTLIL::Module *dependent : dependents[module]) {
+				if (keep_modules[dependent])
+					continue;
+				keep_modules[dependent] = true;
+				propagate_kept_modules_worklist.push_back(dependent);
+			}
+		}
 	}
 
-	bool query(Cell *cell, bool ignore_specify = false)
+	bool query(Cell *cell) const
+	{
+		if (keep_cell(cell, purge_mode))
+			return true;
+		if (cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
+			return true;
+		if (cell->module && cell->module->design) {
+			RTLIL::Module *cell_module = cell->module->design->module(cell->type);
+			return cell_module != nullptr && keep_modules.at(cell_module);
+		}
+		return false;
+	}
+
+private:
+	bool scan_module(Module *module, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
+			bool scan_all_cells, std::vector<Module*> &worklist) const
+	{
+		bool keep = false;
+		if (module->get_bool_attribute(ID::keep)) {
+			if (!scan_all_cells)
+				return true;
+			keep = true;
+		}
+
+		for (Cell *cell : module->cells()) {
+			if (keep_cell(cell, purge_mode)) {
+				if (!scan_all_cells)
+					return true;
+				keep = true;
+			}
+			if (module->design) {
+				RTLIL::Module *cell_module = module->design->module(cell->type);
+				if (cell_module != nullptr) {
+					dependents[cell_module].push_back(module);
+					worklist.push_back(cell_module);
+				}
+			}
+		}
+		if (!scan_all_cells && keep)
+			return true;
+		for (Wire *wire : module->wires()) {
+			if (wire->get_bool_attribute(ID::keep)) {
+				return true;
+			}
+		}
+		return keep;
+	}
+
+	static bool keep_cell(Cell *cell, bool purge_mode)
 	{
 		if (cell->type.in(ID($assert), ID($assume), ID($live), ID($fair), ID($cover)))
 			return true;
@@ -81,9 +129,6 @@ struct keep_cache_t
 		if (cell->type.in(ID($overwrite_tag)))
 			return true;
 
-		if (!ignore_specify && cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
-			return true;
-
 		if (cell->type == ID($print) || cell->type == ID($check))
 			return true;
 
@@ -92,19 +137,14 @@ struct keep_cache_t
 
 		if (!purge_mode && cell->type == ID($scopeinfo))
 			return true;
-
-		if (cell->module && cell->module->design)
-			return query(cell->module->design->module(cell->type));
-
 		return false;
 	}
 };
 
-keep_cache_t keep_cache;
 CellTypes ct_reg, ct_all;
 int count_rm_cells, count_rm_wires;
 
-void rmunused_module_cells(Module *module, bool verbose)
+void rmunused_module_cells(Module *module, bool verbose, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
 	dict<IdString, pool<Cell*>> mem2cells;
@@ -595,7 +635,7 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit)
+void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, keep_cache_t &keep_cache)
 {
 	if (verbose)
 		log("Finding unused cells or wires in module %s..\n", module->name);
@@ -652,7 +692,7 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 	if (!delcells.empty())
 		module->design->scratchpad_set_bool("opt.did_something", true);
 
-	rmunused_module_cells(module, verbose);
+	rmunused_module_cells(module, verbose, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose)) { }
 
 	if (rminit && rmunused_module_init(module, verbose))
@@ -695,7 +735,12 @@ struct OptCleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		keep_cache.reset(design, purge_mode);
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_whole_modules_warn()) {
+			if (!module->has_processes_warn())
+				selected_modules.push_back(module);
+		}
+		keep_cache_t keep_cache(purge_mode, selected_modules);
 
 		ct_reg.setup_internals_mem();
 		ct_reg.setup_internals_anyinit();
@@ -706,10 +751,8 @@ struct OptCleanPass : public Pass {
 		count_rm_cells = 0;
 		count_rm_wires = 0;
 
-		for (auto module : design->selected_whole_modules_warn()) {
-			if (module->has_processes_warn())
-				continue;
-			rmunused_module(module, purge_mode, true, true);
+		for (auto module : selected_modules) {
+			rmunused_module(module, purge_mode, true, true, keep_cache);
 		}
 
 		if (count_rm_cells > 0 || count_rm_wires > 0)
@@ -718,7 +761,6 @@ struct OptCleanPass : public Pass {
 		design->optimize();
 		design->check();
 
-		keep_cache.reset();
 		ct_reg.clear();
 		ct_all.clear();
 		log_pop();
@@ -758,7 +800,12 @@ struct CleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		keep_cache.reset(design);
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_unboxed_whole_modules()) {
+			if (!module->has_processes())
+				selected_modules.push_back(module);
+		}
+		keep_cache_t keep_cache(purge_mode, selected_modules);
 
 		ct_reg.setup_internals_mem();
 		ct_reg.setup_internals_anyinit();
@@ -769,10 +816,8 @@ struct CleanPass : public Pass {
 		count_rm_cells = 0;
 		count_rm_wires = 0;
 
-		for (auto module : design->selected_unboxed_whole_modules()) {
-			if (module->has_processes())
-				continue;
-			rmunused_module(module, purge_mode, ys_debug(), true);
+		for (auto module : selected_modules) {
+			rmunused_module(module, purge_mode, ys_debug(), true, keep_cache);
 		}
 
 		log_suppressed();
@@ -782,7 +827,6 @@ struct CleanPass : public Pass {
 		design->optimize();
 		design->check();
 
-		keep_cache.reset();
 		ct_reg.clear();
 		ct_all.clear();
 

From c10374c7716c6f0cc3eb40a6225aabf3f23cd338 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 19:27:09 +0000
Subject: [PATCH 14/26] Introduce `RmStats` struct to encapsulate removal
 statistics

Turns out this is not strictly necessary for this PR but it's
still a good thing to do and makes it clearer that the stats
are not modified in a possibly racy way.
---
 passes/opt/opt_clean.cc | 47 ++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index a8e2edd16..bf322583b 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -142,9 +142,19 @@ private:
 };
 
 CellTypes ct_reg, ct_all;
-int count_rm_cells, count_rm_wires;
 
-void rmunused_module_cells(Module *module, bool verbose, keep_cache_t &keep_cache)
+// Totals of removed cells and wires, accumulated by the rmunused_module_* helpers.
+struct RmStats {
+	int count_rm_cells = 0;
+	int count_rm_wires = 0;
+	// Prints the removal summary unless both totals are zero. The call is qualified because this member hides log().
+	void log() const {
+		if (count_rm_cells > 0 || count_rm_wires > 0)
+			YOSYS_NAMESPACE_PREFIX log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+	}
+};
+
+void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
 	dict<IdString, pool<Cell*>> mem2cells;
@@ -249,7 +259,7 @@ void rmunused_module_cells(Module *module, bool verbose, keep_cache_t &keep_cach
 		if (cell->is_builtin_ff())
 			ffinit.remove_init(cell->getPort(ID::Q));
 		module->remove(cell);
-		count_rm_cells++;
+		stats.count_rm_cells++;
 	}
 
 	for (auto it : mem_unused)
@@ -341,7 +351,7 @@ bool check_public_name(RTLIL::IdString id)
 	return true;
 }
 
-bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose)
+bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose, RmStats &stats)
 {
 	// `register_signals` and `connected_signals` will help us decide later on
 	// on picking representatives out of groups of connected signals
@@ -543,7 +553,7 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	}
 
 	module->remove(del_wires_queue);
-	count_rm_wires += GetSize(del_wires_queue);
+	stats.count_rm_wires += GetSize(del_wires_queue);
 
 	if (verbose && del_temp_wires_count)
 		log_debug("  removed %d unused temporary wires.\n", del_temp_wires_count);
@@ -635,7 +645,7 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, keep_cache_t &keep_cache)
+void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
 {
 	if (verbose)
 		log("Finding unused cells or wires in module %s..\n", module->name);
@@ -692,11 +702,11 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 	if (!delcells.empty())
 		module->design->scratchpad_set_bool("opt.did_something", true);
 
-	rmunused_module_cells(module, verbose, keep_cache);
-	while (rmunused_module_signals(module, purge_mode, verbose)) { }
+	rmunused_module_cells(module, verbose, stats, keep_cache);
+	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
 	if (rminit && rmunused_module_init(module, verbose))
-		while (rmunused_module_signals(module, purge_mode, verbose)) { }
+		while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 }
 
 struct OptCleanPass : public Pass {
@@ -748,15 +758,11 @@ struct OptCleanPass : public Pass {
 
 		ct_all.setup(design);
 
-		count_rm_cells = 0;
-		count_rm_wires = 0;
-
+		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, true, true, keep_cache);
+			rmunused_module(module, purge_mode, true, true, stats, keep_cache);
 		}
-
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+		stats.log();
 
 		design->optimize();
 		design->check();
@@ -813,16 +819,13 @@ struct CleanPass : public Pass {
 
 		ct_all.setup(design);
 
-		count_rm_cells = 0;
-		count_rm_wires = 0;
-
+		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, ys_debug(), true, keep_cache);
+			rmunused_module(module, purge_mode, ys_debug(), true, stats, keep_cache);
 		}
 
 		log_suppressed();
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+		stats.log();
 
 		design->optimize();
 		design->check();

From 9990bda187990e211b4a66c2ce0cf7850f0f0c95 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 21:58:37 +0000
Subject: [PATCH 15/26] Create a toplevel `ParallelDispatchThreadPool` and
 parallelize `keep_cache_t::scan_module()` with it

---
 passes/opt/opt_clean.cc | 90 ++++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index bf322583b..7a7bf17ad 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -22,6 +22,7 @@
 #include "kernel/log.h"
 #include "kernel/celltypes.h"
 #include "kernel/ffinit.h"
+#include "kernel/threading.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <set>
@@ -36,15 +37,16 @@ struct keep_cache_t
 	dict<Module*, bool> keep_modules;
 	bool purge_mode;
 
-	keep_cache_t(bool purge_mode, const std::vector<RTLIL::Module *> &selected_modules)
+	keep_cache_t(bool purge_mode, ParallelDispatchThreadPool &thread_pool, const std::vector<RTLIL::Module *> &selected_modules)
 			: purge_mode(purge_mode) {
+
 		std::vector<RTLIL::Module *> scan_modules_worklist;
 		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
 		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
 		for (RTLIL::Module *module : selected_modules) {
 			if (keep_modules.count(module))
 				continue;
-			bool keep = scan_module(module, dependents, true, scan_modules_worklist);
+			bool keep = scan_module(module, thread_pool, dependents, ALL_CELLS, scan_modules_worklist);
 			keep_modules[module] = keep;
 			if (keep)
 				propagate_kept_modules_worklist.push_back(module);
@@ -55,7 +57,7 @@ struct keep_cache_t
 			scan_modules_worklist.pop_back();
 			if (keep_modules.count(module))
 				continue;
-			bool keep = scan_module(module, dependents, false, scan_modules_worklist);
+			bool keep = scan_module(module, thread_pool, dependents, MINIMUM_CELLS, scan_modules_worklist);
 			keep_modules[module] = keep;
 			if (keep)
 				propagate_kept_modules_worklist.push_back(module);
@@ -87,38 +89,62 @@ struct keep_cache_t
 	}
 
 private:
-	bool scan_module(Module *module, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
-			bool scan_all_cells, std::vector<Module*> &worklist) const
+	enum ScanCells {
+		// Scan every cell to see if it uses a module that is kept.
+		ALL_CELLS,
+		// Stop scanning cells if we determine early that this module is kept.
+		MINIMUM_CELLS,
+	};
+	bool scan_module(Module *module, ParallelDispatchThreadPool &thread_pool, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
+			ScanCells scan_cells, std::vector<Module*> &worklist) const // Returns whether `module` is kept. With MINIMUM_CELLS a kept module returns true without recording dependencies; otherwise every module found for a cell type is pushed to `worklist` and `module` is appended to that module's `dependents` entry.
 	{
-		bool keep = false;
+		MonotonicFlag keep_module; // shared with the worker lambda below: the lambda only calls set(), the result is read back with load()
 		if (module->get_bool_attribute(ID::keep)) {
-			if (!scan_all_cells)
+			if (scan_cells == MINIMUM_CELLS)
 				return true;
-			keep = true;
+			keep_module.set();
 		}
 
-		for (Cell *cell : module->cells()) {
-			if (keep_cell(cell, purge_mode)) {
-				if (!scan_all_cells)
-					return true;
-				keep = true;
-			}
-			if (module->design) {
-				RTLIL::Module *cell_module = module->design->module(cell->type);
-				if (cell_module != nullptr) {
-					dependents[cell_module].push_back(module);
-					worklist.push_back(cell_module);
+		ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000)); // NOTE(review): worker count is derived from the cell count; confirm the exact work_pool_size() contract
+		ShardedVector<Module*> deps(subpool); // modules looked up from cell types, filled from inside the lambda via insert(ctx, ...)
+		const RTLIL::Module *const_module = module; // the lambda reaches the module only through this const pointer
+		bool purge_mode = this->purge_mode; // local copy: the lambda captures this value and does not capture `this`
+		subpool.run([purge_mode, const_module, scan_cells, &deps, &keep_module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			bool keep = false; // local to this invocation of the lambda
+			for (int i : ctx.item_range(const_module->cells_size())) { // NOTE(review): presumably this worker's share of the cell indices; confirm against RunCtx::item_range
+				Cell *cell = const_module->cell_at(i);
+				if (keep_cell(cell, purge_mode)) {
+					if (scan_cells == MINIMUM_CELLS) { // one kept cell is enough: flag the module and end this invocation
+						keep_module.set();
+						return;
+					}
+					keep = true;
+				}
+				if (const_module->design) {
+					RTLIL::Module *cell_module = const_module->design->module(cell->type);
+					if (cell_module != nullptr)
+						deps.insert(ctx, cell_module);
 				}
 			}
-		}
-		if (!scan_all_cells && keep)
-			return true;
-		for (Wire *wire : module->wires()) {
-			if (wire->get_bool_attribute(ID::keep)) {
-				return true;
+			if (keep) { // only reachable with keep == true under ALL_CELLS: the module is kept, so the wire scan is skipped
+				keep_module.set();
+				return;
 			}
+			for (int i : ctx.item_range(const_module->wires_size())) { // a wire carrying the keep attribute also marks the module as kept
+				Wire *wire = const_module->wire_at(i);
+				if (wire->get_bool_attribute(ID::keep)) {
+					keep_module.set();
+					return;
+				}
+			}
+		});
+		if (scan_cells == MINIMUM_CELLS && keep_module.load()) // entries gathered in `deps` are not recorded on this path
+			return true;
+		for (Module *dep : deps) {
+			dependents[dep].push_back(module);
+			worklist.push_back(dep);
 		}
-		return keep;
+		return keep_module.load();
 	}
 
 	static bool keep_cell(Cell *cell, bool purge_mode)
@@ -750,7 +776,11 @@ struct OptCleanPass : public Pass {
 			if (!module->has_processes_warn())
 				selected_modules.push_back(module);
 		}
-		keep_cache_t keep_cache(purge_mode, selected_modules);
+		int thread_pool_size = 0;
+		for (RTLIL::Module *m : selected_modules)
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
+		ParallelDispatchThreadPool thread_pool(thread_pool_size);
+		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
 
 		ct_reg.setup_internals_mem();
 		ct_reg.setup_internals_anyinit();
@@ -811,7 +841,11 @@ struct CleanPass : public Pass {
 			if (!module->has_processes())
 				selected_modules.push_back(module);
 		}
-		keep_cache_t keep_cache(purge_mode, selected_modules);
+		int thread_pool_size = 0;
+		for (RTLIL::Module *m : selected_modules)
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
+		ParallelDispatchThreadPool thread_pool(thread_pool_size);
+		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
 
 		ct_reg.setup_internals_mem();
 		ct_reg.setup_internals_anyinit();

From 84932e32075e7f2330f66b9402ed7f3186eaf5c7 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:06:19 +0000
Subject: [PATCH 16/26] Pass the toplevel thread pool to `rmunused_module`,
 create a `Subpool`, and parallelize `remove_temporary_cells`

---
 passes/opt/opt_clean.cc | 94 ++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 7a7bf17ad..78546914d 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -671,47 +671,53 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
+void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
 {
-	if (verbose)
-		log("Finding unused cells or wires in module %s..\n", module->name);
+	ShardedVector<RTLIL::Cell*> delcells(subpool);
+	ShardedVector<RTLIL::SigSig> new_connections(subpool);
+	const RTLIL::Module *const_module = module;
+	subpool.run([const_module, &delcells, &new_connections](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
+				bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec y = cell->getPort(ID::Y);
+				a.extend_u0(GetSize(y), is_signed);
 
-	std::vector<RTLIL::Cell*> delcells;
-	for (auto cell : module->cells()) {
-		if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
-			bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
-			RTLIL::SigSpec a = cell->getPort(ID::A);
-			RTLIL::SigSpec y = cell->getPort(ID::Y);
-			a.extend_u0(GetSize(y), is_signed);
-
-			if (a.has_const(State::Sz)) {
-				SigSpec new_a;
-				SigSpec new_y;
-				for (int i = 0; i < GetSize(a); ++i) {
-					SigBit b = a[i];
-					if (b == State::Sz)
-						continue;
-					new_a.append(b);
-					new_y.append(y[i]);
+				if (a.has_const(State::Sz)) {
+					RTLIL::SigSpec new_a;
+					RTLIL::SigSpec new_y;
+					for (int i = 0; i < GetSize(a); ++i) {
+						RTLIL::SigBit b = a[i];
+						if (b == State::Sz)
+							continue;
+						new_a.append(b);
+						new_y.append(y[i]);
+					}
+					a = std::move(new_a);
+					y = std::move(new_y);
 				}
-				a = std::move(new_a);
-				y = std::move(new_y);
+				if (!y.empty())
+					new_connections.insert(ctx, {y, a});
+				delcells.insert(ctx, cell);
+			} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec b = cell->getPort(ID::B);
+				if (a.has_const() && !b.has_const())
+					std::swap(a, b);
+				new_connections.insert(ctx, {a, b});
+				delcells.insert(ctx, cell);
+			} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
+				delcells.insert(ctx, cell);
 			}
-			if (!y.empty())
-				module->connect(y, a);
-			delcells.push_back(cell);
-		} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
-			RTLIL::SigSpec a = cell->getPort(ID::A);
-			RTLIL::SigSpec b = cell->getPort(ID::B);
-			if (a.has_const() && !b.has_const())
-				std::swap(a, b);
-			module->connect(a, b);
-			delcells.push_back(cell);
-		} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
-			delcells.push_back(cell);
 		}
+	});
+	bool did_something = false;
+	for (RTLIL::SigSig &connection : new_connections) {
+		module->connect(connection);
 	}
-	for (auto cell : delcells) {
+	for (RTLIL::Cell *cell : delcells) {
 		if (verbose) {
 			if (cell->type == ID($connect))
 				log_debug("  removing connect cell `%s': %s <-> %s\n", cell->name,
@@ -724,10 +730,22 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 						log_signal(cell->getPort(ID::Y)), log_signal(cell->getPort(ID::A)));
 		}
 		module->remove(cell);
+		did_something = true;
 	}
-	if (!delcells.empty())
+	if (did_something)
 		module->design->scratchpad_set_bool("opt.did_something", true);
+}
 
+void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache)
+{
+	if (verbose)
+		log("Finding unused cells or wires in module %s..\n", module->name);
+
+	// Use no more than one worker per thousand cells, rounded down, so
+	// we only start multithreading with at least 2000 cells.
+	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
+	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
+	remove_temporary_cells(module, subpool, verbose);
 	rmunused_module_cells(module, verbose, stats, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
@@ -790,7 +808,7 @@ struct OptCleanPass : public Pass {
 
 		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, true, true, stats, keep_cache);
+			rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache);
 		}
 		stats.log();
 
@@ -855,7 +873,7 @@ struct CleanPass : public Pass {
 
 		RmStats stats;
 		for (auto module : selected_modules) {
-			rmunused_module(module, purge_mode, ys_debug(), true, stats, keep_cache);
+			rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache);
 		}
 
 		log_suppressed();

From e4dde705dc18024c769deb6123f1a0d614556e36 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:46:10 +0000
Subject: [PATCH 17/26] Pass the module `Subpool` to `rmunused_module_init` and
 parallelize that function

---
 passes/opt/opt_clean.cc | 134 +++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index 78546914d..c7874eeb3 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -590,79 +590,93 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	return !del_wires_queue.empty();
 }
 
-bool rmunused_module_init(RTLIL::Module *module, bool verbose)
+bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
 {
-	bool did_something = false;
 	CellTypes fftypes;
 	fftypes.setup_internals_mem();
 
 	SigMap sigmap(module);
-	dict<SigBit, State> qbits;
 
-	for (auto cell : module->cells())
-		if (fftypes.cell_known(cell->type) && cell->hasPort(ID::Q))
-		{
-			SigSpec sig = cell->getPort(ID::Q);
-
-			for (int i = 0; i < GetSize(sig); i++)
+	const Module *const_module = module;
+	ShardedVector<std::pair<SigBit, State>> results(subpool);
+	subpool.run([const_module, &fftypes, &results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (fftypes.cell_known(cell->type) && cell->hasPort(ID::Q))
 			{
-				SigBit bit = sig[i];
+				SigSpec sig = cell->getPort(ID::Q);
 
-				if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
-					continue;
+				for (int i = 0; i < GetSize(sig); i++)
+				{
+					SigBit bit = sig[i];
 
-				Const init = bit.wire->attributes.at(ID::init);
+					if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
+						continue;
 
-				if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
-					continue;
+					Const init = bit.wire->attributes.at(ID::init);
 
-				sigmap.add(bit);
-				qbits[bit] = init[i];
-			}
-		}
+					if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
+						continue;
 
-	for (auto wire : module->wires())
-	{
-		if (wire->attributes.count(ID::init) == 0)
-			continue;
-
-		Const init = wire->attributes.at(ID::init);
-
-		for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
-		{
-			if (init[i] == State::Sx || init[i] == State::Sz)
-				continue;
-
-			SigBit wire_bit = SigBit(wire, i);
-			SigBit mapped_wire_bit = sigmap(wire_bit);
-
-			if (wire_bit == mapped_wire_bit)
-				goto next_wire;
-
-			if (mapped_wire_bit.wire) {
-				if (qbits.count(mapped_wire_bit) == 0)
-					goto next_wire;
-
-				if (qbits.at(mapped_wire_bit) != init[i])
-					goto next_wire;
-			}
-			else {
-				if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
-					goto next_wire;
-
-				if (mapped_wire_bit != init[i]) {
-					log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
-					goto next_wire;
+					results.insert(ctx, {bit, init[i]});
 				}
 			}
 		}
+	});
+	dict<SigBit, State> qbits;
+	for (std::pair<SigBit, State> &p : results) {
+		sigmap.add(p.first);
+		qbits[p.first] = p.second;
+	}
 
+	ShardedVector<RTLIL::Wire*> wire_results(subpool);
+	subpool.run([const_module, &sigmap, &qbits, &wire_results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int j : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(j);
+			if (wire->attributes.count(ID::init) == 0)
+				continue;
+			Const init = wire->attributes.at(ID::init);
+
+			for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
+			{
+				if (init[i] == State::Sx || init[i] == State::Sz)
+					continue;
+
+				SigBit wire_bit = SigBit(wire, i);
+				SigBit mapped_wire_bit = sigmap(wire_bit);
+
+				if (wire_bit == mapped_wire_bit)
+					goto next_wire;
+
+				if (mapped_wire_bit.wire) {
+					if (qbits.count(mapped_wire_bit) == 0)
+						goto next_wire;
+
+					if (qbits.at(mapped_wire_bit) != init[i])
+						goto next_wire;
+				}
+				else {
+					if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
+						goto next_wire;
+
+					if (mapped_wire_bit != init[i]) {
+						log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
+						goto next_wire;
+					}
+				}
+			}
+			wire_results.insert(ctx, wire);
+
+			next_wire:;
+		}
+	});
+
+	bool did_something = false;
+	for (RTLIL::Wire *wire : wire_results) {
 		if (verbose)
 			log_debug("  removing redundant init attribute on %s.\n", log_id(wire));
-
 		wire->attributes.erase(ID::init);
 		did_something = true;
-	next_wire:;
 	}
 
 	if (did_something)
@@ -749,7 +763,7 @@ void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_p
 	rmunused_module_cells(module, verbose, stats, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
-	if (rminit && rmunused_module_init(module, verbose))
+	if (rminit && rmunused_module_init(module, subpool, verbose))
 		while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 }
 
@@ -790,10 +804,9 @@ struct OptCleanPass : public Pass {
 		extra_args(args, argidx, design);
 
 		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_whole_modules_warn()) {
+		for (auto module : design->selected_whole_modules_warn())
 			if (!module->has_processes_warn())
 				selected_modules.push_back(module);
-		}
 		int thread_pool_size = 0;
 		for (RTLIL::Module *m : selected_modules)
 			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
@@ -807,9 +820,8 @@ struct OptCleanPass : public Pass {
 		ct_all.setup(design);
 
 		RmStats stats;
-		for (auto module : selected_modules) {
+		for (auto module : selected_modules)
 			rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache);
-		}
 		stats.log();
 
 		design->optimize();
@@ -855,10 +867,9 @@ struct CleanPass : public Pass {
 		extra_args(args, argidx, design);
 
 		std::vector<RTLIL::Module*> selected_modules;
-		for (auto module : design->selected_unboxed_whole_modules()) {
+		for (auto module : design->selected_unboxed_whole_modules())
 			if (!module->has_processes())
 				selected_modules.push_back(module);
-		}
 		int thread_pool_size = 0;
 		for (RTLIL::Module *m : selected_modules)
 			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
@@ -872,9 +883,8 @@ struct CleanPass : public Pass {
 		ct_all.setup(design);
 
 		RmStats stats;
-		for (auto module : selected_modules) {
+		for (auto module : selected_modules)
 			rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache);
-		}
 
 		log_suppressed();
 		stats.log();

From c81d7b00da86a15d43af5e6922ac677aaa48ad63 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:59:04 +0000
Subject: [PATCH 18/26] Pass the module `Subpool` to `rmunused_module_cells`
 and parallelize that function

---
 passes/opt/opt_clean.cc | 319 ++++++++++++++++++++++++++++------------
 1 file changed, 222 insertions(+), 97 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index c7874eeb3..e822d13b1 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -180,16 +180,15 @@ struct RmStats {
 	}
 };
 
-void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
+unsigned int hash_bit(const SigBit &bit) { // Returns hash_ops<SigBit>::hash(bit) cast to unsigned int; callers in this file pass it next to each key given to Wire2Drivers insert()/find().
+	return static_cast<unsigned int>(hash_ops<SigBit>::hash(bit).yield());
+}
+
+void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
-	dict<IdString, pool<Cell*>> mem2cells;
-	pool<IdString> mem_unused;
-	pool<Cell*> queue, unused;
-	pool<SigBit> used_raw_bits;
-	dict<SigBit, pool<Cell*>> wire2driver;
-	dict<SigBit, vector<string>> driver_driver_logs;
-	FfInitVals ffinit(&sigmap, module);
+	FfInitVals ffinit;
+	ffinit.set_parallel(&sigmap, subpool.thread_pool(), module);
 
 	SigMap raw_sigmap;
 	for (auto &it : module->connections_) {
@@ -199,86 +198,209 @@ void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_ca
 		}
 	}
 
-	for (auto &it : module->memories) {
-		mem_unused.insert(it.first);
-	}
+	struct WireDrivers;
+	struct WireDriver { // One (signal bit, driving cell) pair; this is the value handed to the Wire2Drivers builder.
+		using Accumulated = WireDrivers; // NOTE(review): presumably names the type ShardedHashSet stores per key; confirm against the ShardedHashSet definition
+		SigBit bit;
+		int driver_cell; // cell index, i.e. the value later passed to Module::cell_at()
+	};
+	struct WireDrivers {
+		WireDrivers() : driver_cell(0) {}
+		WireDrivers(WireDriver driver) : bit(driver.bit), driver_cell(driver.driver_cell) {}
+		WireDrivers(SigBit bit) : bit(bit), driver_cell(0) {}
+		WireDrivers(WireDrivers &&other) = default;
 
-	for (Cell *cell : module->cells()) {
-		if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2))) {
-			IdString mem_id = cell->getParam(ID::MEMID).decode_string();
-			mem2cells[mem_id].insert(cell);
-		}
-	}
-
-	for (auto &it : module->cells_) {
-		Cell *cell = it.second;
-		for (auto &it2 : cell->connections()) {
-			if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
-				continue;
-			for (auto raw_bit : it2.second) {
-				if (raw_bit.wire == nullptr)
-					continue;
-				auto bit = sigmap(raw_bit);
-				if (bit.wire == nullptr && ct_all.cell_known(cell->type))
-					driver_driver_logs[raw_sigmap(raw_bit)].push_back(stringf("Driver-driver conflict "
-							"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
-							log_signal(raw_bit), log_id(cell), log_id(it2.first), log_signal(bit), log_id(module)));
-				if (bit.wire != nullptr)
-					wire2driver[bit].insert(cell);
-			}
-		}
-		if (keep_cache.query(cell))
-			queue.insert(cell);
-		else
-			unused.insert(cell);
-	}
-
-	for (auto &it : module->wires_) {
-		Wire *wire = it.second;
-		if (wire->port_output || wire->get_bool_attribute(ID::keep)) {
-			for (auto bit : sigmap(wire))
-			for (auto c : wire2driver[bit])
-				queue.insert(c), unused.erase(c);
-			for (auto raw_bit : SigSpec(wire))
-				used_raw_bits.insert(raw_sigmap(raw_bit));
-		}
-	}
-
-	while (!queue.empty())
-	{
-		pool<SigBit> bits;
-		pool<IdString> mems;
-		for (auto cell : queue) {
-			for (auto &it : cell->connections())
-				if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
-					for (auto bit : sigmap(it.second))
-						bits.insert(bit);
-
-			if (cell->type.in(ID($memrd), ID($memrd_v2))) {
-				IdString mem_id = cell->getParam(ID::MEMID).decode_string();
-				if (mem_unused.count(mem_id)) {
-					mem_unused.erase(mem_id);
-					mems.insert(mem_id);
+		class const_iterator {
+		public:
+			const_iterator(const WireDrivers &drivers, bool end)
+					: driver_cell(drivers.driver_cell), in_extra_cells(end) {
+				if (drivers.extra_driver_cells) {
+					if (end) {
+						extra_it = drivers.extra_driver_cells->end();
+					} else {
+						extra_it = drivers.extra_driver_cells->begin();
+					}
 				}
 			}
+			int operator*() const {
+				if (in_extra_cells)
+					return **extra_it;
+				return driver_cell;
+			}
+			const_iterator& operator++() {
+				if (in_extra_cells)
+					++*extra_it;
+				else
+					in_extra_cells = true;
+				return *this;
+			}
+			bool operator!=(const const_iterator &other) const {
+				return !(*this == other);
+			}
+			bool operator==(const const_iterator &other) const {
+				return in_extra_cells == other.in_extra_cells &&
+					extra_it == other.extra_it;
+			}
+		private:
+			std::optional<pool<int>::iterator> extra_it;
+			int driver_cell;
+			bool in_extra_cells;
+		};
+
+		const_iterator begin() const { return const_iterator(*this, false); }
+		const_iterator end() const { return const_iterator(*this, true); }
+
+		SigBit bit;
+		int driver_cell;
+		std::unique_ptr<pool<int>> extra_driver_cells;
+	};
+	struct WireDriversKeyEquality { // Equality functor supplied to Wire2Drivers: two entries are equal when they refer to the same SigBit.
+		bool operator()(const WireDrivers &a, const WireDrivers &b) const {
+			return a.bit == b.bit; // driver_cell and extra_driver_cells take no part in the comparison
 		}
+	};
+	struct WireDriversCollisionHandler { // NOTE(review): presumably invoked by ShardedHashSet when a new entry compares equal to a stored one; confirm.
+		void operator()(WireDrivers &incumbent, WireDrivers &new_value) const {
+			log_assert(new_value.extra_driver_cells == nullptr); // the incoming entry must carry only its single driver_cell
+			if (!incumbent.extra_driver_cells)
+				incumbent.extra_driver_cells.reset(new pool<int>()); // the overflow set is allocated only once a second driver shows up
+			incumbent.extra_driver_cells->insert(new_value.driver_cell); // incumbent keeps its own driver_cell; further drivers go into the pool
+		}
+	};
+	using Wire2Drivers = ShardedHashSet<WireDriver, WireDriversKeyEquality, WireDriversCollisionHandler>;
 
-		queue.clear();
+	Wire2Drivers::Builder wire2driver_builder(subpool);
+	ShardedVector<std::pair<std::string, int>> mem2cells_vector(subpool);
+	ShardedVector<std::pair<SigBit, std::string>> driver_driver_logs(subpool);
+	ShardedVector<Wire*> keep_wires(subpool);
+	const RTLIL::Module *const_module = module;
+	int num_threads = subpool.num_threads();
+	ConcurrentWorkQueue<int> cell_queue(num_threads);
+	std::vector<std::atomic<bool>> unused(const_module->cells_size());
+	subpool.run([&sigmap, &raw_sigmap, &keep_cache, const_module, &mem2cells_vector, &driver_driver_logs, &keep_wires, &cell_queue, &wire2driver_builder, &unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			Cell *cell = const_module->cell_at(i);
+			if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2)))
+				mem2cells_vector.insert(ctx, {cell->getParam(ID::MEMID).decode_string(), i});
 
-		for (auto bit : bits)
-		for (auto c : wire2driver[bit])
-			if (unused.count(c))
-				queue.insert(c), unused.erase(c);
+			for (auto &it2 : cell->connections()) {
+				if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : it2.second) {
+					if (raw_bit.wire == nullptr)
+						continue;
+					auto bit = sigmap(raw_bit);
+					if (bit.wire == nullptr && ct_all.cell_known(cell->type)) {
+						std::string msg = stringf("Driver-driver conflict "
+								"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
+								log_signal(raw_bit), cell->name.unescape(), it2.first.unescape(), log_signal(bit), const_module->name.unescape());
+						driver_driver_logs.insert(ctx, {raw_sigmap(raw_bit), msg});
+					}
+					if (bit.wire != nullptr)
+						wire2driver_builder.insert(ctx, {{bit, i}, hash_bit(bit)});
+				}
+			}
+			bool keep = keep_cache.query(cell);
+			unused[i].store(!keep, std::memory_order_relaxed);
+			if (keep)
+				cell_queue.push(ctx, i);
+		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			Wire *wire = const_module->wire_at(i);
+			if (wire->port_output || wire->get_bool_attribute(ID::keep))
+				keep_wires.insert(ctx, wire);
+		}
+	});
+	subpool.run([&wire2driver_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		wire2driver_builder.process(ctx);
+	});
+	Wire2Drivers wire2driver(wire2driver_builder);
 
-		for (auto mem : mems)
-		for (auto c : mem2cells[mem])
-			if (unused.count(c))
-				queue.insert(c), unused.erase(c);
+	dict<std::string, pool<int>> mem2cells;
+	for (std::pair<std::string, int> &mem2cell : mem2cells_vector)
+		mem2cells[mem2cell.first].insert(mem2cell.second);
+
+	pool<SigBit> used_raw_bits;
+	int i = 0;
+	for (Wire *wire : keep_wires) {
+		for (auto bit : sigmap(wire)) {
+			const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
+			if (drivers != nullptr)
+				for (int cell_index : *drivers)
+					if (unused[cell_index].exchange(false, std::memory_order_relaxed)) {
+						ThreadIndex fake_thread_index = {i++ % num_threads};
+						cell_queue.push(fake_thread_index, cell_index);
+					}
+		}
+		for (auto raw_bit : SigSpec(wire))
+			used_raw_bits.insert(raw_sigmap(raw_bit));
 	}
 
-	unused.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+	std::vector<std::atomic<bool>> mem_unused(module->memories.size());
+	dict<std::string, int> mem_indices;
+	for (int i = 0; i < GetSize(module->memories); ++i) {
+		mem_indices[module->memories.element(i)->first.str()] = i;
+		mem_unused[i].store(true, std::memory_order_relaxed);
+	}
 
-	for (auto cell : unused) {
+	subpool.run([const_module, &sigmap, &wire2driver, &mem2cells, &unused, &cell_queue, &mem_indices, &mem_unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		pool<SigBit> bits;
+		pool<std::string> mems;
+		while (true) {
+			std::vector<int> cell_indices = cell_queue.pop_batch(ctx);
+			if (cell_indices.empty())
+				return;
+			for (auto cell_index : cell_indices) {
+				Cell *cell = const_module->cell_at(cell_index);
+				for (auto &it : cell->connections())
+					if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
+						for (auto bit : sigmap(it.second))
+							bits.insert(bit);
+
+				if (cell->type.in(ID($memrd), ID($memrd_v2))) {
+					std::string mem_id = cell->getParam(ID::MEMID).decode_string();
+					if (mem_indices.count(mem_id)) {
+						int mem_index = mem_indices[mem_id];
+						if (mem_unused[mem_index].exchange(false, std::memory_order_relaxed))
+							mems.insert(mem_id);
+					}
+				}
+			}
+
+			for (auto bit : bits) {
+				const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
+				if (drivers != nullptr)
+					for (int cell_index : *drivers)
+						if (unused[cell_index].exchange(false, std::memory_order_relaxed))
+							cell_queue.push(ctx, cell_index);
+			}
+			bits.clear();
+
+			for (auto mem : mems) {
+				if (mem2cells.count(mem) == 0)
+					continue;
+				for (int cell_index : mem2cells.at(mem))
+					if (unused[cell_index].exchange(false, std::memory_order_relaxed))
+						cell_queue.push(ctx, cell_index);
+			}
+			mems.clear();
+		}
+	});
+
+	ShardedVector<int> sharded_unused_cells(subpool);
+	subpool.run([const_module, &unused, &sharded_unused_cells, &wire2driver](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of `wire2driver`
+		wire2driver.clear(ctx);
+		for (int i : ctx.item_range(const_module->cells_size()))
+			if (unused[i].load(std::memory_order_relaxed))
+				sharded_unused_cells.insert(ctx, i);
+	});
+	pool<Cell*> unused_cells;
+	for (int cell_index : sharded_unused_cells)
+		unused_cells.insert(const_module->cell_at(cell_index));
+	unused_cells.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+
+	for (auto cell : unused_cells) {
 		if (verbose)
 			log_debug("  removing unused `%s' cell `%s'.\n", cell->type, cell->name);
 		module->design->scratchpad_set_bool("opt.did_something", true);
@@ -288,28 +410,31 @@ void rmunused_module_cells(Module *module, bool verbose, RmStats &stats, keep_ca
 		stats.count_rm_cells++;
 	}
 
-	for (auto it : mem_unused)
-	{
+	for (const auto &it : mem_indices) {
+		if (!mem_unused[it.second].load(std::memory_order_relaxed))
+			continue;
+		RTLIL::IdString id(it.first);
 		if (verbose)
-			log_debug("  removing unused memory `%s'.\n", it);
-		delete module->memories.at(it);
-		module->memories.erase(it);
+			log_debug("  removing unused memory `%s'.\n", id.unescape());
+		delete module->memories.at(id);
+		module->memories.erase(id);
 	}
 
-	for (auto &it : module->cells_) {
-		Cell *cell = it.second;
-		for (auto &it2 : cell->connections()) {
-			if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
-				continue;
-			for (auto raw_bit : raw_sigmap(it2.second))
-				used_raw_bits.insert(raw_bit);
+	if (!driver_driver_logs.empty()) {
+		// We could do this in parallel but hopefully this is rare.
+		for (auto &it : module->cells_) {
+			Cell *cell = it.second;
+			for (auto &it2 : cell->connections()) {
+				if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : raw_sigmap(it2.second))
+					used_raw_bits.insert(raw_bit);
+			}
+		}
+		for (std::pair<SigBit, std::string> &it : driver_driver_logs) {
+			if (used_raw_bits.count(it.first))
+				log_warning("%s\n", it.second);
 		}
-	}
-
-	for (auto it : driver_driver_logs) {
-		if (used_raw_bits.count(it.first))
-			for (auto msg : it.second)
-				log_warning("%s\n", msg);
 	}
 }
 
@@ -760,7 +885,7 @@ void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_p
 	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
 	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
 	remove_temporary_cells(module, subpool, verbose);
-	rmunused_module_cells(module, verbose, stats, keep_cache);
+	rmunused_module_cells(module, subpool, verbose, stats, keep_cache);
 	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
 
 	if (rminit && rmunused_module_init(module, subpool, verbose))

From e5cf1a90f1101773ca06750e829feb554ec632f1 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Feb 2026 19:23:10 +0000
Subject: [PATCH 19/26] Add test that connects a wire with `init` to a constant

---
 tests/opt/opt_clean_init_const.ys | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 tests/opt/opt_clean_init_const.ys

diff --git a/tests/opt/opt_clean_init_const.ys b/tests/opt/opt_clean_init_const.ys
new file mode 100644
index 000000000..1b3d5db63
--- /dev/null
+++ b/tests/opt/opt_clean_init_const.ys
@@ -0,0 +1,9 @@
+read_rtlil << EOT
+module \top
+  attribute \init 1'0
+  wire \w
+
+  connect \w 1'0
+end
+EOT
+opt_clean

From fb05c13b338d70a18110361130b31a52ac853bab Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Wed, 28 Jan 2026 22:59:44 +0000
Subject: [PATCH 20/26] Pass the module `Subpool` to `rmunused_module_signals`
 and parallelize that function

---
 passes/opt/opt_clean.cc | 482 +++++++++++++++++++++++++++-------------
 1 file changed, 327 insertions(+), 155 deletions(-)

diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index e822d13b1..7c2377b10 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -448,9 +448,62 @@ int count_nontrivial_wire_attrs(RTLIL::Wire *w)
 	return count;
 }
 
+struct ShardedSigBit {  // Wrapper that lets a single `RTLIL::SigBit` be stored as an element of a `ShardedHashSet`.
+	using Accumulated = ShardedSigBit;  // NOTE(review): presumably an alias `ShardedHashSet` requires of its element type -- confirm against its definition.
+	RTLIL::SigBit bit;  // The wrapped bit; equality of two elements is equality of this member.
+	ShardedSigBit() = default;
+	ShardedSigBit(const RTLIL::SigBit &bit) : bit(bit) {}  // Non-explicit: call sites brace-initialize `{bit, hash}` values straight from a SigBit.
+};
+struct ShardedSigBitEquality {  // Equality functor for `ShardedSigBit` elements.
+	bool operator()(const ShardedSigBit &b1, const ShardedSigBit &b2) const {
+		return b1.bit == b2.bit;  // Two elements are equal exactly when the wrapped bits are equal.
+	}
+};
+using ShardedSigPool = ShardedHashSet<ShardedSigBit, ShardedSigBitEquality>;  // Hash set of signal bits; lookups pass the value together with its precomputed hash.
+
+struct ShardedSigSpec {  // Wrapper that lets a whole `RTLIL::SigSpec` be stored as an element of a `ShardedHashSet`.
+	using Accumulated = ShardedSigSpec;  // NOTE(review): presumably an alias `ShardedHashSet` requires of its element type -- confirm against its definition.
+	RTLIL::SigSpec spec;  // The wrapped signal; equality of two elements is equality of this member.
+	ShardedSigSpec() = default;
+	ShardedSigSpec(RTLIL::SigSpec spec) : spec(std::move(spec)) {}  // Sink parameter: taken by value and moved into the member.
+	ShardedSigSpec(ShardedSigSpec &&) = default;  // Declaring the move constructor makes the implicit copy operations deleted, so elements can be moved but not copied.
+};
+struct ShardedSigSpecEquality {  // Equality functor for `ShardedSigSpec` elements.
+	bool operator()(const ShardedSigSpec &s1, const ShardedSigSpec &s2) const {
+		return s1.spec == s2.spec;  // Two elements are equal exactly when the wrapped SigSpecs are equal.
+	}
+};
+using ShardedSigSpecPool = ShardedHashSet<ShardedSigSpec, ShardedSigSpecEquality>;  // Hash set of whole signals; lookups pass the value together with its precomputed hash.
+
+struct DirectWires {  // Lazily answers "is this wire direct?" (an input port, or its mapped signal is in `direct_sigs`) and memoizes the answer per wire.
+	const SigMap &assign_map;  // Borrowed, not owned: maps a wire to the signal that is looked up in `direct_sigs`.
+	const ShardedSigSpecPool &direct_sigs;  // Borrowed, not owned: the set of signals that count as direct.
+	dict<RTLIL::Wire *, bool> cache;  // Memoized `is_direct` results; input ports are answered before the cache is consulted and never stored.
+
+	DirectWires(const SigMap &assign_map, const ShardedSigSpecPool &direct_sigs) : assign_map(assign_map), direct_sigs(direct_sigs) {}
+	void cache_result_for_bit(const SigBit &bit) {  // Pre-populates the cache for the wire of `bit`; constant bits (no wire) are ignored.
+		if (bit.wire != nullptr)
+			is_direct(bit.wire);  // Called only for its caching side effect; the result is discarded.
+	}
+	bool is_direct(RTLIL::Wire *wire) {  // Returns true for input ports, otherwise whether `assign_map(wire)` is a member of `direct_sigs`.
+		if (wire->port_input)
+			return true;
+		auto it = cache.find(wire);
+		if (it != cache.end())
+			return it->second;  // Cache hit: the stored answer is returned without consulting `assign_map` again.
+		SigSpec direct_sig = assign_map(wire);
+		bool direct = direct_sigs.find({direct_sig, direct_sig.hash_into(Hasher()).yield()}) != nullptr;  // Lookup key is the signal plus its hash.
+		cache.insert({wire, direct});  // Mutates the cache, so this method is non-const.
+		return direct;
+	}
+};
+
 // Should we pick `s2` over `s1` to represent a signal?
-bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool &regs, SigPool &conns, pool<RTLIL::Wire*> &direct_wires)
+bool compare_signals(const RTLIL::SigBit &s1, const RTLIL::SigBit &s2, const ShardedSigPool &regs, const ShardedSigPool &conns, DirectWires &direct_wires)
 {
+	if (s1 == s2)
+		return false;
+
 	RTLIL::Wire *w1 = s1.wire;
 	RTLIL::Wire *w2 = s2.wire;
 
@@ -464,12 +517,20 @@ bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool &regs, SigPoo
 		return !(w2->port_input && w2->port_output);
 
 	if (w1->name.isPublic() && w2->name.isPublic()) {
-		if (regs.check(s1) != regs.check(s2))
-			return regs.check(s2);
-		if (direct_wires.count(w1) != direct_wires.count(w2))
-			return direct_wires.count(w2) != 0;
-		if (conns.check_any(s1) != conns.check_any(s2))
-			return conns.check_any(s2);
+		ShardedSigPool::AccumulatedValue s1_val = {s1, s1.hash_top().yield()};
+		ShardedSigPool::AccumulatedValue s2_val = {s2, s2.hash_top().yield()};
+		bool regs1 = regs.find(s1_val) != nullptr;
+		bool regs2 = regs.find(s2_val) != nullptr;
+		if (regs1 != regs2)
+			return regs2;
+		bool w1_direct = direct_wires.is_direct(w1);
+		bool w2_direct = direct_wires.is_direct(w2);
+		if (w1_direct != w2_direct)
+			return w2_direct;
+		bool conns1 = conns.find(s1_val) != nullptr;
+		bool conns2 = conns.find(s2_val) != nullptr;
+		if (conns1 != conns2)
+			return conns2;
 	}
 
 	if (w1 == w2)
@@ -502,109 +563,185 @@ bool check_public_name(RTLIL::IdString id)
 	return true;
 }
 
-bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose, RmStats &stats)
-{
-	// `register_signals` and `connected_signals` will help us decide later on
-	// on picking representatives out of groups of connected signals
-	SigPool register_signals;
-	SigPool connected_signals;
-	if (!purge_mode)
-		for (auto &it : module->cells_) {
-			RTLIL::Cell *cell = it.second;
-			if (ct_reg.cell_known(cell->type)) {
-				bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
-				for (auto &it2 : cell->connections())
-					if (clk2fflogic ? it2.first == ID::D : ct_reg.cell_output(cell->type, it2.first))
-						register_signals.add(it2.second);
-			}
-			for (auto &it2 : cell->connections())
-				connected_signals.add(it2.second);
-		}
+void add_spec(ShardedSigPool::Builder &builder, const ThreadIndex &thread, const RTLIL::SigSpec &spec) {  // Inserts every wire bit of `spec` into `builder` on behalf of `thread`.
+	for (SigBit bit : spec)
+		if (bit.wire != nullptr)  // Constant bits (no wire) are never inserted.
+			builder.insert(thread, {bit, bit.hash_top().yield()});  // The element is handed over together with its precomputed hash.
+}
 
+bool check_any(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) {  // Returns true if at least one bit of `spec` is found in `sigs`.
+	for (SigBit b : spec)
+		if (sigs.find({b, b.hash_top().yield()}) != nullptr)  // `find` takes the bit plus its hash and yields nullptr when absent.
+			return true;
+	return false;  // Also the result for an empty `spec`.
+}
+
+bool check_all(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) {  // Returns true if every wire bit of `spec` is found in `sigs`; constant bits are ignored.
+	for (SigBit b : spec)
+		if (b.wire != nullptr && sigs.find({b, b.hash_top().yield()}) == nullptr)  // `add_spec` never inserts constant bits, so they must not be reported as missing.
+			return false;
+	return true;  // Also the result for an empty or all-constant `spec`.
+}
+
+bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool purge_mode, bool verbose, RmStats &stats)
+{
 	SigMap assign_map(module);
 
+	const RTLIL::Module *const_module = module;
+	// `register_signals` and `connected_signals` will help us decide later on
+	// on picking representatives out of groups of connected signals
+	ShardedSigPool::Builder register_signals_builder(subpool);
+	ShardedSigPool::Builder connected_signals_builder(subpool);
 	// construct a pool of wires which are directly driven by a known celltype,
 	// this will influence our choice of representatives
-	pool<RTLIL::Wire*> direct_wires;
-	{
-		pool<RTLIL::SigSpec> direct_sigs;
-		for (auto &it : module->cells_) {
-			RTLIL::Cell *cell = it.second;
+	ShardedSigSpecPool::Builder direct_sigs_builder(subpool);
+	subpool.run([const_module, purge_mode, &assign_map, &direct_sigs_builder, &register_signals_builder, &connected_signals_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (!purge_mode) {
+				if (ct_reg.cell_known(cell->type)) {
+					bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
+					for (auto &it2 : cell->connections())
+						if (clk2fflogic ? it2.first == ID::D : ct_reg.cell_output(cell->type, it2.first))
+							add_spec(register_signals_builder, ctx, it2.second);
+				}
+				for (auto &it2 : cell->connections())
+					add_spec(connected_signals_builder, ctx, it2.second);
+			}
 			if (ct_all.cell_known(cell->type))
 				for (auto &it2 : cell->connections())
-					if (ct_all.cell_output(cell->type, it2.first))
-						direct_sigs.insert(assign_map(it2.second));
+					if (ct_all.cell_output(cell->type, it2.first)) {
+						RTLIL::SigSpec spec = assign_map(it2.second);
+						unsigned int hash = spec.hash_into(Hasher()).yield();
+						direct_sigs_builder.insert(ctx, {std::move(spec), hash});
+					}
 		}
-		for (auto &it : module->wires_) {
-			if (direct_sigs.count(assign_map(it.second)) || it.second->port_input)
-				direct_wires.insert(it.second);
-		}
-	}
+	});
+	subpool.run([&register_signals_builder, &connected_signals_builder, &direct_sigs_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		register_signals_builder.process(ctx);
+		connected_signals_builder.process(ctx);
+		direct_sigs_builder.process(ctx);
+	});
+	ShardedSigPool register_signals(register_signals_builder);
+	ShardedSigPool connected_signals(connected_signals_builder);
+	ShardedSigSpecPool direct_sigs(direct_sigs_builder);
 
-	// weight all options for representatives with `compare_signals`,
-	// the one that wins will be what `assign_map` maps to
-	for (auto &it : module->wires_) {
-		RTLIL::Wire *wire = it.second;
-		for (int i = 0; i < wire->width; i++) {
-			RTLIL::SigBit s1 = RTLIL::SigBit(wire, i), s2 = assign_map(s1);
-			if (compare_signals(s2, s1, register_signals, connected_signals, direct_wires))
-				assign_map.add(s1);
+	ShardedVector<RTLIL::SigBit> sigmap_canonical_candidates(subpool);
+	DirectWires direct_wires(assign_map, direct_sigs);
+	subpool.run([const_module, &assign_map, &register_signals, &connected_signals, &sigmap_canonical_candidates, &direct_sigs, &direct_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		std::optional<DirectWires> local_direct_wires;
+		DirectWires *this_thread_direct_wires = &direct_wires;
+		if (ctx.thread_num > 0) {
+			local_direct_wires.emplace(assign_map, direct_sigs);
+			this_thread_direct_wires = &local_direct_wires.value();
 		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			for (int j = 0; j < wire->width; ++j) {
+				RTLIL::SigBit s1(wire, j);
+				RTLIL::SigBit s2 = assign_map(s1);
+				if (compare_signals(s2, s1, register_signals, connected_signals, *this_thread_direct_wires))
+					sigmap_canonical_candidates.insert(ctx, s1);
+			}
+		}
+	});
+	// Cache all the direct_wires results that we might possibly need. This avoids the results
+	// changing when we update `assign_map` below.
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		direct_wires.cache_result_for_bit(candidate);
+		direct_wires.cache_result_for_bit(assign_map(candidate));
+	}
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		RTLIL::SigBit current_canonical = assign_map(candidate);
+		if (compare_signals(current_canonical, candidate, register_signals, connected_signals, direct_wires))
+			assign_map.add(candidate);
 	}
 
 	// we are removing all connections
 	module->connections_.clear();
 
 	// used signals sigmapped
-	SigPool used_signals;
+	ShardedSigPool::Builder used_signals_builder(subpool);
 	// used signals pre-sigmapped
-	SigPool raw_used_signals;
+	ShardedSigPool::Builder raw_used_signals_builder(subpool);
 	// used signals sigmapped, ignoring drivers (we keep track of this to set `unused_bits`)
-	SigPool used_signals_nodrivers;
-
-	// gather the usage information for cells
-	for (auto &it : module->cells_) {
-		RTLIL::Cell *cell = it.second;
-		for (auto &it2 : cell->connections_) {
-			assign_map.apply(it2.second); // modify the cell connection in place
-			raw_used_signals.add(it2.second);
-			used_signals.add(it2.second);
-			if (!ct_all.cell_output(cell->type, it2.first))
-				used_signals_nodrivers.add(it2.second);
-		}
-	}
-
-	// gather the usage information for ports, wires with `keep`,
+	ShardedSigPool::Builder used_signals_nodrivers_builder(subpool);
+	struct UpdateConnection {
+		RTLIL::Cell *cell;
+		RTLIL::IdString port;
+		RTLIL::SigSpec spec;
+	};
+	ShardedVector<UpdateConnection> update_connections(subpool);
+	ShardedVector<RTLIL::Wire*> initialized_wires(subpool);
+	// gather the usage information for cells and update cell connections
+	// also gather the usage information for ports, wires with `keep`
 	// also gather init bits
+	subpool.run([const_module, &register_signals, &connected_signals, &direct_sigs, &assign_map, &used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder, &update_connections, &initialized_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of these sharded structures
+		register_signals.clear(ctx);
+		connected_signals.clear(ctx);
+		direct_sigs.clear(ctx);
+
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			for (const auto &it2 : cell->connections_) {
+				SigSpec spec = assign_map(it2.second);
+				if (spec != it2.second)
+					update_connections.insert(ctx, {cell, it2.first, spec});
+				add_spec(raw_used_signals_builder, ctx, spec);
+				add_spec(used_signals_builder, ctx, spec);
+				if (!ct_all.cell_output(cell->type, it2.first))
+					add_spec(used_signals_nodrivers_builder, ctx, spec);
+			}
+		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			if (wire->port_id > 0) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				add_spec(raw_used_signals_builder, ctx, sig);
+				assign_map.apply(sig);
+				add_spec(used_signals_builder, ctx, sig);
+				if (!wire->port_input)
+					add_spec(used_signals_nodrivers_builder, ctx, sig);
+			}
+			if (wire->get_bool_attribute(ID::keep)) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				assign_map.apply(sig);
+				add_spec(used_signals_builder, ctx, sig);
+			}
+			auto it2 = wire->attributes.find(ID::init);
+			if (it2 != wire->attributes.end())
+				initialized_wires.insert(ctx, wire);
+		}
+	});
+	subpool.run([&used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		used_signals_builder.process(ctx);
+		raw_used_signals_builder.process(ctx);
+		used_signals_nodrivers_builder.process(ctx);
+	});
+	ShardedSigPool used_signals(used_signals_builder);
+	ShardedSigPool raw_used_signals(raw_used_signals_builder);
+	ShardedSigPool used_signals_nodrivers(used_signals_nodrivers_builder);
+
 	dict<RTLIL::SigBit, RTLIL::State> init_bits;
-	for (auto &it : module->wires_) {
-		RTLIL::Wire *wire = it.second;
-		if (wire->port_id > 0) {
-			RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-			raw_used_signals.add(sig);
-			assign_map.apply(sig);
-			used_signals.add(sig);
-			if (!wire->port_input)
-				used_signals_nodrivers.add(sig);
-		}
-		if (wire->get_bool_attribute(ID::keep)) {
-			RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-			assign_map.apply(sig);
-			used_signals.add(sig);
-		}
-		auto it2 = wire->attributes.find(ID::init);
-		if (it2 != wire->attributes.end()) {
-			RTLIL::Const &val = it2->second;
-			SigSpec sig = assign_map(wire);
-			for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
-				if (val[i] != State::Sx)
-					init_bits[sig[i]] = val[i];
-			wire->attributes.erase(it2);
-		}
+	// The wires that appear in the keys of `init_bits`
+	pool<Wire*> init_bits_wires;
+	for (UpdateConnection &update : update_connections)  // Apply the collected connection rewrites; non-const reference so the `std::move` below really moves instead of copying.
+		update.cell->connections_.at(update.port) = std::move(update.spec);
+	for (RTLIL::Wire *initialized_wire : initialized_wires) {  // Transfer each collected wire's `init` value onto the bits it maps to.
+		auto it = initialized_wire->attributes.find(ID::init);
+		RTLIL::Const &val = it->second;
+		SigSpec sig = assign_map(initialized_wire);
+		for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)  // Only the overlap of the init value and the wire width is considered.
+			if (val[i] != State::Sx && sig[i].wire != nullptr) {  // Skip undefined init bits and bits that map to a constant.
+				init_bits[sig[i]] = val[i];
+				init_bits_wires.insert(sig[i].wire);  // Remember which wires own keys of `init_bits`.
+			}
+		initialized_wire->attributes.erase(it);  // `val` refers into the erased entry and must not be used past this point.
+	}
 
 	// set init attributes on all wires of a connected group
-	for (auto wire : module->wires()) {
+	for (RTLIL::Wire *wire : init_bits_wires) {
 		bool found = false;
 		Const val(State::Sx, wire->width);
 		for (int i = 0; i < wire->width; i++) {
@@ -619,81 +756,117 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	}
 
 	// now decide for each wire if we should be deleting it
-	pool<RTLIL::Wire*> del_wires_queue;
-	for (auto wire : module->wires())
-	{
-		SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
-		log_assert(GetSize(s1) == GetSize(s2));
+	ShardedVector<RTLIL::Wire*> del_wires(subpool);
+	ShardedVector<RTLIL::Wire*> remove_init(subpool);
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_init(subpool);
+	ShardedVector<RTLIL::SigSig> connections(subpool);
+	ShardedVector<RTLIL::Wire*> remove_unused_bits(subpool);
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_unused_bits(subpool);
+	subpool.run([const_module, purge_mode, &assign_map, &used_signals, &raw_used_signals, &used_signals_nodrivers, &del_wires, &remove_init, &set_init, &connections, &remove_unused_bits, &set_unused_bits](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
+			log_assert(GetSize(s1) == GetSize(s2));
 
-		Const initval;
-		if (wire->attributes.count(ID::init))
-			initval = wire->attributes.at(ID::init);
-		if (GetSize(initval) != GetSize(wire))
-			initval.resize(GetSize(wire), State::Sx);
-		if (initval.is_fully_undef())
-			wire->attributes.erase(ID::init);
+			Const initval;
+			bool has_init_attribute = wire->attributes.count(ID::init);
+			bool init_changed = false;
+			if (has_init_attribute)
+				initval = wire->attributes.at(ID::init);
+			if (GetSize(initval) != GetSize(wire)) {
+				initval.resize(GetSize(wire), State::Sx);
+				init_changed = true;
+			}
 
-		if (GetSize(wire) == 0) {
-			// delete zero-width wires, unless they are module ports
-			if (wire->port_id == 0)
+			if (GetSize(wire) == 0) {
+				// delete zero-width wires, unless they are module ports
+				if (wire->port_id == 0)
+					goto delete_this_wire;
+			} else
+			if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
+				// do not delete anything with "keep" or module ports or initialized wires
+			} else
+			if (!purge_mode && check_public_name(wire->name) && (check_any(raw_used_signals, s1) || check_any(used_signals, s2) || s1 != s2)) {
+				// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
+			} else
+			if (!check_any(raw_used_signals, s1)) {
+				// delete wires that aren't used by anything directly
 				goto delete_this_wire;
-		} else
-		if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
-			// do not delete anything with "keep" or module ports or initialized wires
-		} else
-		if (!purge_mode && check_public_name(wire->name) && (raw_used_signals.check_any(s1) || used_signals.check_any(s2) || s1 != s2)) {
-			// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
-		} else
-		if (!raw_used_signals.check_any(s1)) {
-			// delete wires that aren't used by anything directly
-			goto delete_this_wire;
-		}
-
-		if (0)
-		{
-	delete_this_wire:
-			del_wires_queue.insert(wire);
-		}
-		else
-		{
-			RTLIL::SigSig new_conn;
-			for (int i = 0; i < GetSize(s1); i++)
-				if (s1[i] != s2[i]) {
-					if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
-						s2[i] = initval[i];
-						initval.set(i, State::Sx);
-					}
-					new_conn.first.append(s1[i]);
-					new_conn.second.append(s2[i]);
-				}
-			if (new_conn.first.size() > 0) {
-				if (initval.is_fully_undef())
-					wire->attributes.erase(ID::init);
-				else
-					wire->attributes.at(ID::init) = initval;
-				module->connect(new_conn);
 			}
 
-			if (!used_signals_nodrivers.check_all(s2)) {
+			if (0)
+			{
+		delete_this_wire:
+				del_wires.insert(ctx, wire);
+			}
+			else
+			{
+				RTLIL::SigSig new_conn;
+				for (int i = 0; i < GetSize(s1); i++)
+					if (s1[i] != s2[i]) {
+						if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
+							s2[i] = initval[i];
+							initval.set(i, State::Sx);
+							init_changed = true;
+						}
+						new_conn.first.append(s1[i]);
+						new_conn.second.append(s2[i]);
+					}
+				if (new_conn.first.size() > 0)
+					connections.insert(ctx, std::move(new_conn));
+				if (initval.is_fully_undef()) {
+					if (has_init_attribute)
+						remove_init.insert(ctx, wire);
+				} else
+					if (init_changed)
+						set_init.insert(ctx, {wire, std::move(initval)});
+
 				std::string unused_bits;
-				for (int i = 0; i < GetSize(s2); i++) {
-					if (s2[i].wire == NULL)
-						continue;
-					if (!used_signals_nodrivers.check(s2[i])) {
-						if (!unused_bits.empty())
-							unused_bits += " ";
-						unused_bits += stringf("%d", i);
+				if (!check_all(used_signals_nodrivers, s2)) {
+					for (int i = 0; i < GetSize(s2); i++) {
+						if (s2[i].wire == NULL)
+							continue;
+						SigBit b = s2[i];
+						if (used_signals_nodrivers.find({b, b.hash_top().yield()}) == nullptr) {
+							if (!unused_bits.empty())
+								unused_bits += " ";
+							unused_bits += stringf("%d", i);
+						}
 					}
 				}
-				if (unused_bits.empty() || wire->port_id != 0)
-					wire->attributes.erase(ID::unused_bits);
-				else
-					wire->attributes[ID::unused_bits] = RTLIL::Const(unused_bits);
-			} else {
-				wire->attributes.erase(ID::unused_bits);
+				if (unused_bits.empty() || wire->port_id != 0) {
+					if (wire->attributes.count(ID::unused_bits))
+						remove_unused_bits.insert(ctx, wire);
+				} else {
+					RTLIL::Const unused_bits_const(std::move(unused_bits));
+					if (wire->attributes.count(ID::unused_bits)) {
+						RTLIL::Const &unused_bits_attr = wire->attributes.at(ID::unused_bits);
+						if (unused_bits_attr != unused_bits_const)
+							set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+					} else
+						set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+				}
 			}
 		}
-	}
+	});
+	pool<RTLIL::Wire*> del_wires_queue;
+	del_wires_queue.insert(del_wires.begin(), del_wires.end());
+	for (RTLIL::Wire *wire : remove_init)
+		wire->attributes.erase(ID::init);
+	for (auto &p : set_init)
+		p.first->attributes[ID::init] = std::move(p.second);
+	for (auto &conn : connections)
+		module->connect(std::move(conn));
+	for (RTLIL::Wire *wire : remove_unused_bits)
+		wire->attributes.erase(ID::unused_bits);
+	for (auto &p : set_unused_bits)
+		p.first->attributes[ID::unused_bits] = std::move(p.second);
+
+	subpool.run([&used_signals, &raw_used_signals, &used_signals_nodrivers](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		used_signals.clear(ctx);
+		raw_used_signals.clear(ctx);
+		used_signals_nodrivers.clear(ctx);
+	});
 
 	int del_temp_wires_count = 0;
 	for (auto wire : del_wires_queue) {
@@ -886,12 +1059,11 @@ void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_p
 	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads);
 	remove_temporary_cells(module, subpool, verbose);
 	rmunused_module_cells(module, subpool, verbose, stats, keep_cache);
-	while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
+	while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { }
 
 	if (rminit && rmunused_module_init(module, subpool, verbose))
-		while (rmunused_module_signals(module, purge_mode, verbose, stats)) { }
+		while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { }
 }
-
 struct OptCleanPass : public Pass {
 	OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { }
 	void help() override

From a1aa9ab4aa0b244faf3828a326110d311cbed69e Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Feb 2026 21:24:15 +0000
Subject: [PATCH 21/26] Make gmock available in test environment

---
 .github/actions/setup-build-env/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml
index 60fe481e7..fd25ae68e 100644
--- a/.github/actions/setup-build-env/action.yml
+++ b/.github/actions/setup-build-env/action.yml
@@ -58,7 +58,7 @@ runs:
       if: runner.os == 'Linux' && inputs.get-test-deps == 'true'
       uses: awalsh128/cache-apt-pkgs-action@v1.6.0
       with:
-        packages: libgtest-dev
+        packages: libgtest-dev libgmock-dev
         version: ${{ inputs.runs-on }}-testys
 
     - name: Install macOS Dependencies

From f34c6fec19fd2e76c5d85061139a8fdfc7d03eab Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Mon, 2 Feb 2026 23:04:34 +0000
Subject: [PATCH 22/26] Add unit tests for `ParallelDispatchThreadPool` and friends

---
 kernel/threading.h                 |   3 +
 tests/unit/Makefile                |   4 +-
 tests/unit/kernel/threadingTest.cc | 164 +++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/kernel/threadingTest.cc

diff --git a/kernel/threading.h b/kernel/threading.h
index 82a65676d..3d6495720 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -172,6 +172,9 @@ struct IntRange {
 	};
 	Int begin() const { return {start_}; }
 	Int end() const { return {end_}; }
+
+	bool operator==(const IntRange &other) const { return start_ == other.start_ && end_ == other.end_; }  // Two ranges are equal exactly when both bounds match.
+	bool operator!=(const IntRange &other) const { return !(*this == other); }  // Defined as the negation of `operator==`.
 };
 // Divides some number of items into `num_threads` subranges and returns the
 // `thread_num`'th subrange. If `num_threads` is zero, returns the whole range.
diff --git a/tests/unit/Makefile b/tests/unit/Makefile
index b275d7f41..3165ad97b 100644
--- a/tests/unit/Makefile
+++ b/tests/unit/Makefile
@@ -4,10 +4,10 @@ UNAME_S := $(shell uname -s)
 GTEST_PREFIX := $(shell brew --prefix googletest 2>/dev/null)
 ifeq ($(GTEST_PREFIX),)
   GTEST_CXXFLAGS :=
-  GTEST_LDFLAGS := -lgtest -lgtest_main
+  GTEST_LDFLAGS := -lgtest -lgmock -lgtest_main
 else
   GTEST_CXXFLAGS := -I$(GTEST_PREFIX)/include
-  GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgtest_main
+  GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgmock -lgtest_main
 endif
 
 ifeq ($(UNAME_S),Darwin)
diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
new file mode 100644
index 000000000..b26a08fcc
--- /dev/null
+++ b/tests/unit/kernel/threadingTest.cc
@@ -0,0 +1,164 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include "kernel/threading.h"
+
+YOSYS_NAMESPACE_BEGIN
+
+class ThreadingTest : public testing::Test {  // Fixture used by every TEST_F in this file.
+protected:
+	ThreadingTest() {
+		if (log_files.empty())  // Register stdout as a log file when none is registered yet.
+			log_files.emplace_back(stdout);
+	}
+};
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolCreate) {  // Checks `num_threads()` for requested pool sizes 0, 1 and 2.
+	// Test creating a pool with 0 threads (treated as 1)
+	ParallelDispatchThreadPool pool0(0);
+	EXPECT_EQ(pool0.num_threads(), 1);
+
+	// Test creating a pool with 1 thread
+	ParallelDispatchThreadPool pool1(1);
+	EXPECT_EQ(pool1.num_threads(), 1);
+
+	// Test creating a pool with 2 threads
+	ParallelDispatchThreadPool pool2(2);
+	// YOSYS_MAX_THREADS or system configuration could mean we
+	// decide to only use one thread.
+	EXPECT_GE(pool2.num_threads(), 1);
+	EXPECT_LE(pool2.num_threads(), 2);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunSimple) {  // A single `run()` must invoke the callback once per pool thread.
+	ParallelDispatchThreadPool pool(2);
+
+	std::atomic<int> counter{0};
+	pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+		counter.fetch_add(1, std::memory_order_relaxed);
+	});
+
+	EXPECT_EQ(counter.load(), pool.num_threads());
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunMultiple) {  // Five `run()` calls on one pool must each invoke the callback once per thread.
+	ParallelDispatchThreadPool pool(2);
+
+	std::atomic<int> counter{0};
+	// Run multiple times to verify the pool can be reused
+	for (int i = 0; i < 5; ++i)
+		pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+			counter.fetch_add(1, std::memory_order_relaxed);
+		});
+
+	EXPECT_EQ(counter.load(), pool.num_threads() * 5);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunCtxThreadNums) {  // Every value in [0, num_threads) must show up as some callback's `ctx.thread_num`.
+	ParallelDispatchThreadPool pool(4);
+
+	std::vector<int> thread_nums(pool.num_threads(), -1);  // -1 marks a slot no thread has written yet.
+	pool.run([&thread_nums](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		thread_nums[ctx.thread_num] = ctx.thread_num;  // Each invocation writes only the slot indexed by its own thread number.
+	});
+
+	// Every thread should have recorded its own thread number
+	for (int i = 0; i < pool.num_threads(); ++i)
+		EXPECT_EQ(thread_nums[i], i);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolItemRange) {  // The per-thread `ctx.item_range()` results must cover all items with no gaps and no overlap.
+	ParallelDispatchThreadPool pool(3);
+
+	const int num_items = 100;
+	std::vector<std::atomic<int>> item_counts(num_items);
+	for (std::atomic<int> &c : item_counts)
+		c.store(0);  // Explicitly zero every counter before the run.
+
+	pool.run([&item_counts](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(num_items))
+			item_counts[i].fetch_add(1);  // Count how many times item `i` was handed out.
+	});
+
+	// Each item should have been processed exactly once
+	for (int i = 0; i < num_items; ++i)
+		EXPECT_EQ(item_counts[i].load(), 1);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolSubpool) {  // A subpool capped at 2 threads must run the callback once per subpool thread.
+	ParallelDispatchThreadPool pool(4);
+
+	// Subpool limited to 2 threads
+	ParallelDispatchThreadPool::Subpool subpool(pool, 2);
+	EXPECT_LE(subpool.num_threads(), 2);
+
+	std::atomic<int> counter{0};
+	subpool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+		counter.fetch_add(1, std::memory_order_relaxed);
+	});
+
+	EXPECT_EQ(counter.load(), subpool.num_threads());
+}
+
+TEST_F(ThreadingTest, IntRangeIteration) {  // Iterating {3, 7} must yield 3..6: the end bound is exclusive.
+	IntRange range{3, 7};
+	std::vector<int> values;
+	for (int i : range)
+		values.push_back(i);
+	EXPECT_THAT(values, testing::ElementsAre(3, 4, 5, 6));
+}
+
+TEST_F(ThreadingTest, IntRangeEmpty) {  // A range whose bounds are equal must not yield any iteration.
+	IntRange range{5, 5};
+	for ([[maybe_unused]] int _ : range)  // The loop variable is never read; the attribute silences unused-variable warnings.
+		FAIL();
+}
+
+TEST_F(ThreadingTest, ItemRangeForWorker) {  // 10 items over 3 workers are expected to split into contiguous ranges of 4, 3 and 3 items.
+	EXPECT_EQ(item_range_for_worker(10, 0, 3), (IntRange{0, 4}));
+	EXPECT_EQ(item_range_for_worker(10, 1, 3), (IntRange{4, 7}));
+	EXPECT_EQ(item_range_for_worker(10, 2, 3), (IntRange{7, 10}));
+}
+
+TEST_F(ThreadingTest, ItemRangeForWorkerZeroThreads) {  // With a thread count of zero the whole item range is expected back.
+	EXPECT_EQ(item_range_for_worker(10, 0, 0), (IntRange{0, 10}));
+}
+
+TEST_F(ThreadingTest, ShardedVectorBasic) {  // Elements are expected back grouped by inserting thread number, in insertion order within each thread.
+	ParallelDispatchThreadPool pool(2);
+	ShardedVector<int> vec(pool);
+	pool.run([&vec](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		vec.insert(ctx, ctx.thread_num * 10);  // Thread 0 inserts 0 and 1; thread 1 inserts 10 and 11.
+		vec.insert(ctx, ctx.thread_num * 10 + 1);
+	});
+
+	EXPECT_FALSE(vec.empty());
+
+	// Count elements
+	std::vector<int> elements;
+	for (int v : vec) {
+		elements.push_back(v);
+	}
+
+	if (pool.num_threads() == 2)  // The pool may end up with fewer threads than requested, so both outcomes are accepted.
+		EXPECT_THAT(elements, testing::ElementsAre(0, 1, 10, 11));
+	else
+		EXPECT_THAT(elements, testing::ElementsAre(0, 1));
+}
+
+TEST_F(ThreadingTest, MonotonicFlagBasic) {  // The flag starts unset, and setting it twice leaves it set.
+	MonotonicFlag flag;
+	EXPECT_FALSE(flag.load());
+	flag.set();
+	EXPECT_TRUE(flag.load());
+	flag.set();
+	EXPECT_TRUE(flag.load());
+}
+
+TEST_F(ThreadingTest, MonotonicFlagSetAndReturnOld) {  // `set_and_return_old()` must report the previous state and leave the flag set.
+	MonotonicFlag flag;
+	EXPECT_FALSE(flag.set_and_return_old());
+	EXPECT_TRUE(flag.load());
+	EXPECT_TRUE(flag.set_and_return_old());
+}
+
+YOSYS_NAMESPACE_END

From 56c0da2f4310019e07926395281a086526d6df9f Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Mon, 2 Feb 2026 23:16:20 +0000
Subject: [PATCH 23/26] Add unit tests for `ConcurrentQueue` and `ThreadPool`

---
 tests/unit/kernel/threadingTest.cc | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
index b26a08fcc..a3c19edf7 100644
--- a/tests/unit/kernel/threadingTest.cc
+++ b/tests/unit/kernel/threadingTest.cc
@@ -161,4 +161,79 @@ TEST_F(ThreadingTest, MonotonicFlagSetAndReturnOld) {
 	EXPECT_TRUE(flag.set_and_return_old());
 }
 
+TEST_F(ThreadingTest, ConcurrentQueueBasic) {
+	ConcurrentQueue<int> queue;
+	queue.push_back(1);
+	queue.push_back(2);
+	queue.push_back(3);
+
+	auto v1 = queue.pop_front();
+	auto v2 = queue.pop_front();
+	auto v3 = queue.pop_front();
+
+	ASSERT_TRUE(v1.has_value());
+	ASSERT_TRUE(v2.has_value());
+	ASSERT_TRUE(v3.has_value());
+	EXPECT_EQ(*v1, 1);
+	EXPECT_EQ(*v2, 2);
+	EXPECT_EQ(*v3, 3);
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueTryPopEmpty) {
+	ConcurrentQueue<int> queue;
+	auto v = queue.try_pop_front();
+	EXPECT_FALSE(v.has_value());
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueClose) {
+	ConcurrentQueue<int> queue;
+	queue.push_back(42);
+	queue.close();
+
+	// Can still pop existing elements
+	auto v1 = queue.pop_front();
+	ASSERT_TRUE(v1.has_value());
+	EXPECT_EQ(*v1, 42);
+
+	// After close and empty, pop_front returns nullopt
+	auto v2 = queue.pop_front();
+	EXPECT_FALSE(v2.has_value());
+}
+
+TEST_F(ThreadingTest, ThreadPoolCreate) {
+	// pool_size of 0 means no worker threads
+	ThreadPool pool0(0, [](int) {});
+	EXPECT_EQ(pool0.num_threads(), 0);
+
+	// pool_size of 1 means 1 worker thread (when threads are enabled)
+	std::atomic<int> counter{0};
+	{
+		ThreadPool pool1(1, [&counter](int thread_num) {
+			EXPECT_EQ(thread_num, 0);
+			counter.fetch_add(1);
+		});
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	EXPECT_EQ(counter.load(), 1);
+#else
+	EXPECT_EQ(counter.load(), 0);
+#endif
+}
+
+TEST_F(ThreadingTest, ThreadPoolMultipleThreads) {
+	std::atomic<int> counter{0};
+	{
+		ThreadPool pool(2, [&counter](int) {
+			counter.fetch_add(1);
+		});
+		EXPECT_LE(pool.num_threads(), 2);
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	EXPECT_GE(counter.load(), 1);
+	EXPECT_LE(counter.load(), 2);
+#else
+	EXPECT_EQ(counter.load(), 0);
+#endif
+}
+
 YOSYS_NAMESPACE_END

From bb36842e7b5886c7e11bca19922251429ca53767 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Mon, 2 Feb 2026 23:36:41 +0000
Subject: [PATCH 24/26] Add some tests for `ShardedHashSet`

---
 tests/unit/kernel/threadingTest.cc | 133 +++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
index a3c19edf7..3a3a78978 100644
--- a/tests/unit/kernel/threadingTest.cc
+++ b/tests/unit/kernel/threadingTest.cc
@@ -236,4 +236,137 @@ TEST_F(ThreadingTest, ThreadPoolMultipleThreads) {
 #endif
 }
 
+// Helper types for ShardedHashSet tests
+struct IntValue {
+	using Accumulated = IntValue;
+	int value;
+	operator int() const { return value; }
+};
+
+struct IntValueEquality {
+	bool operator()(int a, int b) const { return a == b; }
+};
+
+TEST_F(ThreadingTest, ShardedHashSetBasic) {
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashSet<IntValue, IntValueEquality>;
+	HashSet::Builder builder(pool);
+
+	// Insert some values
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.insert(ctx, {{10}, 10});
+		builder.insert(ctx, {{20}, 20});
+		builder.insert(ctx, {{30}, 30});
+	});
+
+	// Process
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	// Build and lookup
+	HashSet set(builder);
+	const IntValue *found10 = set.find({{10}, 10});
+	const IntValue *found20 = set.find({{20}, 20});
+	const IntValue *found99 = set.find({{99}, 99});
+
+	ASSERT_NE(found10, nullptr);
+	ASSERT_NE(found20, nullptr);
+	EXPECT_EQ(found99, nullptr);
+	EXPECT_EQ(*found10, 10);
+	EXPECT_EQ(*found20, 20);
+}
+
+TEST_F(ThreadingTest, ShardedHashSetParallelInsert) {
+	ParallelDispatchThreadPool pool(3);
+
+	using HashSet = ShardedHashSet<IntValue, IntValueEquality>;
+	HashSet::Builder builder(pool);
+
+	// Insert values from multiple threads
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i = 0; i < 10; ++i) {
+			int val = ctx.thread_num * 100 + i;
+			builder.insert(ctx, {{val}, static_cast<unsigned>(val)});
+		}
+	});
+
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+
+	// Verify all values can be found
+	for (int t = 0; t < pool.num_threads(); ++t) {
+		for (int i = 0; i < 10; ++i) {
+			int val = t * 100 + i;
+			const IntValue *found = set.find({{val}, static_cast<unsigned>(val)});
+			ASSERT_NE(found, nullptr) << "Value " << val << " not found";
+			EXPECT_EQ(*found, val);
+		}
+	}
+}
+
+// Helper types for the ShardedHashSet collision test
+struct IntDictValue {
+	using Accumulated = IntDictValue;
+	int key;
+	int value;
+	bool operator==(const IntDictValue &other) const { return key == other.key && value == other.value; }
+	bool operator!=(const IntDictValue &other) const { return !(*this == other); }
+};
+
+struct IntDictKeyEquality {
+	bool operator()(const IntDictValue &a, const IntDictValue &b) const { return a.key == b.key; }
+};
+
+// Collision handler that sums values
+struct SumCollisionHandler {
+	void operator()(IntDictValue &existing, IntDictValue &incoming) const {
+		existing.value += incoming.value;
+	}
+};
+
+TEST_F(ThreadingTest, ShardedHashSetCollision) {
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashSet<IntDictValue, IntDictKeyEquality, SumCollisionHandler>;
+	HashSet::Builder builder(pool);
+
+	// Insert duplicate keys with the same hash - they should merge into one entry
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.insert(ctx, {{5, 10}, 5});
+		builder.insert(ctx, {{5, 12}, 5});  // Duplicate key/hash
+		builder.insert(ctx, {{5, 14}, 5});  // Another duplicate
+	});
+
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+	const IntDictValue *found = set.find({{5, 0}, 5});
+	ASSERT_NE(found, nullptr);
+	// SumCollisionHandler merges the duplicates by summing: 10 + 12 + 14 = 36
+	EXPECT_EQ(*found, (IntDictValue{5, 36}));
+}
+
+TEST_F(ThreadingTest, ShardedHashSetEmpty) {
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashSet<IntValue, IntValueEquality>;
+	HashSet::Builder builder(pool);
+
+	// Don't insert anything, just process
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+	const IntValue *found = set.find({{42}, 42});
+	EXPECT_EQ(found, nullptr);
+}
+
 YOSYS_NAMESPACE_END

From 34d9b28050c473ea221d5cb4543433ceb3fec890 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Tue, 3 Feb 2026 00:02:00 +0000
Subject: [PATCH 25/26] Add unit tests for `ConcurrentWorkQueue`

---
 tests/unit/kernel/threadingTest.cc | 70 ++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
index 3a3a78978..c0bd5927f 100644
--- a/tests/unit/kernel/threadingTest.cc
+++ b/tests/unit/kernel/threadingTest.cc
@@ -369,4 +369,74 @@ TEST_F(ThreadingTest, ShardedHashSetEmpty) {
 	EXPECT_EQ(found, nullptr);
 }
 
+TEST_F(ThreadingTest, ConcurrentWorkQueueSingleThread) {
+	ConcurrentWorkQueue<int> queue(1, 10);  // 1 thread, batch size 10
+	EXPECT_EQ(queue.num_threads(), 1);
+
+	ThreadIndex thread{0};
+
+	// Push some items (less than batch size)
+	for (int i = 0; i < 5; ++i)
+		queue.push(thread, i);
+
+	// Pop should return those items
+	std::vector<int> batch = queue.pop_batch(thread);
+	EXPECT_THAT(batch, testing::UnorderedElementsAre(0, 1, 2, 3, 4));
+
+	// Next pop should return empty (all threads "waiting")
+	std::vector<int> empty_batch = queue.pop_batch(thread);
+	EXPECT_TRUE(empty_batch.empty());
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueBatching) {
+	ConcurrentWorkQueue<int> queue(1, 3);  // batch size 3
+	ThreadIndex thread{0};
+
+	queue.push(thread, 10);
+	queue.push(thread, 20);
+	queue.push(thread, 30);
+	queue.push(thread, 40);
+	queue.push(thread, 50);
+
+	std::vector<int> popped;
+	while (true) {
+		std::vector<int> batch = queue.pop_batch(thread);
+		if (batch.empty())
+			break;
+		popped.insert(popped.end(), batch.begin(), batch.end());
+	}
+	EXPECT_THAT(popped, testing::UnorderedElementsAre(10, 20, 30, 40, 50));
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueParallel) {
+	ParallelDispatchThreadPool pool(2);
+	if (pool.num_threads() < 2) {
+		// Report a skip instead of a silent pass when we don't have multiple threads
+		GTEST_SKIP() << "requires at least 2 threads";
+	}
+
+	ConcurrentWorkQueue<int> queue(2, 3);
+	std::atomic<int> sum{0};
+
+	pool.run([&queue, &sum](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Each thread pushes some work
+		for (int i = 0; i < 10; ++i)
+			queue.push(ctx, ctx.thread_num * 100 + i);
+
+		// Each thread processes work until done
+		while (true) {
+			std::vector<int> batch = queue.pop_batch(ctx);
+			if (batch.empty())
+				break;
+			for (int v : batch)
+				sum.fetch_add(v);
+		}
+	});
+
+	// Thread 0 pushes: 0+1+2+...+9 = 45
+	// Thread 1 pushes: 100+101+...+109 = 1045
+	// Total = 45 + 1045 = 1090
+	EXPECT_EQ(sum.load(), 1090);
+}
+
 YOSYS_NAMESPACE_END

From b438afc2d8439122b0e736ef5e7d59e37e0e0c2e Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <rocallahan@google.com>
Date: Thu, 5 Feb 2026 18:20:31 +0000
Subject: [PATCH 26/26] Add 'init' attributes to RTLIL fuzzing

---
 tests/tools/rtlil-fuzz-grammar.json | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/tools/rtlil-fuzz-grammar.json b/tests/tools/rtlil-fuzz-grammar.json
index c27b160f4..96af9bde3 100644
--- a/tests/tools/rtlil-fuzz-grammar.json
+++ b/tests/tools/rtlil-fuzz-grammar.json
@@ -8,7 +8,7 @@
 			"end\n"
 		]
 	],
-	"<WIRE>": [ [ "  wire width ", "<WIDTH>", " ", "<WIRE_MODE>", " ", "<WIRE_ID>", "\n" ] ],
+	"<WIRE>": [ [ "<WIRE_ATTRIBUTES>", "  wire width ", "<WIDTH>", " ", "<WIRE_MODE>", " ", "<WIRE_ID>", "\n" ] ],
 	"<WIDTH>": [ [ "1" ], [ "2" ], [ "3" ], [ "4" ], [ "32" ], [ "128" ] ],
 	"<WIRE_MODE>": [ [ "input ", "<PORT_ID>" ], [ "output ", "<PORT_ID>" ], [ "inout ", "<PORT_ID>" ], [] ],
 	"<CELL>": [
@@ -71,6 +71,7 @@
 			"  end\n"
 		]
 	],
+	"<WIRE_ATTRIBUTE>": [ [ "  attribute \\init ", "<CONST>", "\n" ] ],
 	"<WIRE_ID>": [ [ "\\wire_a" ], [ "\\wire_b" ], [ "\\wire_c" ], [ "\\wire_d" ], [ "\\wire_e" ], [ "\\wire_f" ], [ "\\wire_g" ], [ "\\wire_h" ], [ "\\wire_i" ], [ "\\wire_j" ] ],
 	"<CELL_ID>": [ [ "\\cell_a" ], [ "\\cell_b" ], [ "\\cell_c" ], [ "\\cell_d" ], [ "\\cell_e" ], [ "\\cell_f" ], [ "\\cell_g" ], [ "\\cell_h" ], [ "\\cell_i" ], [ "\\cell_j" ] ],
 	"<BLACKBOX_CELL>": [ [ "\\bb1" ], [ "\\bb2" ] ],
@@ -97,6 +98,7 @@
 	"<CONNECT>": [ [ "  connect ", "<SIGSPEC>", " ", "<SIGSPEC>", "\n" ] ],
 
 	"<WIRES>": [ [ ], [ "<WIRE>", "<WIRES>" ] ],
+	"<WIRE_ATTRIBUTES>": [ [ ], [ "<WIRE_ATTRIBUTE>", "<WIRE_ATTRIBUTES>" ] ],
 	"<CELLS>": [ [ ], [ "<CELL>", "<CELLS>" ] ],
 	"<BITS>": [ [ ], [ "<BIT>", "<BITS>" ] ],
 	"<CONNECTS>": [ [ ], [ "<CONNECT>", "<CONNECTS>" ] ],