diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml
index 60fe481e7..fd25ae68e 100644
--- a/.github/actions/setup-build-env/action.yml
+++ b/.github/actions/setup-build-env/action.yml
@@ -58,7 +58,7 @@ runs:
       if: runner.os == 'Linux' && inputs.get-test-deps == 'true'
       uses: awalsh128/cache-apt-pkgs-action@v1.6.0
       with:
-        packages: libgtest-dev
+        packages: libgtest-dev libgmock-dev
         version: ${{ inputs.runs-on }}-testys
 
     - name: Install macOS Dependencies
diff --git a/frontends/rtlil/rtlil_frontend.cc b/frontends/rtlil/rtlil_frontend.cc
index a1412d983..7e2ec5460 100644
--- a/frontends/rtlil/rtlil_frontend.cc
+++ b/frontends/rtlil/rtlil_frontend.cc
@@ -286,6 +286,7 @@ struct RTLILFrontendWorker {
 		if (width > MAX_CONST_WIDTH)
 			error("Constant width %lld out of range before `%s`.", width, error_token());
 		bits.reserve(width);
+		int start_idx = idx;
 		while (true) {
 			RTLIL::State bit;
 			switch (line[idx]) {
@@ -300,8 +301,9 @@ struct RTLILFrontendWorker {
 			bits.push_back(bit);
 			++idx;
 		}
-		done:
-		std::reverse(bits.begin(), bits.end());
+	done:
+		if (start_idx < idx)
+			std::reverse(bits.begin(), bits.end());
 
 		if (GetSize(bits) > width)
 			bits.resize(width);
diff --git a/kernel/ffinit.h b/kernel/ffinit.h
index 920fba307..8b4758f60 100644
--- a/kernel/ffinit.h
+++ b/kernel/ffinit.h
@@ -22,6 +22,7 @@
 
 #include "kernel/yosys.h"
 #include "kernel/sigtools.h"
+#include "kernel/threading.h"
 
 YOSYS_NAMESPACE_BEGIN
 
@@ -35,34 +36,55 @@ struct FfInitVals
 		sigmap = sigmap_;
 		initbits.clear();
 		for (auto wire : module->wires())
+			if (wire->attributes.count(ID::init))
+				process_wire(wire);
+	}
+
+	void process_wire(RTLIL::Wire *wire)
+	{
+		SigSpec wirebits = (*sigmap)(wire);
+		Const initval = wire->attributes.at(ID::init);
+
+		for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++)
 		{
-			if (wire->attributes.count(ID::init) == 0)
+			SigBit bit = wirebits[i];
+			State val = initval[i];
+
+			if (val != State::S0 && val != State::S1 && bit.wire != nullptr)
 				continue;
 
-			SigSpec wirebits = (*sigmap)(wire);
-			Const initval = wire->attributes.at(ID::init);
-
-			for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++)
-			{
-				SigBit bit = wirebits[i];
-				State val = initval[i];
-
-				if (val != State::S0 && val != State::S1 && bit.wire != nullptr)
-					continue;
-
-				if (initbits.count(bit)) {
-					if (initbits.at(bit).first != val)
-						log_error("Conflicting init values for signal %s (%s = %s != %s).\n",
-								log_signal(bit), log_signal(SigBit(wire, i)),
-								log_signal(val), log_signal(initbits.at(bit).first));
-					continue;
-				}
-
-				initbits[bit] = std::make_pair(val,SigBit(wire,i));
+			if (initbits.count(bit)) {
+				if (initbits.at(bit).first != val)
+					log_error("Conflicting init values for signal %s (%s = %s != %s).\n",
+							log_signal(bit), log_signal(SigBit(wire, i)),
+							log_signal(val), log_signal(initbits.at(bit).first));
+				continue;
 			}
+
+			initbits[bit] = std::make_pair(val,SigBit(wire,i));
 		}
 	}
 
+	// Resets the table, then collects the wires carrying an `init` attribute using `thread_pool`.
+	void set_parallel(const SigMapView *sigmap_, ParallelDispatchThreadPool &thread_pool, RTLIL::Module *module)
+	{
+		sigmap = sigmap_;
+		initbits.clear();
+		const RTLIL::Module *ro_module = module;
+		const int max_threads = ThreadPool::work_pool_size(0, module->wires_size(), 1000);
+		ParallelDispatchThreadPool::Subpool workers(thread_pool, max_threads);
+		ShardedVector<RTLIL::Wire*> wires_with_init(workers);
+		// Every thread scans its own slice of the wire list and records the matching wires.
+		workers.run([ro_module, &wires_with_init](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			for (int idx : ctx.item_range(ro_module->wires_size()))
+				if (RTLIL::Wire *candidate = ro_module->wire_at(idx); candidate->attributes.count(ID::init))
+					wires_with_init.insert(ctx, candidate);
+		});
+		// The collected wires are then processed one by one on the calling thread.
+		for (RTLIL::Wire *candidate : wires_with_init)
+			process_wire(candidate);
+	}
+
 	RTLIL::State operator()(RTLIL::SigBit bit) const
 	{
 		auto it = initbits.find((*sigmap)(bit));
diff --git a/kernel/log.cc b/kernel/log.cc
index 018a19081..b114f1eaf 100644
--- a/kernel/log.cc
+++ b/kernel/log.cc
@@ -324,6 +324,14 @@ void log_formatted_file_info(std::string_view filename, int lineno, std::string
 	log("%s:%d: Info: %s", filename, lineno, str);
 }
 
+// Logs how many debug messages have been suppressed so far, then resets the counter.
+void log_suppressed() {
+	if (!log_debug_suppressed || log_make_debug) return;
+	constexpr const char* summary_fmt = "<suppressed ~%d debug messages>\n";
+	logv_string(summary_fmt, stringf(summary_fmt, log_debug_suppressed));
+	log_debug_suppressed = 0;
+}
+
 [[noreturn]]
 static void log_error_with_prefix(std::string_view prefix, std::string str)
 {
@@ -345,7 +353,9 @@ static void log_error_with_prefix(std::string_view prefix, std::string str)
 	}
 
 	log_last_error = std::move(str);
-	log("%s%s", prefix, log_last_error);
+	std::string message(prefix);
+	message += log_last_error;
+	logv_string("%s%s", message);
 	log_flush();
 
 	log_make_debug = bak_log_make_debug;
@@ -355,7 +365,7 @@ static void log_error_with_prefix(std::string_view prefix, std::string str)
 			item.current_count++;
 
 	for (auto &[_, item] : log_expect_prefix_error)
-		if (std::regex_search(string(prefix) + string(log_last_error), item.pattern))
+		if (std::regex_search(message, item.pattern))
 			item.current_count++;
 
 	log_check_expected();
diff --git a/kernel/log.h b/kernel/log.h
index 63faf7091..d132ba1a0 100644
--- a/kernel/log.h
+++ b/kernel/log.h
@@ -206,12 +206,7 @@ template <typename... Args>
 	log_formatted_cmd_error(fmt.format(args...));
 }
 
-static inline void log_suppressed() {
-	if (log_debug_suppressed && !log_make_debug) {
-		log("<suppressed ~%d debug messages>\n", log_debug_suppressed);
-		log_debug_suppressed = 0;
-	}
-}
+void log_suppressed();
 
 struct LogMakeDebugHdl {
 	bool status = false;
diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index eef1c319d..54696e000 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -22,6 +22,7 @@
 #include "kernel/celltypes.h"
 #include "kernel/binding.h"
 #include "kernel/sigtools.h"
+#include "kernel/threading.h"
 #include "frontends/verilog/verilog_frontend.h"
 #include "frontends/verilog/preproc.h"
 #include "backends/rtlil/rtlil_backend.h"
@@ -142,9 +143,17 @@ static constexpr bool check_well_known_id_order()
 // and in sorted ascii order, as required by the ID macro.
 static_assert(check_well_known_id_order());
 
+constexpr int STATIC_ID_END = static_cast<int>(RTLIL::StaticId::STATIC_ID_END);
+
 struct IdStringCollector {
+	IdStringCollector(std::vector<MonotonicFlag> &live_ids)
+			: live_ids(live_ids) {}
+
 	void trace(IdString id) {
-		live.insert(id.index_);
+		if (id.index_ >= STATIC_ID_END)
+			live_ids[id.index_ - STATIC_ID_END].set();
+		else if (id.index_ < 0)
+			live_autoidx_ids.push_back(id.index_);
 	}
 	template <typename T> void trace(const T* v) {
 		trace(*v);
@@ -178,10 +187,6 @@ struct IdStringCollector {
 			trace(element);
 	}
 
-	void trace(const RTLIL::Design &design) {
-		trace_values(design.modules_);
-		trace(design.selection_vars);
-	}
 	void trace(const RTLIL::Selection &selection_var) {
 		trace(selection_var.selected_modules);
 		trace(selection_var.selected_members);
@@ -190,15 +195,6 @@ struct IdStringCollector {
 		trace_keys(named.attributes);
 		trace(named.name);
 	}
-	void trace(const RTLIL::Module &module) {
-		trace_named(module);
-		trace_values(module.wires_);
-		trace_values(module.cells_);
-		trace(module.avail_parameters);
-		trace_keys(module.parameter_default_values);
-		trace_values(module.memories);
-		trace_values(module.processes);
-	}
 	void trace(const RTLIL::Wire &wire) {
 		trace_named(wire);
 		if (wire.known_driver())
@@ -234,7 +230,8 @@ struct IdStringCollector {
 		trace(action.memid);
 	}
 
-	std::unordered_set<int> live;
+	std::vector<MonotonicFlag> &live_ids;
+	std::vector<int> live_autoidx_ids;
 };
 
 int64_t RTLIL::OwningIdString::gc_ns;
@@ -243,20 +240,55 @@ int RTLIL::OwningIdString::gc_count;
 void RTLIL::OwningIdString::collect_garbage()
 {
 	int64_t start = PerformanceTimer::query();
-	IdStringCollector collector;
-	for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) {
-		collector.trace(*design);
-	}
-	int size = GetSize(global_id_storage_);
-	for (int i = static_cast<int>(StaticId::STATIC_ID_END); i < size; ++i) {
-		RTLIL::IdString::Storage &storage = global_id_storage_.at(i);
-		if (storage.buf == nullptr)
-			continue;
-		if (collector.live.find(i) != collector.live.end())
-			continue;
-		if (global_refcount_storage_.find(i) != global_refcount_storage_.end())
-			continue;
 
+	int pool_size = 0;
+	for (auto &[idx, design] : *RTLIL::Design::get_all_designs())
+		for (RTLIL::Module *module : design->modules())
+			pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	ParallelDispatchThreadPool thread_pool(pool_size);
+
+	int size = GetSize(global_id_storage_);
+	std::vector<MonotonicFlag> live_ids(size - STATIC_ID_END);
+	std::vector<IdStringCollector> collectors;
+	int num_threads = thread_pool.num_threads();
+	collectors.reserve(num_threads);
+	for (int i = 0; i < num_threads; ++i)
+		collectors.emplace_back(live_ids);
+
+	for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) {
+		for (RTLIL::Module *module : design->modules()) {
+			collectors[0].trace_named(*module);
+			ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+			subpool.run([&collectors, module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+				for (int i : ctx.item_range(module->cells_size()))
+					collectors[ctx.thread_num].trace(module->cell_at(i));
+				for (int i : ctx.item_range(module->wires_size()))
+					collectors[ctx.thread_num].trace(module->wire_at(i));
+			});
+			collectors[0].trace(module->avail_parameters);
+			collectors[0].trace_keys(module->parameter_default_values);
+			collectors[0].trace_values(module->memories);
+			collectors[0].trace_values(module->processes);
+		}
+		collectors[0].trace(design->selection_vars);
+	}
+
+	ShardedVector<int> free_ids(thread_pool);
+	thread_pool.run([&live_ids, size, &free_ids](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(size - STATIC_ID_END)) {
+			int index = i + STATIC_ID_END;
+			RTLIL::IdString::Storage &storage = global_id_storage_.at(index);
+			if (storage.buf == nullptr)
+				continue;
+			if (live_ids[i].load())
+				continue;
+			if (global_refcount_storage_.find(index) != global_refcount_storage_.end())
+				continue;
+			free_ids.insert(ctx, index);
+		}
+	});
+	for (int i : free_ids) {
+		RTLIL::IdString::Storage &storage = global_id_storage_.at(i);
 		if (yosys_xtrace) {
 			log("#X# Removed IdString '%s' with index %d.\n", storage.buf, i);
 			log_backtrace("-X- ", yosys_xtrace-1);
@@ -268,8 +300,13 @@ void RTLIL::OwningIdString::collect_garbage()
 		global_free_idx_list_.push_back(i);
 	}
 
+	std::unordered_set<int> live_autoidx_ids;
+	for (IdStringCollector &collector : collectors)
+		for (int id : collector.live_autoidx_ids)
+			live_autoidx_ids.insert(id);
+
 	for (auto it = global_autoidx_id_storage_.begin(); it != global_autoidx_id_storage_.end();) {
-		if (collector.live.find(it->first) != collector.live.end()) {
+		if (live_autoidx_ids.find(it->first) != live_autoidx_ids.end()) {
 			++it;
 			continue;
 		}
@@ -1466,15 +1503,21 @@ void RTLIL::Design::sort_modules()
 	modules_.sort(sort_by_id_str());
 }
 
+void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool);
+
 void RTLIL::Design::check()
 {
 #ifndef NDEBUG
 	log_assert(!selection_stack.empty());
+	int pool_size = 0;
+	for (auto &it : modules_)
+		pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, it.second->cells_size(), 1000));
+	ParallelDispatchThreadPool thread_pool(pool_size);
 	for (auto &it : modules_) {
 		log_assert(this == it.second->design);
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
-		it.second->check();
+		check_module(it.second, thread_pool);
 	}
 #endif
 }
@@ -1710,11 +1753,11 @@ size_t RTLIL::Module::count_id(RTLIL::IdString id)
 namespace {
 	struct InternalCellChecker
 	{
-		RTLIL::Module *module;
+		const RTLIL::Module *module;
 		RTLIL::Cell *cell;
 		pool<RTLIL::IdString> expected_params, expected_ports;
 
-		InternalCellChecker(RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { }
+		InternalCellChecker(const RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { }
 
 		void error(int linenr)
 		{
@@ -2690,88 +2733,96 @@ void RTLIL::Module::sort()
 		it.second->attributes.sort(sort_by_id_str());
 }
 
-void RTLIL::Module::check()
+void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool)
 {
 #ifndef NDEBUG
-	std::vector<bool> ports_declared;
-	for (auto &it : wires_) {
-		log_assert(this == it.second->module);
-		log_assert(it.first == it.second->name);
-		log_assert(!it.first.empty());
-		log_assert(it.second->width >= 0);
-		log_assert(it.second->port_id >= 0);
-		for (auto &it2 : it.second->attributes)
-			log_assert(!it2.first.empty());
-		if (it.second->port_id) {
-			log_assert(GetSize(ports) >= it.second->port_id);
-			log_assert(ports.at(it.second->port_id-1) == it.first);
-			log_assert(it.second->port_input || it.second->port_output);
-			if (GetSize(ports_declared) < it.second->port_id)
-				ports_declared.resize(it.second->port_id);
-			log_assert(ports_declared[it.second->port_id-1] == false);
-			ports_declared[it.second->port_id-1] = true;
-		} else
-			log_assert(!it.second->port_input && !it.second->port_output);
-	}
-	for (auto port_declared : ports_declared)
-		log_assert(port_declared == true);
-	log_assert(GetSize(ports) == GetSize(ports_declared));
+	ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+	const RTLIL::Module *const_module = module;
 
-	for (auto &it : memories) {
+	pool<std::string> memory_strings;
+	for (auto &it : module->memories) {
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
 		log_assert(it.second->width >= 0);
 		log_assert(it.second->size >= 0);
 		for (auto &it2 : it.second->attributes)
 			log_assert(!it2.first.empty());
+		memory_strings.insert(it.second->name.str());
 	}
 
-	pool<IdString> packed_memids;
+	std::vector<MonotonicFlag> ports_declared(GetSize(module->ports));
+	ShardedVector<std::string> memids(subpool);
+	subpool.run([const_module, &ports_declared, &memory_strings, &memids](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			auto it = *const_module->cells_.element(i);
+			log_assert(const_module == it.second->module);
+			log_assert(it.first == it.second->name);
+			log_assert(!it.first.empty());
+			log_assert(!it.second->type.empty());
+			for (auto &it2 : it.second->connections()) {
+				log_assert(!it2.first.empty());
+				it2.second.check(const_module);
+			}
+			for (auto &it2 : it.second->attributes)
+				log_assert(!it2.first.empty());
+			for (auto &it2 : it.second->parameters)
+				log_assert(!it2.first.empty());
+			InternalCellChecker checker(const_module, it.second);
+			checker.check();
+			if (it.second->has_memid()) {
+				log_assert(memory_strings.count(it.second->parameters.at(ID::MEMID).decode_string()));
+			} else if (it.second->is_mem_cell()) {
+				std::string memid = it.second->parameters.at(ID::MEMID).decode_string();
+				log_assert(!memory_strings.count(memid));
+				memids.insert(ctx, std::move(memid));
+			}
+			auto cell_mod = const_module->design->module(it.first);
+			if (cell_mod != nullptr) {
+				// assertion check below to make sure that there are no
+				// cases where a cell has a blackbox attribute since
+				// that is deprecated
+				#ifdef __GNUC__
+				#pragma GCC diagnostic push
+				#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+				#endif
+				log_assert(!it.second->get_blackbox_attribute());
+				#ifdef __GNUC__
+				#pragma GCC diagnostic pop
+				#endif
+			}
+		}
 
-	for (auto &it : cells_) {
-		log_assert(this == it.second->module);
-		log_assert(it.first == it.second->name);
-		log_assert(!it.first.empty());
-		log_assert(!it.second->type.empty());
-		for (auto &it2 : it.second->connections()) {
-			log_assert(!it2.first.empty());
-			it2.second.check(this);
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			auto it = *const_module->wires_.element(i);
+			log_assert(const_module == it.second->module);
+			log_assert(it.first == it.second->name);
+			log_assert(!it.first.empty());
+			log_assert(it.second->width >= 0);
+			log_assert(it.second->port_id >= 0);
+			for (auto &it2 : it.second->attributes)
+				log_assert(!it2.first.empty());
+			if (it.second->port_id) {
+				log_assert(GetSize(const_module->ports) >= it.second->port_id);
+				log_assert(const_module->ports.at(it.second->port_id-1) == it.first);
+				log_assert(it.second->port_input || it.second->port_output);
+				log_assert(it.second->port_id <= GetSize(ports_declared));
+				bool previously_declared = ports_declared[it.second->port_id-1].set_and_return_old();
+				log_assert(previously_declared == false);
+			} else
+				log_assert(!it.second->port_input && !it.second->port_output);
 		}
-		for (auto &it2 : it.second->attributes)
-			log_assert(!it2.first.empty());
-		for (auto &it2 : it.second->parameters)
-			log_assert(!it2.first.empty());
-		InternalCellChecker checker(this, it.second);
-		checker.check();
-		if (it.second->has_memid()) {
-			log_assert(memories.count(it.second->parameters.at(ID::MEMID).decode_string()));
-		} else if (it.second->is_mem_cell()) {
-			IdString memid = it.second->parameters.at(ID::MEMID).decode_string();
-			log_assert(!memories.count(memid));
-			log_assert(!packed_memids.count(memid));
-			packed_memids.insert(memid);
-		}
-		auto cell_mod = design->module(it.first);
-		if (cell_mod != nullptr) {
-			// assertion check below to make sure that there are no
-			// cases where a cell has a blackbox attribute since
-			// that is deprecated
-			#ifdef __GNUC__
-			#pragma GCC diagnostic push
-			#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-			#endif
-			log_assert(!it.second->get_blackbox_attribute());
-			#ifdef __GNUC__
-			#pragma GCC diagnostic pop
-			#endif
-		}
-	}
+	});
+	for (const MonotonicFlag &port_declared : ports_declared)
+		log_assert(port_declared.load() == true);
+	pool<std::string> memids_pool;
+	for (std::string &memid : memids)
+		log_assert(memids_pool.insert(memid).second);
 
-	for (auto &it : processes) {
+	for (auto &it : module->processes) {
 		log_assert(it.first == it.second->name);
 		log_assert(!it.first.empty());
 		log_assert(it.second->root_case.compare.empty());
-		std::vector<CaseRule*> all_cases = {&it.second->root_case};
+		std::vector<RTLIL::CaseRule*> all_cases = {&it.second->root_case};
 		for (size_t i = 0; i < all_cases.size(); i++) {
 			for (auto &switch_it : all_cases[i]->switches) {
 				for (auto &case_it : switch_it->cases) {
@@ -2784,34 +2835,41 @@ void RTLIL::Module::check()
 		}
 		for (auto &sync_it : it.second->syncs) {
 			switch (sync_it->type) {
-				case SyncType::ST0:
-				case SyncType::ST1:
-				case SyncType::STp:
-				case SyncType::STn:
-				case SyncType::STe:
+				case RTLIL::SyncType::ST0:
+				case RTLIL::SyncType::ST1:
+				case RTLIL::SyncType::STp:
+				case RTLIL::SyncType::STn:
+				case RTLIL::SyncType::STe:
 					log_assert(!sync_it->signal.empty());
 					break;
-				case SyncType::STa:
-				case SyncType::STg:
-				case SyncType::STi:
+				case RTLIL::SyncType::STa:
+				case RTLIL::SyncType::STg:
+				case RTLIL::SyncType::STi:
 					log_assert(sync_it->signal.empty());
 					break;
 			}
 		}
 	}
 
-	for (auto &it : connections_) {
+	for (auto &it : module->connections_) {
 		log_assert(it.first.size() == it.second.size());
 		log_assert(!it.first.has_const());
-		it.first.check(this);
-		it.second.check(this);
+		it.first.check(module);
+		it.second.check(module);
 	}
 
-	for (auto &it : attributes)
+	for (auto &it : module->attributes)
 		log_assert(!it.first.empty());
 #endif
 }
 
+// Checks this module alone, using a thread pool that lives only for the duration of this call.
+void RTLIL::Module::check() {
+#ifndef NDEBUG  // check_module() is empty under NDEBUG; don't start threads for nothing
+	ParallelDispatchThreadPool thread_pool(ThreadPool::work_pool_size(0, cells_size(), 1000));
+	check_module(this, thread_pool);
+#endif
+}
 void RTLIL::Module::optimize()
 {
 }
@@ -5470,7 +5528,7 @@ RTLIL::SigSpec RTLIL::SigSpec::repeat(int num) const
 }
 
 #ifndef NDEBUG
-void RTLIL::SigSpec::check(Module *mod) const
+void RTLIL::SigSpec::check(const Module *mod) const
 {
 	if (rep_ == CHUNK)
 	{
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index fea53081e..9ecee8942 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -275,6 +275,17 @@ struct RTLIL::IdString
 		*out += std::to_string(-index_);
 	}
 
+	// Returns the name without its leading backslash, unless the next character is '$', '\\' or a digit.
+	std::string unescape() const {
+		if (index_ < 0) // Must start with "$auto$" so no unescaping required.
+			return str();
+		std::string_view name = global_id_storage_.at(index_).str_view();
+		const bool escaped = name.size() >= 2 && name[0] == '\\';
+		if (escaped && name[1] != '$' && name[1] != '\\' && !(name[1] >= '0' && name[1] <= '9'))
+			name.remove_prefix(1);
+		return std::string(name);
+	}
+
 	class Substrings {
 		std::string_view first_;
 		int suffix_number;
@@ -758,7 +769,7 @@ namespace RTLIL {
 	}
 
 	static inline std::string unescape_id(RTLIL::IdString str) {
-		return unescape_id(str.str());
+		return str.unescape();
 	}
 
 	static inline const char *id2cstr(RTLIL::IdString str) {
@@ -1748,9 +1759,9 @@ public:
 	}
 
 #ifndef NDEBUG
-	void check(Module *mod = nullptr) const;
+	void check(const Module *mod = nullptr) const;
 #else
-	void check(Module *mod = nullptr) const { (void)mod; }
+	void check(const Module *mod = nullptr) const { (void)mod; }
 #endif
 };
 
diff --git a/kernel/threading.cc b/kernel/threading.cc
index dcc044c89..3766c4ddf 100644
--- a/kernel/threading.cc
+++ b/kernel/threading.cc
@@ -17,6 +17,20 @@ static int get_max_threads()
 	return max_threads;
 }
 
+// Reads YOSYS_WORK_UNITS_PER_THREAD from the environment; yields 0 when it is unset.
+// The text is converted with atoi(), so a value without leading digits also yields 0.
+static int init_work_units_per_thread_override()
+{
+	const char *env_value = getenv("YOSYS_WORK_UNITS_PER_THREAD");
+	return env_value != nullptr ? atoi(env_value) : 0;
+}
+
+// Cached accessor: the environment is consulted only once, on the first call.
+static int get_work_units_per_thread_override() {
+	static const int cached_override = init_work_units_per_thread_override();
+	return cached_override;
+}
+
 void DeferredLogs::flush()
 {
 	for (auto &m : logs)
@@ -37,6 +51,14 @@ int ThreadPool::pool_size(int reserved_cores, int max_worker_threads)
 #endif
 }
 
+// Sizes a pool for `work_units` items; a non-positive `work_units_per_thread` is clamped to 1 so the division is defined.
+int ThreadPool::work_pool_size(int reserved_cores, int work_units, int work_units_per_thread)
+{
+	if (int override_units = get_work_units_per_thread_override(); override_units > 0) // testing override from the environment
+		work_units_per_thread = override_units;
+	return pool_size(reserved_cores, work_units / std::max(1, work_units_per_thread));
+}
+
 ThreadPool::ThreadPool(int pool_size, std::function<void(int)> b)
 	: body(std::move(b))
 {
@@ -57,4 +79,72 @@ ThreadPool::~ThreadPool()
 #endif
 }
 
+// Splits [0, num_items) into `num_threads` contiguous chunks and returns chunk number `thread_num`.
+IntRange item_range_for_worker(int num_items, int thread_num, int num_threads)
+{
+	if (num_threads <= 1)
+		return {0, num_items};
+	const int base = num_items / num_threads;
+	const int remainder = num_items % num_threads;
+	// Chunks with an index below `remainder` hold `base + 1` items; the others hold `base`.
+	const int first = thread_num * base + std::min(thread_num, remainder);
+	const int count = base + (thread_num < remainder ? 1 : 0);
+	return {first, first + count};
+}
+
+ParallelDispatchThreadPool::ParallelDispatchThreadPool(int pool_size)
+		: num_worker_threads_(std::max(1, pool_size) - 1) // one less than the pool size: run() also executes work on the calling thread
+{
+#ifdef YOSYS_ENABLE_THREADS
+	main_to_workers_signal.resize(num_worker_threads_, 0); // one start flag per worker thread, initially clear
+#endif
+	// Started here rather than in the initializer list so every data member is constructed before a worker can run.
+	thread_pool = std::make_unique<ThreadPool>(num_worker_threads_, [this](int thread_num){
+		run_worker(thread_num);
+	});
+}
+
+ParallelDispatchThreadPool::~ParallelDispatchThreadPool() // asks every worker to exit and waits until all have acknowledged
+{
+#ifdef YOSYS_ENABLE_THREADS
+	if (num_worker_threads_ == 0)
+		return; // no worker threads were created
+	current_work = nullptr; // run_worker() leaves its loop when it wakes up and finds no work
+	num_active_worker_threads_ = num_worker_threads_; // wake and wait for all workers, not only those used by the last run()
+	signal_workers_start();
+	wait_for_workers_done(); // each worker signals once more after leaving its loop
+#endif
+}
+
+void ParallelDispatchThreadPool::run(std::function<void(const RunCtx &)> work, int max_threads) // runs `work` once per participating thread and waits for all of them
+{
+	Multithreading multithreading; // NOTE(review): defined outside this view; presumably marks this scope as multithreaded — confirm
+	num_active_worker_threads_ = num_threads(max_threads) - 1; // the calling thread is one of the participants
+	if (num_active_worker_threads_ == 0) {
+		work({{0}, 1}); // single-threaded case: thread 0 of 1, no signalling required
+		return;
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	current_work = &work; // workers call through this pointer; `work` stays alive until the wait below returns
+	signal_workers_start();
+	work({{0}, num_active_worker_threads_ + 1}); // the calling thread takes part as thread 0
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run_worker(int thread_num) // body of worker `thread_num`: repeatedly waits for a start signal and runs the current work
+{
+#ifdef YOSYS_ENABLE_THREADS
+	while (true)
+	{
+		worker_wait_for_start(thread_num);
+		if (current_work == nullptr)
+			break; // the destructor signals shutdown by starting the workers with no work set
+		(*current_work)({{thread_num + 1}, num_active_worker_threads_ + 1}); // index 0 belongs to the thread that called run()
+		signal_worker_done();
+	}
+	signal_worker_done(); // final acknowledgement, awaited by the destructor
+#endif
+}
+
 YOSYS_NAMESPACE_END
diff --git a/kernel/threading.h b/kernel/threading.h
index b8cd62f87..3d6495720 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -131,6 +131,11 @@ public:
 	// The result may be 0.
 	static int pool_size(int reserved_cores, int max_worker_threads);
 
+	// Computes the number of worker threads to use, by dividing work_units among threads.
+	// For testing purposes you can set YOSYS_WORK_UNITS_PER_THREAD to override `work_units_per_thread`.
+	// The result may be 0.
+	static int work_pool_size(int reserved_cores, int work_units, int work_units_per_thread);
+
 	// Create a pool of threads running the given closure (parameterized by thread number).
 	// `pool_size` must be the result of a `pool_size()` call.
 	ThreadPool(int pool_size, std::function<void(int)> b);
@@ -154,6 +159,140 @@ private:
 #endif
 };
 
+// Half-open integer interval [start_, end_). Provides begin()/end() so that, when start_ <= end_,
+// a range-based for loop visits start_, start_ + 1, ..., end_ - 1.
+struct IntRange {
+	int start_;
+	int end_;
+	// Minimal iterator: wraps the current integer value.
+	struct Int {
+		int v;
+		int operator*() const { return v; }
+		Int &operator++() { v += 1; return *this; }
+		bool operator!=(const Int &rhs) const { return !(v == rhs.v); }
+	};
+	Int begin() const { return Int{start_}; }
+	Int end() const { return Int{end_}; }
+	bool operator==(const IntRange &rhs) const { return end_ == rhs.end_ && start_ == rhs.start_; }
+	bool operator!=(const IntRange &rhs) const { return start_ != rhs.start_ || end_ != rhs.end_; }
+};
+// Divides some number of items into `num_threads` subranges and returns the
+// `thread_num`'th subrange. If `num_threads` is at most one, returns the whole range.
+IntRange item_range_for_worker(int num_items, int thread_num, int num_threads);
+
+// Identifies one thread in some list of threads by its index. Wrapping the int in a struct
+// gives stronger typechecking and more readable signatures than passing a bare integer.
+struct ThreadIndex {
+	int thread_num;
+};
+
+// A set of threads with a `run()` API that runs a closure on all of the threads
+// and waits for all those closures to complete. This is a convenient way to implement
+// parallel algorithms that use barrier synchronization.
+class ParallelDispatchThreadPool
+{
+public:
+	// Create a pool of `pool_size` threads in total. The thread that calls `run()` counts as
+	// one of them, so `pool_size - 1` worker threads are started.
+	// `pool_size` must be the result of a `pool_size()` call; zero is treated as 1.
+	ParallelDispatchThreadPool(int pool_size);
+	~ParallelDispatchThreadPool();
+
+	// For each thread running a closure, a `RunCtx` is passed to the closure. Currently
+	// it contains the thread index and the total number of threads. It can be passed
+	// directly to any APIs requiring a `ThreadIndex`.
+	struct RunCtx : public ThreadIndex {
+		int num_threads;
+		IntRange item_range(int num_items) const {
+			return item_range_for_worker(num_items, thread_num, num_threads);
+		}
+	};
+	// Sometimes we only want to activate a subset of the threads in the pool. This
+	// class provides a way to do that. It provides the same `num_threads()`
+	// and `run()` APIs as a `ParallelDispatchThreadPool`.
+	class Subpool {
+	public:
+		Subpool(ParallelDispatchThreadPool &parent, int max_threads)
+				: parent(parent), max_threads(max_threads) {}
+		// Returns the number of threads that will be used when calling `run()`.
+		int num_threads() const {
+			return parent.num_threads(max_threads);
+		}
+		void run(std::function<void(const RunCtx &)> work) {
+			parent.run(std::move(work), max_threads);
+		}
+		ParallelDispatchThreadPool &thread_pool() { return parent; }
+	private:
+		ParallelDispatchThreadPool &parent;
+		int max_threads;
+	};
+
+	// Run the `work` function in parallel on each thread in the pool (parameterized by
+	// thread number). Waits for all work functions to complete. Only one `run()` can be
+	// active at a time.
+	// Uses every thread in the pool; go through a `Subpool` to use fewer.
+	void run(std::function<void(const RunCtx &)> work) {
+		run(std::move(work), INT_MAX);
+	}
+
+	// Returns the number of threads that will be used when calling `run()`.
+	int num_threads() const {
+		return num_threads(INT_MAX);
+	}
+private:
+	friend class Subpool;
+
+	void run(std::function<void(const RunCtx &)> work, int max_threads); // uses no more than `max_threads` threads (but at least one)
+	int num_threads(int max_threads) const {
+		return std::min(num_worker_threads_ + 1, std::max(1, max_threads));
+	}
+	void run_worker(int thread_num);
+
+	std::unique_ptr<ThreadPool> thread_pool;
+	std::function<void(const RunCtx &)> *current_work = nullptr;
+	// Keeps a correct count even when threads are exiting.
+	int num_worker_threads_;
+	// The count of active worker threads for the current `run()`.
+	int num_active_worker_threads_ = 0;
+
+#ifdef YOSYS_ENABLE_THREADS
+	// Not especially efficient for large numbers of threads. Worker wakeup could scale
+	// better by conceptually organising workers into a tree and having workers wake
+	// up their children.
+	std::mutex main_to_workers_signal_mutex;
+	std::condition_variable main_to_workers_signal_cv;
+	std::vector<uint8_t> main_to_workers_signal;
+	void signal_workers_start() {
+		std::unique_lock lock(main_to_workers_signal_mutex);
+		std::fill(main_to_workers_signal.begin(), main_to_workers_signal.begin() + num_active_worker_threads_, 1);
+		// When `num_active_worker_threads_` is small compared to `num_worker_threads_`, we have a "thundering herd"
+		// problem here. Fixing that would add complexity so don't worry about it for now.
+		main_to_workers_signal_cv.notify_all();
+	}
+	void worker_wait_for_start(int thread_num) {
+		std::unique_lock lock(main_to_workers_signal_mutex);
+		main_to_workers_signal_cv.wait(lock, [this, thread_num] { return main_to_workers_signal[thread_num] > 0; });
+		main_to_workers_signal[thread_num] = 0;
+	}
+
+	std::atomic<int> done_workers = 0;
+	std::mutex workers_to_main_signal_mutex;
+	std::condition_variable workers_to_main_signal_cv;
+	void signal_worker_done() {
+		// Count and notify under the mutex: once the main thread sees the final count it may
+		// start another `run()` or destroy the pool, so this worker must be finished with both.
+		std::unique_lock lock(workers_to_main_signal_mutex);
+		if (done_workers.fetch_add(1, std::memory_order_release) + 1 == num_active_worker_threads_)
+			workers_to_main_signal_cv.notify_all();
+	}
+	void wait_for_workers_done() {
+		std::unique_lock lock(workers_to_main_signal_mutex);
+		workers_to_main_signal_cv.wait(lock, [this] { return done_workers.load(std::memory_order_acquire) == num_active_worker_threads_; });
+		done_workers.store(0, std::memory_order_relaxed);
+	}
+#endif
+};
+
 template <class T>
 class ConcurrentStack
 {
@@ -181,6 +320,373 @@ private:
 	std::vector<T> contents;
 };
 
+// A vector that is sharded into buckets, one per thread. This lets multiple threads write
+// efficiently to the vector without synchronization overhead. After all writers have
+// finished writing, the vector can be iterated over. The iteration order is deterministic:
+// all the elements written by thread 0 in the order it inserted them, followed by all elements
+// written by thread 1, etc.
+template <typename T>
+class ShardedVector {
+public:
+	ShardedVector(const ParallelDispatchThreadPool &thread_pool) {
+		init(thread_pool.num_threads());
+	}
+	ShardedVector(const ParallelDispatchThreadPool::Subpool &thread_pool) {
+		init(thread_pool.num_threads());
+	}
+
+	// Insert a value, passing the `ThreadIndex` of the writer thread.
+	// Parallel inserts with different `ThreadIndex` values are fine.
+	// Inserts must not run concurrently with any other methods (e.g.
+	// iteration or `empty()`).
+	void insert(const ThreadIndex &thread, T value) {
+		buckets[thread.thread_num].emplace_back(std::move(value));
+	}
+
+	bool empty() const { // True when every per-thread bucket is empty.
+		for (const std::vector<T> &bucket : buckets)
+			if (!bucket.empty())
+				return false;
+		return true;
+	}
+
+	using Buckets = std::vector<std::vector<T>>;
+	class iterator { // Walks bucket 0 front to back, then bucket 1, and so on, skipping empty buckets.
+	public:
+		iterator(typename Buckets::iterator first_bucket, typename Buckets::iterator last_bucket)
+			: bucket_it(std::move(first_bucket)), bucket_end(std::move(last_bucket)) {
+			if (bucket_it != bucket_end) // Reads the members; the parameters are not touched after being moved from.
+				inner_it = bucket_it->begin();
+			normalize();
+		}
+		T& operator*() const { return *inner_it.value(); } // `value()` throws on the end iterator, whose `inner_it` is empty.
+		iterator &operator++() {
+			++*inner_it;
+			normalize();
+			return *this;
+		}
+		bool operator!=(const iterator &other) const {
+			return bucket_it != other.bucket_it || inner_it != other.inner_it;
+		}
+	private:
+		void normalize() { // Advance past exhausted buckets; `inner_it` is emptied once all buckets are consumed.
+			if (bucket_it == bucket_end)
+				return;
+			while (inner_it == bucket_it->end()) {
+				++bucket_it;
+				if (bucket_it == bucket_end) {
+					inner_it.reset();
+					return;
+				}
+				inner_it = bucket_it->begin();
+			}
+		}
+		std::optional<typename std::vector<T>::iterator> inner_it; // Empty exactly when `bucket_it == bucket_end`.
+		typename Buckets::iterator bucket_it;
+		typename Buckets::iterator bucket_end;
+	};
+	iterator begin() { return iterator(buckets.begin(), buckets.end()); }
+	iterator end() { return iterator(buckets.end(), buckets.end()); }
+private:
+	void init(int num_threads) { // One bucket per thread.
+		buckets.resize(num_threads);
+	}
+	Buckets buckets;
+};
+
+template <typename V>
+struct DefaultCollisionHandler {
+	void operator()(typename V::Accumulated &, typename V::Accumulated &) const {} // No-op: neither argument is modified.
+};
+
+// A hashtable that can be efficiently built in parallel and then looked up concurrently.
+// `V` is the type of elements that will be added to the hashtable. It must have a
+// member type `Accumulated` representing the combination of multiple `V` elements. This
+// can be the same as `V`, but for example `V` could contain a Wire* and `V::Accumulated`
+// could contain a `pool<Wire*>`. `KeyEquality` is a class containing an `operator()` that
+// returns true if two `V::Accumulated` elements have equal keys.
+// `CollisionHandler` is used to reduce two `V::Accumulated` values into a single value.
+//
+// To use this, first construct a `Builder` and fill it in (in parallel), then construct
+// a `ShardedHashSet` from the `Builder`.
+template <typename V, typename KeyEquality, typename CollisionHandler = DefaultCollisionHandler<V>>
+class ShardedHashSet {
+public:
+	// A combination of a `V` and its hash value.
+	struct Value {
+		Value(V value, unsigned int hash) : value(std::move(value)), hash(hash) {}
+		Value(Value &&) = default;
+		Value(const Value &) = delete;
+		Value &operator=(const Value &) = delete;
+		V value;
+		unsigned int hash;
+	};
+	// A combination of a `V::Accumulated` and its hash value.
+	struct AccumulatedValue {
+		AccumulatedValue(typename V::Accumulated value, unsigned int hash) : value(std::move(value)), hash(hash) {}
+		AccumulatedValue(AccumulatedValue &&) = default;
+#if defined(_MSC_VER)
+		AccumulatedValue(const AccumulatedValue &) {
+			log_error("Copy constructor called on AccumulatedValue");
+		}
+		AccumulatedValue &operator=(const AccumulatedValue &) {
+			log_error("Copy assignment called on AccumulatedValue");
+			return *this;
+		}
+#else
+		AccumulatedValue(const AccumulatedValue &) = delete;
+		AccumulatedValue &operator=(const AccumulatedValue &) = delete;
+#endif
+		typename V::Accumulated value;
+		unsigned int hash;
+	};
+	// A class containing an `operator()` that returns true if two `AccumulatedValue`
+	// elements have equal keys.
+	// Required to insert `AccumulatedValue`s into an `std::unordered_set`.
+	struct AccumulatedValueEquality {
+		KeyEquality inner;
+		AccumulatedValueEquality(const KeyEquality &inner) : inner(inner) {}
+		bool operator()(const AccumulatedValue &v1, const AccumulatedValue &v2) const {
+			return inner(v1.value, v2.value);
+		}
+	};
+	// A class containing an `operator()` that returns the hash value of an `AccumulatedValue`.
+	// Required to insert `AccumulatedValue`s into an `std::unordered_set`.
+	struct AccumulatedValueHashOp {
+		size_t operator()(const AccumulatedValue &v) const {
+			return static_cast<size_t>(v.hash);
+		}
+	};
+	using Shard = std::unordered_set<AccumulatedValue, AccumulatedValueHashOp, AccumulatedValueEquality>;
+
+	// First construct one of these. Then populate it in parallel by calling `insert()` from many threads.
+	// Then do another parallel phase calling `process()` from many threads.
+	class Builder {
+	public:
+		Builder(const ParallelDispatchThreadPool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler())
+				: collision_handler(std::move(collision_handler)) {
+			init(thread_pool.num_threads(), std::move(equality));
+		}
+		Builder(const ParallelDispatchThreadPool::Subpool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler())
+				: collision_handler(std::move(collision_handler)) {
+			init(thread_pool.num_threads(), std::move(equality));
+		}
+		// First call `insert` to insert all elements. All inserts must finish
+		// before calling any `process()`.
+		void insert(const ThreadIndex &thread, Value v) {
+			// You might think that for the single-threaded case, we can optimize by
+			// inserting directly into the `std::unordered_set` here. But that slows things down
+			// a lot and I never got around to figuring out why.
+			std::vector<std::vector<Value>> &buckets = all_buckets[thread.thread_num];
+			size_t bucket = static_cast<size_t>(v.hash) % buckets.size(); // Destination shard, chosen by hash.
+			buckets[bucket].emplace_back(std::move(v));
+		}
+		// Then call `process` for each thread. All `process()`s must finish before using
+		// the `Builder` to construct a `ShardedHashSet`.
+		void process(const ThreadIndex &thread) {
+			int size = 0;
+			for (std::vector<std::vector<Value>> &buckets : all_buckets)
+				size += GetSize(buckets[thread.thread_num]);
+			Shard &shard = shards[thread.thread_num];
+			shard.reserve(size);
+			for (std::vector<std::vector<Value>> &buckets : all_buckets) {
+				for (Value &value : buckets[thread.thread_num])
+					accumulate(value, shard);
+				// Free as much memory as we can during the parallel phase.
+				std::vector<Value>().swap(buckets[thread.thread_num]);
+			}
+		}
+	private:
+		friend class ShardedHashSet<V, KeyEquality, CollisionHandler>;
+		void accumulate(Value &value, Shard &shard) {
+			// With C++20 we could make this more efficient using heterogeneous lookup
+			AccumulatedValue accumulated_value{std::move(value.value), value.hash};
+			auto [it, inserted] = shard.insert(std::move(accumulated_value));
+			if (!inserted) // NOTE(review): reads `accumulated_value` after std::move; assumes a rejected insert() leaves it intact - confirm.
+				collision_handler(const_cast<typename V::Accumulated &>(it->value), accumulated_value.value);
+		}
+		void init(int num_threads, KeyEquality equality) {
+			all_buckets.resize(num_threads);
+			for (std::vector<std::vector<Value>> &buckets : all_buckets)
+				buckets.resize(num_threads);
+			for (int i = 0; i < num_threads; ++i) // Every shard gets a copy of the same equality functor.
+				shards.emplace_back(0, AccumulatedValueHashOp(), AccumulatedValueEquality(equality));
+		}
+		const CollisionHandler collision_handler;
+		std::vector<std::vector<std::vector<Value>>> all_buckets;
+		std::vector<Shard> shards;
+	};
+
+	// Then finally construct the hashtable:
+	ShardedHashSet(Builder &builder) : shards(std::move(builder.shards)) {
+		// Check that all necessary 'process()' calls were made.
+		for (std::vector<std::vector<Value>> &buckets : builder.all_buckets)
+			for (std::vector<Value> &bucket : buckets)
+				log_assert(bucket.empty());
+		// Free memory.
+		std::vector<std::vector<std::vector<Value>>>().swap(builder.all_buckets);
+	}
+	ShardedHashSet(ShardedHashSet &&other) = default;
+	ShardedHashSet() {}
+
+	ShardedHashSet &operator=(ShardedHashSet &&other) = default;
+
+	// Look up by `AccumulatedValue`. If we switch to C++20 then we could use
+	// heterogeneous lookup to support looking up by `Value` here. Returns nullptr
+	// if the key is not found.
+	const typename V::Accumulated *find(const AccumulatedValue &v) const {
+		size_t num_shards = shards.size();
+		if (num_shards == 0)
+			return nullptr;
+		size_t shard = static_cast<size_t>(v.hash) % num_shards;
+		auto it = shards[shard].find(v);
+		if (it == shards[shard].end())
+			return nullptr;
+		return &it->value;
+	}
+
+	// Insert an element into the table. The caller is responsible for ensuring this does not
+	// happen concurrently with any other method calls.
+	void insert(AccumulatedValue v) {
+		size_t num_shards = shards.size();
+		if (num_shards == 0)
+			return; // No shards (e.g. a default-constructed set): the value is dropped.
+		size_t shard = static_cast<size_t>(v.hash) % num_shards;
+		shards[shard].insert(std::move(v)); // Must move: `AccumulatedValue` is not meant to be copied.
+	}
+
+	// Call this for each shard to implement parallel destruction. For very large `ShardedHashSet`s,
+	// deleting all elements of all shards on a single thread can be a performance bottleneck.
+	void clear(const ThreadIndex &shard) {
+		AccumulatedValueEquality equality = shards[shard.thread_num].key_eq(); // Own shard only, so parallel calls never touch another shard.
+		shards[shard.thread_num] = Shard(0, AccumulatedValueHashOp(), equality);
+	}
+private:
+	std::vector<Shard> shards;
+};
+
+// A concurrent work-queue that can share batches of work across threads.
+// Uses a naive implementation of work-stealing.
+template <typename T>
+class ConcurrentWorkQueue {
+public:
+	// Create a queue that supports the given number of threads and
+	// groups work into `batch_size` units.
+	ConcurrentWorkQueue(int num_threads, int batch_size = 100)
+		: batch_size(batch_size), thread_states(num_threads) {}
+	int num_threads() const { return GetSize(thread_states); }
+	// Push some work to do. Pushes and pops with the same `thread` must
+	// not happen concurrently.
+	void push(const ThreadIndex &thread, T work) {
+		ThreadState &thread_state = thread_states[thread.thread_num];
+		thread_state.next_batch.emplace_back(std::move(work));
+		if (GetSize(thread_state.next_batch) < batch_size)
+			return; // Batch not full yet; it stays private to this thread.
+		bool was_empty;
+		{
+			std::unique_lock lock(thread_state.batches_lock);
+			was_empty = thread_state.batches.empty();
+			thread_state.batches.push_back(std::move(thread_state.next_batch));
+		}
+		if (was_empty) { // Only the empty -> non-empty transition wakes a waiter.
+			std::unique_lock lock(waiters_lock);
+			if (num_waiters > 0) {
+				waiters_cv.notify_one();
+			}
+		}
+	}
+	// Grab some work to do.
+	// If all threads enter `pop_batch()`, then instead of deadlocking the
+	// queue will return no work. That is the only case in which it will
+	// return no work.
+	std::vector<T> pop_batch(const ThreadIndex &thread) {
+		ThreadState &thread_state = thread_states[thread.thread_num];
+		if (!thread_state.next_batch.empty())
+			return std::move(thread_state.next_batch); // A member, not a local, so the explicit move is needed here.
+		// Empty our own work queue first.
+		{
+			std::unique_lock lock(thread_state.batches_lock);
+			if (!thread_state.batches.empty()) {
+				std::vector<T> batch = std::move(thread_state.batches.back());
+				thread_state.batches.pop_back();
+				return batch;
+			}
+		}
+		// From here on in this function, our work queue is empty.
+		while (true) {
+			std::vector<T> batch = try_steal(thread);
+			if (!batch.empty()) {
+				return batch; // Plain return of a local: lets the compiler elide the copy/move.
+			}
+			// Termination: if all threads run out of work, then all of
+			// them will eventually enter this loop and there will be no further
+			// notifications on waiters_cv, so all will eventually increment
+			// num_waiters and wait, so num_waiters == num_threads()
+			// will become true.
+			std::unique_lock lock(waiters_lock);
+			++num_waiters;
+			if (num_waiters == num_threads()) {
+				waiters_cv.notify_all();
+				return {};
+			}
+			// As above, it's possible that we'll wait here even when there
+			// are work batches posted by other threads. That's OK.
+			waiters_cv.wait(lock);
+			if (num_waiters == num_threads())
+				return {};
+			--num_waiters;
+		}
+	}
+private:
+	std::vector<T> try_steal(const ThreadIndex &thread) { // Returns an empty vector if no other thread has a full batch queued.
+		for (int i = 1; i < num_threads(); i++) {
+			int other_thread_num = (thread.thread_num + i) % num_threads();
+			ThreadState &other_thread_state = thread_states[other_thread_num];
+			std::unique_lock lock(other_thread_state.batches_lock);
+			if (!other_thread_state.batches.empty()) {
+				std::vector<T> batch = std::move(other_thread_state.batches.front());
+				other_thread_state.batches.pop_front();
+				return batch;
+			}
+		}
+		return {};
+	}
+
+	int batch_size;
+
+	struct ThreadState {
+		// Entirely thread-local.
+		std::vector<T> next_batch;
+
+		std::mutex batches_lock;
+		// Only the associated thread ever adds to this, and only at the back.
+		// Other threads can remove elements from the front.
+		std::deque<std::vector<T>> batches;
+	};
+	std::vector<ThreadState> thread_states;
+
+	std::mutex waiters_lock;
+	std::condition_variable waiters_cv;
+	// Number of threads waiting for work. Their queues are empty.
+	int num_waiters = 0;
+};
+
+// A monotonic flag. Starts false, and can be set to true in a thread-safe way.
+// Once `load()` returns true, it will always return true.
+// Uses relaxed atomics so there are no memory ordering guarantees. Do not use this
+// to guard access to shared memory.
+class MonotonicFlag {
+public:
+	MonotonicFlag() : value(false) {}
+	bool load() const { return value.load(std::memory_order_relaxed); }
+	void set() { value.store(true, std::memory_order_relaxed); }
+	bool set_and_return_old() { // Sets the flag and returns the value it held before this call.
+		return value.exchange(true, std::memory_order_relaxed);
+	}
+private:
+	std::atomic<bool> value; // After construction, only ever written with `true`.
+};
+
 YOSYS_NAMESPACE_END
 
 #endif // YOSYS_THREADING_H
diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc
index f1d21435c..7c2377b10 100644
--- a/passes/opt/opt_clean.cc
+++ b/passes/opt/opt_clean.cc
@@ -22,6 +22,7 @@
 #include "kernel/log.h"
 #include "kernel/celltypes.h"
 #include "kernel/ffinit.h"
+#include "kernel/threading.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <set>
@@ -33,47 +34,120 @@ using RTLIL::id2cstr;
 
 struct keep_cache_t
 {
-	Design *design;
-	dict<Module*, bool> cache;
-	bool purge_mode = false;
+	dict<Module*, bool> keep_modules;
+	bool purge_mode;
 
-	void reset(Design *design = nullptr, bool purge_mode = false)
-	{
-		this->design = design;
-		this->purge_mode = purge_mode;
-		cache.clear();
-	}
+	keep_cache_t(bool purge_mode, ParallelDispatchThreadPool &thread_pool, const std::vector<RTLIL::Module *> &selected_modules)
+			: purge_mode(purge_mode) {
 
-	bool query(Module *module)
-	{
-		log_assert(design != nullptr);
-
-		if (module == nullptr)
-			return false;
-
-		if (cache.count(module))
-			return cache.at(module);
-
-		cache[module] = true;
-		if (!module->get_bool_attribute(ID::keep)) {
-		    bool found_keep = false;
-		    for (auto cell : module->cells())
-			if (query(cell, true /* ignore_specify */)) {
-			    found_keep = true;
-			    break;
-			}
-		    for (auto wire : module->wires())
-			if (wire->get_bool_attribute(ID::keep)) {
-			    found_keep = true;
-			    break;
-			}
-		    cache[module] = found_keep;
+		std::vector<RTLIL::Module *> scan_modules_worklist;
+		dict<RTLIL::Module *, std::vector<RTLIL::Module*>> dependents;
+		std::vector<RTLIL::Module *> propagate_kept_modules_worklist;
+		for (RTLIL::Module *module : selected_modules) {
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, thread_pool, dependents, ALL_CELLS, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
 		}
 
-		return cache[module];
+		while (!scan_modules_worklist.empty()) {
+			RTLIL::Module *module = scan_modules_worklist.back();
+			scan_modules_worklist.pop_back();
+			if (keep_modules.count(module))
+				continue;
+			bool keep = scan_module(module, thread_pool, dependents, MINIMUM_CELLS, scan_modules_worklist);
+			keep_modules[module] = keep;
+			if (keep)
+				propagate_kept_modules_worklist.push_back(module);
+		}
+
+		while (!propagate_kept_modules_worklist.empty()) {
+			RTLIL::Module *module = propagate_kept_modules_worklist.back();
+			propagate_kept_modules_worklist.pop_back();
+			for (RTLIL::Module *dependent : dependents[module]) {
+				if (keep_modules[dependent])
+					continue;
+				keep_modules[dependent] = true;
+				propagate_kept_modules_worklist.push_back(dependent);
+			}
+		}
 	}
 
-	bool query(Cell *cell, bool ignore_specify = false)
+	bool query(Cell *cell) const
+	{
+		// Cells kept on their own merits, plus specify cells (which `keep_cell` does not cover).
+		if (keep_cell(cell, purge_mode) || cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
+			return true;
+		// Without an enclosing module and design the cell type cannot be resolved to a module.
+		if (cell->module == nullptr || cell->module->design == nullptr)
+			return false;
+		// Otherwise the cell is kept exactly when it instantiates a module recorded as kept.
+		RTLIL::Module *instantiated = cell->module->design->module(cell->type);
+		return instantiated != nullptr && keep_modules.at(instantiated);
+	}
+
+private:
+	enum ScanCells {
+		// Scan every cell to see if it uses a module that is kept.
+		ALL_CELLS,
+		// Stop scanning cells if we determine early that this module is kept.
+		MINIMUM_CELLS,
+	};
+	bool scan_module(Module *module, ParallelDispatchThreadPool &thread_pool, dict<RTLIL::Module *, std::vector<RTLIL::Module*>> &dependents,
+			ScanCells scan_cells, std::vector<Module*> &worklist) const
+	{
+		MonotonicFlag keep_module;
+		if (module->get_bool_attribute(ID::keep)) {
+			if (scan_cells == MINIMUM_CELLS)
+				return true; // Already known to be kept: skip the scan entirely, recording no dependencies.
+			keep_module.set();
+		}
+
+		ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000));
+		ShardedVector<Module*> deps(subpool); // Modules instantiated by cells of `module`; may contain duplicates.
+		const RTLIL::Module *const_module = module;
+		bool purge_mode = this->purge_mode; // Local copy so the lambda does not need to capture `this`.
+		subpool.run([purge_mode, const_module, scan_cells, &deps, &keep_module](const ParallelDispatchThreadPool::RunCtx &ctx) {
+			bool keep = false;
+			for (int i : ctx.item_range(const_module->cells_size())) {
+				Cell *cell = const_module->cell_at(i);
+				if (keep_cell(cell, purge_mode)) {
+					if (scan_cells == MINIMUM_CELLS) {
+						keep_module.set();
+						return; // Ends only this invocation of the lambda.
+					}
+					keep = true;
+				}
+				if (const_module->design) {
+					RTLIL::Module *cell_module = const_module->design->module(cell->type);
+					if (cell_module != nullptr)
+						deps.insert(ctx, cell_module);
+				}
+			}
+			if (keep) {
+				keep_module.set();
+				return; // Kept because of a cell; the wires in this range need not be checked.
+			}
+			for (int i : ctx.item_range(const_module->wires_size())) {
+				Wire *wire = const_module->wire_at(i);
+				if (wire->get_bool_attribute(ID::keep)) {
+					keep_module.set();
+					return;
+				}
+			}
+		});
+		if (scan_cells == MINIMUM_CELLS && keep_module.load())
+			return true; // Kept: the collected dependencies are discarded.
+		for (Module *dep : deps) {
+			dependents[dep].push_back(module); // Record that `module` instantiates `dep`.
+			worklist.push_back(dep); // Hand `dep` back to the caller's worklist.
+		}
+		return keep_module.load();
+	}
+
+	static bool keep_cell(Cell *cell, bool purge_mode)
 	{
 		if (cell->type.in(ID($assert), ID($assume), ID($live), ID($fair), ID($cover)))
 			return true;
@@ -81,9 +155,6 @@ struct keep_cache_t
 		if (cell->type.in(ID($overwrite_tag)))
 			return true;
 
-		if (!ignore_specify && cell->type.in(ID($specify2), ID($specify3), ID($specrule)))
-			return true;
-
 		if (cell->type == ID($print) || cell->type == ID($check))
 			return true;
 
@@ -92,28 +163,32 @@ struct keep_cache_t
 
 		if (!purge_mode && cell->type == ID($scopeinfo))
 			return true;
-
-		if (cell->module && cell->module->design)
-			return query(cell->module->design->module(cell->type));
-
 		return false;
 	}
 };
 
-keep_cache_t keep_cache;
 CellTypes ct_reg, ct_all;
-int count_rm_cells, count_rm_wires;
 
-void rmunused_module_cells(Module *module, bool verbose)
+struct RmStats {
+	int count_rm_cells = 0; // Running count of removed cells.
+	int count_rm_wires = 0; // Running count of removed wires.
+
+	void log() // Prints a one-line summary, but only if at least one cell or wire was removed.
+	{
+		if (count_rm_cells > 0 || count_rm_wires > 0)
+			YOSYS_NAMESPACE_PREFIX log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+	}
+};
+
+unsigned int hash_bit(const SigBit &bit) { // Hash of `bit` as produced by hash_ops<SigBit>, converted to unsigned int.
+	return static_cast<unsigned int>(hash_ops<SigBit>::hash(bit).yield());
+}
+
+void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose, RmStats &stats, keep_cache_t &keep_cache)
 {
 	SigMap sigmap(module);
-	dict<IdString, pool<Cell*>> mem2cells;
-	pool<IdString> mem_unused;
-	pool<Cell*> queue, unused;
-	pool<SigBit> used_raw_bits;
-	dict<SigBit, pool<Cell*>> wire2driver;
-	dict<SigBit, vector<string>> driver_driver_logs;
-	FfInitVals ffinit(&sigmap, module);
+	FfInitVals ffinit;
+	ffinit.set_parallel(&sigmap, subpool.thread_pool(), module);
 
 	SigMap raw_sigmap;
 	for (auto &it : module->connections_) {
@@ -123,117 +198,243 @@ void rmunused_module_cells(Module *module, bool verbose)
 		}
 	}
 
-	for (auto &it : module->memories) {
-		mem_unused.insert(it.first);
-	}
+	struct WireDrivers;
+	struct WireDriver {
+		using Accumulated = WireDrivers;
+		SigBit bit;
+		int driver_cell;
+	};
+	struct WireDrivers {
+		WireDrivers() : driver_cell(0) {}
+		WireDrivers(WireDriver driver) : bit(driver.bit), driver_cell(driver.driver_cell) {}
+		WireDrivers(SigBit bit) : bit(bit), driver_cell(0) {}
+		WireDrivers(WireDrivers &&other) = default;
 
-	for (Cell *cell : module->cells()) {
-		if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2))) {
-			IdString mem_id = cell->getParam(ID::MEMID).decode_string();
-			mem2cells[mem_id].insert(cell);
-		}
-	}
-
-	for (auto &it : module->cells_) {
-		Cell *cell = it.second;
-		for (auto &it2 : cell->connections()) {
-			if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
-				continue;
-			for (auto raw_bit : it2.second) {
-				if (raw_bit.wire == nullptr)
-					continue;
-				auto bit = sigmap(raw_bit);
-				if (bit.wire == nullptr && ct_all.cell_known(cell->type))
-					driver_driver_logs[raw_sigmap(raw_bit)].push_back(stringf("Driver-driver conflict "
-							"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
-							log_signal(raw_bit), log_id(cell), log_id(it2.first), log_signal(bit), log_id(module)));
-				if (bit.wire != nullptr)
-					wire2driver[bit].insert(cell);
-			}
-		}
-		if (keep_cache.query(cell))
-			queue.insert(cell);
-		else
-			unused.insert(cell);
-	}
-
-	for (auto &it : module->wires_) {
-		Wire *wire = it.second;
-		if (wire->port_output || wire->get_bool_attribute(ID::keep)) {
-			for (auto bit : sigmap(wire))
-			for (auto c : wire2driver[bit])
-				queue.insert(c), unused.erase(c);
-			for (auto raw_bit : SigSpec(wire))
-				used_raw_bits.insert(raw_sigmap(raw_bit));
-		}
-	}
-
-	while (!queue.empty())
-	{
-		pool<SigBit> bits;
-		pool<IdString> mems;
-		for (auto cell : queue) {
-			for (auto &it : cell->connections())
-				if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
-					for (auto bit : sigmap(it.second))
-						bits.insert(bit);
-
-			if (cell->type.in(ID($memrd), ID($memrd_v2))) {
-				IdString mem_id = cell->getParam(ID::MEMID).decode_string();
-				if (mem_unused.count(mem_id)) {
-					mem_unused.erase(mem_id);
-					mems.insert(mem_id);
+		class const_iterator {
+		public:
+			const_iterator(const WireDrivers &drivers, bool end)
+					: driver_cell(drivers.driver_cell), in_extra_cells(end) {
+				if (drivers.extra_driver_cells) {
+					if (end) {
+						extra_it = drivers.extra_driver_cells->end();
+					} else {
+						extra_it = drivers.extra_driver_cells->begin();
+					}
 				}
 			}
+			int operator*() const {
+				if (in_extra_cells)
+					return **extra_it;
+				return driver_cell;
+			}
+			const_iterator& operator++() {
+				if (in_extra_cells)
+					++*extra_it;
+				else
+					in_extra_cells = true;
+				return *this;
+			}
+			bool operator!=(const const_iterator &other) const {
+				return !(*this == other);
+			}
+			bool operator==(const const_iterator &other) const {
+				return in_extra_cells == other.in_extra_cells &&
+					extra_it == other.extra_it;
+			}
+		private:
+			std::optional<pool<int>::iterator> extra_it;
+			int driver_cell;
+			bool in_extra_cells;
+		};
+
+		const_iterator begin() const { return const_iterator(*this, false); }
+		const_iterator end() const { return const_iterator(*this, true); }
+
+		SigBit bit;
+		int driver_cell;
+		std::unique_ptr<pool<int>> extra_driver_cells;
+	};
+	struct WireDriversKeyEquality {
+		bool operator()(const WireDrivers &a, const WireDrivers &b) const {
+			return a.bit == b.bit;
 		}
+	};
+	struct WireDriversCollisionHandler {
+		void operator()(WireDrivers &incumbent, WireDrivers &new_value) const {
+			log_assert(new_value.extra_driver_cells == nullptr);
+			if (!incumbent.extra_driver_cells)
+				incumbent.extra_driver_cells.reset(new pool<int>());
+			incumbent.extra_driver_cells->insert(new_value.driver_cell);
+		}
+	};
+	using Wire2Drivers = ShardedHashSet<WireDriver, WireDriversKeyEquality, WireDriversCollisionHandler>;
 
-		queue.clear();
+	Wire2Drivers::Builder wire2driver_builder(subpool);
+	ShardedVector<std::pair<std::string, int>> mem2cells_vector(subpool);
+	ShardedVector<std::pair<SigBit, std::string>> driver_driver_logs(subpool);
+	ShardedVector<Wire*> keep_wires(subpool);
+	const RTLIL::Module *const_module = module;
+	int num_threads = subpool.num_threads();
+	ConcurrentWorkQueue<int> cell_queue(num_threads);
+	std::vector<std::atomic<bool>> unused(const_module->cells_size());
+	subpool.run([&sigmap, &raw_sigmap, &keep_cache, const_module, &mem2cells_vector, &driver_driver_logs, &keep_wires, &cell_queue, &wire2driver_builder, &unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			Cell *cell = const_module->cell_at(i);
+			if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2)))
+				mem2cells_vector.insert(ctx, {cell->getParam(ID::MEMID).decode_string(), i});
 
-		for (auto bit : bits)
-		for (auto c : wire2driver[bit])
-			if (unused.count(c))
-				queue.insert(c), unused.erase(c);
+			for (auto &it2 : cell->connections()) {
+				if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : it2.second) {
+					if (raw_bit.wire == nullptr)
+						continue;
+					auto bit = sigmap(raw_bit);
+					if (bit.wire == nullptr && ct_all.cell_known(cell->type)) {
+						std::string msg = stringf("Driver-driver conflict "
+								"for %s between cell %s.%s and constant %s in %s: Resolved using constant.",
+								log_signal(raw_bit), cell->name.unescape(), it2.first.unescape(), log_signal(bit), const_module->name.unescape());
+						driver_driver_logs.insert(ctx, {raw_sigmap(raw_bit), msg});
+					}
+					if (bit.wire != nullptr)
+						wire2driver_builder.insert(ctx, {{bit, i}, hash_bit(bit)});
+				}
+			}
+			bool keep = keep_cache.query(cell);
+			unused[i].store(!keep, std::memory_order_relaxed);
+			if (keep)
+				cell_queue.push(ctx, i);
+		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			Wire *wire = const_module->wire_at(i);
+			if (wire->port_output || wire->get_bool_attribute(ID::keep))
+				keep_wires.insert(ctx, wire);
+		}
+	});
+	subpool.run([&wire2driver_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		wire2driver_builder.process(ctx);
+	});
+	Wire2Drivers wire2driver(wire2driver_builder);
 
-		for (auto mem : mems)
-		for (auto c : mem2cells[mem])
-			if (unused.count(c))
-				queue.insert(c), unused.erase(c);
+	dict<std::string, pool<int>> mem2cells;
+	for (std::pair<std::string, int> &mem2cell : mem2cells_vector)
+		mem2cells[mem2cell.first].insert(mem2cell.second);
+
+	pool<SigBit> used_raw_bits;
+	int i = 0;
+	for (Wire *wire : keep_wires) {
+		for (auto bit : sigmap(wire)) {
+			const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
+			if (drivers != nullptr)
+				for (int cell_index : *drivers)
+					if (unused[cell_index].exchange(false, std::memory_order_relaxed)) {
+						ThreadIndex fake_thread_index = {i++ % num_threads};
+						cell_queue.push(fake_thread_index, cell_index);
+					}
+		}
+		for (auto raw_bit : SigSpec(wire))
+			used_raw_bits.insert(raw_sigmap(raw_bit));
 	}
 
-	unused.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+	std::vector<std::atomic<bool>> mem_unused(module->memories.size());
+	dict<std::string, int> mem_indices;
+	for (int i = 0; i < GetSize(module->memories); ++i) {
+		mem_indices[module->memories.element(i)->first.str()] = i;
+		mem_unused[i].store(true, std::memory_order_relaxed);
+	}
 
-	for (auto cell : unused) {
+	subpool.run([const_module, &sigmap, &wire2driver, &mem2cells, &unused, &cell_queue, &mem_indices, &mem_unused](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		pool<SigBit> bits;
+		pool<std::string> mems;
+		while (true) {
+			std::vector<int> cell_indices = cell_queue.pop_batch(ctx);
+			if (cell_indices.empty())
+				return;
+			for (auto cell_index : cell_indices) {
+				Cell *cell = const_module->cell_at(cell_index);
+				for (auto &it : cell->connections())
+					if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first))
+						for (auto bit : sigmap(it.second))
+							bits.insert(bit);
+
+				if (cell->type.in(ID($memrd), ID($memrd_v2))) {
+					std::string mem_id = cell->getParam(ID::MEMID).decode_string();
+					if (mem_indices.count(mem_id)) {
+						int mem_index = mem_indices[mem_id];
+						if (mem_unused[mem_index].exchange(false, std::memory_order_relaxed))
+							mems.insert(mem_id);
+					}
+				}
+			}
+
+			for (auto bit : bits) {
+				const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)});
+				if (drivers != nullptr)
+					for (int cell_index : *drivers)
+						if (unused[cell_index].exchange(false, std::memory_order_relaxed))
+							cell_queue.push(ctx, cell_index);
+			}
+			bits.clear();
+
+			for (auto mem : mems) {
+				if (mem2cells.count(mem) == 0)
+					continue;
+				for (int cell_index : mem2cells.at(mem))
+					if (unused[cell_index].exchange(false, std::memory_order_relaxed))
+						cell_queue.push(ctx, cell_index);
+			}
+			mems.clear();
+		}
+	});
+
+	ShardedVector<int> sharded_unused_cells(subpool);
+	subpool.run([const_module, &unused, &sharded_unused_cells, &wire2driver](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of `wire2driver`
+		wire2driver.clear(ctx);
+		for (int i : ctx.item_range(const_module->cells_size()))
+			if (unused[i].load(std::memory_order_relaxed))
+				sharded_unused_cells.insert(ctx, i);
+	});
+	pool<Cell*> unused_cells;
+	for (int cell_index : sharded_unused_cells)
+		unused_cells.insert(const_module->cell_at(cell_index));
+	unused_cells.sort(RTLIL::sort_by_name_id<RTLIL::Cell>());
+
+	for (auto cell : unused_cells) {
 		if (verbose)
 			log_debug("  removing unused `%s' cell `%s'.\n", cell->type, cell->name);
 		module->design->scratchpad_set_bool("opt.did_something", true);
 		if (cell->is_builtin_ff())
 			ffinit.remove_init(cell->getPort(ID::Q));
 		module->remove(cell);
-		count_rm_cells++;
+		stats.count_rm_cells++;
 	}
 
-	for (auto it : mem_unused)
-	{
+	for (const auto &it : mem_indices) {
+		if (!mem_unused[it.second].load(std::memory_order_relaxed))
+			continue;
+		RTLIL::IdString id(it.first);
 		if (verbose)
-			log_debug("  removing unused memory `%s'.\n", it);
-		delete module->memories.at(it);
-		module->memories.erase(it);
+			log_debug("  removing unused memory `%s'.\n", id.unescape());
+		delete module->memories.at(id);
+		module->memories.erase(id);
 	}
 
-	for (auto &it : module->cells_) {
-		Cell *cell = it.second;
-		for (auto &it2 : cell->connections()) {
-			if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
-				continue;
-			for (auto raw_bit : raw_sigmap(it2.second))
-				used_raw_bits.insert(raw_bit);
+	if (!driver_driver_logs.empty()) {
+		// We could do this in parallel but hopefully this is rare.
+		for (auto &it : module->cells_) {
+			Cell *cell = it.second;
+			for (auto &it2 : cell->connections()) {
+				if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first))
+					continue;
+				for (auto raw_bit : raw_sigmap(it2.second))
+					used_raw_bits.insert(raw_bit);
+			}
+		}
+		for (std::pair<SigBit, std::string> &it : driver_driver_logs) {
+			if (used_raw_bits.count(it.first))
+				log_warning("%s\n", it.second);
 		}
-	}
-
-	for (auto it : driver_driver_logs) {
-		if (used_raw_bits.count(it.first))
-			for (auto msg : it.second)
-				log_warning("%s\n", msg);
 	}
 }
 
@@ -247,9 +448,62 @@ int count_nontrivial_wire_attrs(RTLIL::Wire *w)
 	return count;
 }
 
+struct ShardedSigBit { // Thin wrapper around a single RTLIL::SigBit; the element type of ShardedSigPool.
+	using Accumulated = ShardedSigBit; // NOTE(review): presumably the member type ShardedHashSet expects -- confirm in kernel/threading.h
+	RTLIL::SigBit bit; // the wrapped signal bit
+	ShardedSigBit() = default;
+	ShardedSigBit(const RTLIL::SigBit &bit) : bit(bit) {}
+};
+struct ShardedSigBitEquality { // Equality functor used for ShardedSigPool entries.
+	bool operator()(const ShardedSigBit &b1, const ShardedSigBit &b2) const {
+		return b1.bit == b2.bit; // compares only the wrapped SigBit
+	}
+};
+using ShardedSigPool = ShardedHashSet<ShardedSigBit, ShardedSigBitEquality>; // set of signal bits (ShardedHashSet is declared outside this file section)
+
+struct ShardedSigSpec { // Thin wrapper around a whole RTLIL::SigSpec; the element type of ShardedSigSpecPool.
+	using Accumulated = ShardedSigSpec; // NOTE(review): presumably the member type ShardedHashSet expects -- confirm in kernel/threading.h
+	RTLIL::SigSpec spec; // the wrapped signal
+	ShardedSigSpec() = default;
+	ShardedSigSpec(RTLIL::SigSpec spec) : spec(std::move(spec)) {} // by-value sink parameter, moved into the member
+	ShardedSigSpec(ShardedSigSpec &&) = default; // user-declared move ctor: the implicit copy operations are deleted
+};
+struct ShardedSigSpecEquality { // Equality functor used for ShardedSigSpecPool entries.
+	bool operator()(const ShardedSigSpec &s1, const ShardedSigSpec &s2) const {
+		return s1.spec == s2.spec; // compares only the wrapped SigSpec
+	}
+};
+using ShardedSigSpecPool = ShardedHashSet<ShardedSigSpec, ShardedSigSpecEquality>; // set of whole signals (ShardedHashSet is declared outside this file section)
+
+struct DirectWires { // Answers, per wire, "is it an input port or is its sigmapped signal in `direct_sigs`?", memoizing the answer.
+	const SigMap &assign_map; // used to map a wire to the signal that is looked up in `direct_sigs`
+	const ShardedSigSpecPool &direct_sigs; // pool the mapped signal is searched in
+	dict<RTLIL::Wire *, bool> cache; // memoized is_direct() results; written without locking, so do not share one instance across threads
+
+	DirectWires(const SigMap &assign_map, const ShardedSigSpecPool &direct_sigs) : assign_map(assign_map), direct_sigs(direct_sigs) {}
+	void cache_result_for_bit(const SigBit &bit) { // Pre-populates the cache for the bit's wire; bits without a wire are ignored.
+		if (bit.wire != nullptr)
+			is_direct(bit.wire); // result discarded, called only for its caching side effect
+	}
+	bool is_direct(RTLIL::Wire *wire) { // Returns true for input ports, otherwise the (cached) `direct_sigs` membership of assign_map(wire).
+		if (wire->port_input)
+			return true; // input ports are answered directly and never enter the cache
+		auto it = cache.find(wire);
+		if (it != cache.end())
+			return it->second; // cached answer wins even if `assign_map` has changed since it was computed
+		SigSpec direct_sig = assign_map(wire);
+		bool direct = direct_sigs.find({direct_sig, direct_sig.hash_into(Hasher()).yield()}) != nullptr; // lookup key is the signal plus its hash
+		cache.insert({wire, direct});
+		return direct;
+	}
+};
+
 // Should we pick `s2` over `s1` to represent a signal?
-bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool &regs, SigPool &conns, pool<RTLIL::Wire*> &direct_wires)
+bool compare_signals(const RTLIL::SigBit &s1, const RTLIL::SigBit &s2, const ShardedSigPool &regs, const ShardedSigPool &conns, DirectWires &direct_wires)
 {
+	if (s1 == s2)
+		return false;
+
 	RTLIL::Wire *w1 = s1.wire;
 	RTLIL::Wire *w2 = s2.wire;
 
@@ -263,12 +517,20 @@ bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool &regs, SigPoo
 		return !(w2->port_input && w2->port_output);
 
 	if (w1->name.isPublic() && w2->name.isPublic()) {
-		if (regs.check(s1) != regs.check(s2))
-			return regs.check(s2);
-		if (direct_wires.count(w1) != direct_wires.count(w2))
-			return direct_wires.count(w2) != 0;
-		if (conns.check_any(s1) != conns.check_any(s2))
-			return conns.check_any(s2);
+		ShardedSigPool::AccumulatedValue s1_val = {s1, s1.hash_top().yield()};
+		ShardedSigPool::AccumulatedValue s2_val = {s2, s2.hash_top().yield()};
+		bool regs1 = regs.find(s1_val) != nullptr;
+		bool regs2 = regs.find(s2_val) != nullptr;
+		if (regs1 != regs2)
+			return regs2;
+		bool w1_direct = direct_wires.is_direct(w1);
+		bool w2_direct = direct_wires.is_direct(w2);
+		if (w1_direct != w2_direct)
+			return w2_direct;
+		bool conns1 = conns.find(s1_val) != nullptr;
+		bool conns2 = conns.find(s2_val) != nullptr;
+		if (conns1 != conns2)
+			return conns2;
 	}
 
 	if (w1 == w2)
@@ -301,109 +563,185 @@ bool check_public_name(RTLIL::IdString id)
 	return true;
 }
 
-bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose)
-{
-	// `register_signals` and `connected_signals` will help us decide later on
-	// on picking representatives out of groups of connected signals
-	SigPool register_signals;
-	SigPool connected_signals;
-	if (!purge_mode)
-		for (auto &it : module->cells_) {
-			RTLIL::Cell *cell = it.second;
-			if (ct_reg.cell_known(cell->type)) {
-				bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
-				for (auto &it2 : cell->connections())
-					if (clk2fflogic ? it2.first == ID::D : ct_reg.cell_output(cell->type, it2.first))
-						register_signals.add(it2.second);
-			}
-			for (auto &it2 : cell->connections())
-				connected_signals.add(it2.second);
-		}
+void add_spec(ShardedSigPool::Builder &builder, const ThreadIndex &thread, const RTLIL::SigSpec &spec) { // Inserts each bit of `spec` that has a wire into `builder` on behalf of `thread`.
+	for (SigBit bit : spec)
+		if (bit.wire != nullptr) // bits without a wire (constants) are never stored
+			builder.insert(thread, {bit, bit.hash_top().yield()}); // entry is the bit together with its hash
+}
 
+bool check_any(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) { // Returns true if at least one bit of `spec` is found in `sigs`.
+	for (SigBit b : spec)
+		if (sigs.find({b, b.hash_top().yield()}) != nullptr) // find() yields nullptr when the bit is absent
+			return true;
+	return false; // also the result for an empty `spec`
+}
+
+bool check_all(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) { // Returns true if every bit of `spec` is found in `sigs`.
+	for (SigBit b : spec)
+		if (sigs.find({b, b.hash_top().yield()}) == nullptr) // every bit is looked up, including bits without a wire
+			return false;
+	return true; // also the result for an empty `spec`
+}
+
+bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool purge_mode, bool verbose, RmStats &stats)
+{
 	SigMap assign_map(module);
 
+	const RTLIL::Module *const_module = module;
+	// `register_signals` and `connected_signals` will help us later on
+	// when picking representatives out of groups of connected signals
+	ShardedSigPool::Builder register_signals_builder(subpool);
+	ShardedSigPool::Builder connected_signals_builder(subpool);
 	// construct a pool of wires which are directly driven by a known celltype,
 	// this will influence our choice of representatives
-	pool<RTLIL::Wire*> direct_wires;
-	{
-		pool<RTLIL::SigSpec> direct_sigs;
-		for (auto &it : module->cells_) {
-			RTLIL::Cell *cell = it.second;
+	ShardedSigSpecPool::Builder direct_sigs_builder(subpool);
+	subpool.run([const_module, purge_mode, &assign_map, &direct_sigs_builder, &register_signals_builder, &connected_signals_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (!purge_mode) {
+				if (ct_reg.cell_known(cell->type)) {
+					bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic));
+					for (auto &it2 : cell->connections())
+						if (clk2fflogic ? it2.first == ID::D : ct_reg.cell_output(cell->type, it2.first))
+							add_spec(register_signals_builder, ctx, it2.second);
+				}
+				for (auto &it2 : cell->connections())
+					add_spec(connected_signals_builder, ctx, it2.second);
+			}
 			if (ct_all.cell_known(cell->type))
 				for (auto &it2 : cell->connections())
-					if (ct_all.cell_output(cell->type, it2.first))
-						direct_sigs.insert(assign_map(it2.second));
+					if (ct_all.cell_output(cell->type, it2.first)) {
+						RTLIL::SigSpec spec = assign_map(it2.second);
+						unsigned int hash = spec.hash_into(Hasher()).yield();
+						direct_sigs_builder.insert(ctx, {std::move(spec), hash});
+					}
 		}
-		for (auto &it : module->wires_) {
-			if (direct_sigs.count(assign_map(it.second)) || it.second->port_input)
-				direct_wires.insert(it.second);
-		}
-	}
+	});
+	subpool.run([&register_signals_builder, &connected_signals_builder, &direct_sigs_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		register_signals_builder.process(ctx);
+		connected_signals_builder.process(ctx);
+		direct_sigs_builder.process(ctx);
+	});
+	ShardedSigPool register_signals(register_signals_builder);
+	ShardedSigPool connected_signals(connected_signals_builder);
+	ShardedSigSpecPool direct_sigs(direct_sigs_builder);
 
-	// weight all options for representatives with `compare_signals`,
-	// the one that wins will be what `assign_map` maps to
-	for (auto &it : module->wires_) {
-		RTLIL::Wire *wire = it.second;
-		for (int i = 0; i < wire->width; i++) {
-			RTLIL::SigBit s1 = RTLIL::SigBit(wire, i), s2 = assign_map(s1);
-			if (compare_signals(s2, s1, register_signals, connected_signals, direct_wires))
-				assign_map.add(s1);
+	ShardedVector<RTLIL::SigBit> sigmap_canonical_candidates(subpool);
+	DirectWires direct_wires(assign_map, direct_sigs);
+	subpool.run([const_module, &assign_map, &register_signals, &connected_signals, &sigmap_canonical_candidates, &direct_sigs, &direct_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		std::optional<DirectWires> local_direct_wires;
+		DirectWires *this_thread_direct_wires = &direct_wires;
+		if (ctx.thread_num > 0) {
+			local_direct_wires.emplace(assign_map, direct_sigs);
+			this_thread_direct_wires = &local_direct_wires.value();
 		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			for (int j = 0; j < wire->width; ++j) {
+				RTLIL::SigBit s1(wire, j);
+				RTLIL::SigBit s2 = assign_map(s1);
+				if (compare_signals(s2, s1, register_signals, connected_signals, *this_thread_direct_wires))
+					sigmap_canonical_candidates.insert(ctx, s1);
+			}
+		}
+	});
+	// Cache all the direct_wires results that we might possibly need. This avoids the results
+	// changing when we update `assign_map` below.
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		direct_wires.cache_result_for_bit(candidate);
+		direct_wires.cache_result_for_bit(assign_map(candidate));
+	}
+	for (RTLIL::SigBit candidate : sigmap_canonical_candidates) {
+		RTLIL::SigBit current_canonical = assign_map(candidate);
+		if (compare_signals(current_canonical, candidate, register_signals, connected_signals, direct_wires))
+			assign_map.add(candidate);
 	}
 
 	// we are removing all connections
 	module->connections_.clear();
 
 	// used signals sigmapped
-	SigPool used_signals;
+	ShardedSigPool::Builder used_signals_builder(subpool);
 	// used signals pre-sigmapped
-	SigPool raw_used_signals;
+	ShardedSigPool::Builder raw_used_signals_builder(subpool);
 	// used signals sigmapped, ignoring drivers (we keep track of this to set `unused_bits`)
-	SigPool used_signals_nodrivers;
-
-	// gather the usage information for cells
-	for (auto &it : module->cells_) {
-		RTLIL::Cell *cell = it.second;
-		for (auto &it2 : cell->connections_) {
-			assign_map.apply(it2.second); // modify the cell connection in place
-			raw_used_signals.add(it2.second);
-			used_signals.add(it2.second);
-			if (!ct_all.cell_output(cell->type, it2.first))
-				used_signals_nodrivers.add(it2.second);
-		}
-	}
-
-	// gather the usage information for ports, wires with `keep`,
+	ShardedSigPool::Builder used_signals_nodrivers_builder(subpool);
+	struct UpdateConnection {
+		RTLIL::Cell *cell;
+		RTLIL::IdString port;
+		RTLIL::SigSpec spec;
+	};
+	ShardedVector<UpdateConnection> update_connections(subpool);
+	ShardedVector<RTLIL::Wire*> initialized_wires(subpool);
+	// gather the usage information for cells and update cell connections
+	// also gather the usage information for ports, wires with `keep`
 	// also gather init bits
+	subpool.run([const_module, &register_signals, &connected_signals, &direct_sigs, &assign_map, &used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder, &update_connections, &initialized_wires](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Parallel destruction of these sharded structures
+		register_signals.clear(ctx);
+		connected_signals.clear(ctx);
+		direct_sigs.clear(ctx);
+
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			for (const auto &it2 : cell->connections_) {
+				SigSpec spec = assign_map(it2.second);
+				if (spec != it2.second)
+					update_connections.insert(ctx, {cell, it2.first, spec});
+				add_spec(raw_used_signals_builder, ctx, spec);
+				add_spec(used_signals_builder, ctx, spec);
+				if (!ct_all.cell_output(cell->type, it2.first))
+					add_spec(used_signals_nodrivers_builder, ctx, spec);
+			}
+		}
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			if (wire->port_id > 0) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				add_spec(raw_used_signals_builder, ctx, sig);
+				assign_map.apply(sig);
+				add_spec(used_signals_builder, ctx, sig);
+				if (!wire->port_input)
+					add_spec(used_signals_nodrivers_builder, ctx, sig);
+			}
+			if (wire->get_bool_attribute(ID::keep)) {
+				RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
+				assign_map.apply(sig);
+				add_spec(used_signals_builder, ctx, sig);
+			}
+			auto it2 = wire->attributes.find(ID::init);
+			if (it2 != wire->attributes.end())
+				initialized_wires.insert(ctx, wire);
+		}
+	});
+	subpool.run([&used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		used_signals_builder.process(ctx);
+		raw_used_signals_builder.process(ctx);
+		used_signals_nodrivers_builder.process(ctx);
+	});
+	ShardedSigPool used_signals(used_signals_builder);
+	ShardedSigPool raw_used_signals(raw_used_signals_builder);
+	ShardedSigPool used_signals_nodrivers(used_signals_nodrivers_builder);
+
 	dict<RTLIL::SigBit, RTLIL::State> init_bits;
-	for (auto &it : module->wires_) {
-		RTLIL::Wire *wire = it.second;
-		if (wire->port_id > 0) {
-			RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-			raw_used_signals.add(sig);
-			assign_map.apply(sig);
-			used_signals.add(sig);
-			if (!wire->port_input)
-				used_signals_nodrivers.add(sig);
-		}
-		if (wire->get_bool_attribute(ID::keep)) {
-			RTLIL::SigSpec sig = RTLIL::SigSpec(wire);
-			assign_map.apply(sig);
-			used_signals.add(sig);
-		}
-		auto it2 = wire->attributes.find(ID::init);
-		if (it2 != wire->attributes.end()) {
-			RTLIL::Const &val = it2->second;
-			SigSpec sig = assign_map(wire);
-			for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
-				if (val[i] != State::Sx)
-					init_bits[sig[i]] = val[i];
-			wire->attributes.erase(it2);
-		}
+	// The wires that appear in the keys of `init_bits`
+	pool<Wire*> init_bits_wires;
+	for (UpdateConnection &update : update_connections) // non-const reference so the SigSpec below is really moved, not copied
+		update.cell->connections_.at(update.port) = std::move(update.spec);
+	for (RTLIL::Wire *initialized_wire : initialized_wires) {
+		auto it = initialized_wire->attributes.find(ID::init); // wire was collected above because it carries `init`
+		RTLIL::Const &val = it->second;
+		SigSpec sig = assign_map(initialized_wire);
+		for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++)
+			if (val[i] != State::Sx && sig[i].wire != nullptr) { // skip undefined init bits and bits mapped to constants
+				init_bits[sig[i]] = val[i];
+				init_bits_wires.insert(sig[i].wire);
+			}
+		initialized_wire->attributes.erase(it); // `init` is dropped here; the following loop sets it again from `init_bits`
+	}
 
 	// set init attributes on all wires of a connected group
-	for (auto wire : module->wires()) {
+	for (RTLIL::Wire *wire : init_bits_wires) {
 		bool found = false;
 		Const val(State::Sx, wire->width);
 		for (int i = 0; i < wire->width; i++) {
@@ -418,81 +756,117 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	}
 
 	// now decide for each wire if we should be deleting it
-	pool<RTLIL::Wire*> del_wires_queue;
-	for (auto wire : module->wires())
-	{
-		SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
-		log_assert(GetSize(s1) == GetSize(s2));
+	ShardedVector<RTLIL::Wire*> del_wires(subpool);
+	ShardedVector<RTLIL::Wire*> remove_init(subpool);
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_init(subpool);
+	ShardedVector<RTLIL::SigSig> connections(subpool);
+	ShardedVector<RTLIL::Wire*> remove_unused_bits(subpool);
+	ShardedVector<std::pair<RTLIL::Wire*, RTLIL::Const>> set_unused_bits(subpool);
+	subpool.run([const_module, purge_mode, &assign_map, &used_signals, &raw_used_signals, &used_signals_nodrivers, &del_wires, &remove_init, &set_init, &connections, &remove_unused_bits, &set_unused_bits](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(i);
+			SigSpec s1 = SigSpec(wire), s2 = assign_map(s1);
+			log_assert(GetSize(s1) == GetSize(s2));
 
-		Const initval;
-		if (wire->attributes.count(ID::init))
-			initval = wire->attributes.at(ID::init);
-		if (GetSize(initval) != GetSize(wire))
-			initval.resize(GetSize(wire), State::Sx);
-		if (initval.is_fully_undef())
-			wire->attributes.erase(ID::init);
+			Const initval;
+			bool has_init_attribute = wire->attributes.count(ID::init);
+			bool init_changed = false;
+			if (has_init_attribute)
+				initval = wire->attributes.at(ID::init);
+			if (GetSize(initval) != GetSize(wire)) {
+				initval.resize(GetSize(wire), State::Sx);
+				init_changed = true;
+			}
 
-		if (GetSize(wire) == 0) {
-			// delete zero-width wires, unless they are module ports
-			if (wire->port_id == 0)
+			if (GetSize(wire) == 0) {
+				// delete zero-width wires, unless they are module ports
+				if (wire->port_id == 0)
+					goto delete_this_wire;
+			} else
+			if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
+				// do not delete anything with "keep" or module ports or initialized wires
+			} else
+			if (!purge_mode && check_public_name(wire->name) && (check_any(raw_used_signals, s1) || check_any(used_signals, s2) || s1 != s2)) {
+				// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
+			} else
+			if (!check_any(raw_used_signals, s1)) {
+				// delete wires that aren't used by anything directly
 				goto delete_this_wire;
-		} else
-		if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) {
-			// do not delete anything with "keep" or module ports or initialized wires
-		} else
-		if (!purge_mode && check_public_name(wire->name) && (raw_used_signals.check_any(s1) || used_signals.check_any(s2) || s1 != s2)) {
-			// do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased
-		} else
-		if (!raw_used_signals.check_any(s1)) {
-			// delete wires that aren't used by anything directly
-			goto delete_this_wire;
-		}
-
-		if (0)
-		{
-	delete_this_wire:
-			del_wires_queue.insert(wire);
-		}
-		else
-		{
-			RTLIL::SigSig new_conn;
-			for (int i = 0; i < GetSize(s1); i++)
-				if (s1[i] != s2[i]) {
-					if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
-						s2[i] = initval[i];
-						initval.set(i, State::Sx);
-					}
-					new_conn.first.append(s1[i]);
-					new_conn.second.append(s2[i]);
-				}
-			if (new_conn.first.size() > 0) {
-				if (initval.is_fully_undef())
-					wire->attributes.erase(ID::init);
-				else
-					wire->attributes.at(ID::init) = initval;
-				module->connect(new_conn);
 			}
 
-			if (!used_signals_nodrivers.check_all(s2)) {
+			if (0)
+			{
+		delete_this_wire:
+				del_wires.insert(ctx, wire);
+			}
+			else
+			{
+				RTLIL::SigSig new_conn;
+				for (int i = 0; i < GetSize(s1); i++)
+					if (s1[i] != s2[i]) {
+						if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) {
+							s2[i] = initval[i];
+							initval.set(i, State::Sx);
+							init_changed = true;
+						}
+						new_conn.first.append(s1[i]);
+						new_conn.second.append(s2[i]);
+					}
+				if (new_conn.first.size() > 0)
+					connections.insert(ctx, std::move(new_conn));
+				if (initval.is_fully_undef()) {
+					if (has_init_attribute)
+						remove_init.insert(ctx, wire);
+				} else
+					if (init_changed)
+						set_init.insert(ctx, {wire, std::move(initval)});
+
 				std::string unused_bits;
-				for (int i = 0; i < GetSize(s2); i++) {
-					if (s2[i].wire == NULL)
-						continue;
-					if (!used_signals_nodrivers.check(s2[i])) {
-						if (!unused_bits.empty())
-							unused_bits += " ";
-						unused_bits += stringf("%d", i);
+				if (!check_all(used_signals_nodrivers, s2)) {
+					for (int i = 0; i < GetSize(s2); i++) {
+						if (s2[i].wire == NULL)
+							continue;
+						SigBit b = s2[i];
+						if (used_signals_nodrivers.find({b, b.hash_top().yield()}) == nullptr) {
+							if (!unused_bits.empty())
+								unused_bits += " ";
+							unused_bits += stringf("%d", i);
+						}
 					}
 				}
-				if (unused_bits.empty() || wire->port_id != 0)
-					wire->attributes.erase(ID::unused_bits);
-				else
-					wire->attributes[ID::unused_bits] = RTLIL::Const(unused_bits);
-			} else {
-				wire->attributes.erase(ID::unused_bits);
+				if (unused_bits.empty() || wire->port_id != 0) {
+					if (wire->attributes.count(ID::unused_bits))
+						remove_unused_bits.insert(ctx, wire);
+				} else {
+					RTLIL::Const unused_bits_const(std::move(unused_bits));
+					if (wire->attributes.count(ID::unused_bits)) {
+						RTLIL::Const &unused_bits_attr = wire->attributes.at(ID::unused_bits);
+						if (unused_bits_attr != unused_bits_const)
+							set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+					} else
+						set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)});
+				}
 			}
 		}
-	}
+	});
+	pool<RTLIL::Wire*> del_wires_queue;
+	del_wires_queue.insert(del_wires.begin(), del_wires.end());
+	for (RTLIL::Wire *wire : remove_init)
+		wire->attributes.erase(ID::init);
+	for (auto &p : set_init)
+		p.first->attributes[ID::init] = std::move(p.second);
+	for (auto &conn : connections)
+		module->connect(std::move(conn));
+	for (RTLIL::Wire *wire : remove_unused_bits)
+		wire->attributes.erase(ID::unused_bits);
+	for (auto &p : set_unused_bits)
+		p.first->attributes[ID::unused_bits] = std::move(p.second);
+
+	subpool.run([&used_signals, &raw_used_signals, &used_signals_nodrivers](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		used_signals.clear(ctx);
+		raw_used_signals.clear(ctx);
+		used_signals_nodrivers.clear(ctx);
+	});
 
 	int del_temp_wires_count = 0;
 	for (auto wire : del_wires_queue) {
@@ -503,7 +877,7 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	}
 
 	module->remove(del_wires_queue);
-	count_rm_wires += GetSize(del_wires_queue);
+	stats.count_rm_wires += GetSize(del_wires_queue);
 
 	if (verbose && del_temp_wires_count)
 		log_debug("  removed %d unused temporary wires.\n", del_temp_wires_count);
@@ -514,79 +888,93 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos
 	return !del_wires_queue.empty();
 }
 
-bool rmunused_module_init(RTLIL::Module *module, bool verbose)
+bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
 {
-	bool did_something = false;
 	CellTypes fftypes;
 	fftypes.setup_internals_mem();
 
 	SigMap sigmap(module);
-	dict<SigBit, State> qbits;
 
-	for (auto cell : module->cells())
-		if (fftypes.cell_known(cell->type) && cell->hasPort(ID::Q))
-		{
-			SigSpec sig = cell->getPort(ID::Q);
-
-			for (int i = 0; i < GetSize(sig); i++)
+	const Module *const_module = module;
+	ShardedVector<std::pair<SigBit, State>> results(subpool);
+	subpool.run([const_module, &fftypes, &results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (fftypes.cell_known(cell->type) && cell->hasPort(ID::Q))
 			{
-				SigBit bit = sig[i];
+				SigSpec sig = cell->getPort(ID::Q);
 
-				if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
-					continue;
+				for (int i = 0; i < GetSize(sig); i++)
+				{
+					SigBit bit = sig[i];
 
-				Const init = bit.wire->attributes.at(ID::init);
+					if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0)
+						continue;
 
-				if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
-					continue;
+					Const init = bit.wire->attributes.at(ID::init);
 
-				sigmap.add(bit);
-				qbits[bit] = init[i];
-			}
-		}
+					if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz)
+						continue;
 
-	for (auto wire : module->wires())
-	{
-		if (wire->attributes.count(ID::init) == 0)
-			continue;
-
-		Const init = wire->attributes.at(ID::init);
-
-		for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
-		{
-			if (init[i] == State::Sx || init[i] == State::Sz)
-				continue;
-
-			SigBit wire_bit = SigBit(wire, i);
-			SigBit mapped_wire_bit = sigmap(wire_bit);
-
-			if (wire_bit == mapped_wire_bit)
-				goto next_wire;
-
-			if (mapped_wire_bit.wire) {
-				if (qbits.count(mapped_wire_bit) == 0)
-					goto next_wire;
-
-				if (qbits.at(mapped_wire_bit) != init[i])
-					goto next_wire;
-			}
-			else {
-				if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
-					goto next_wire;
-
-				if (mapped_wire_bit != init[i]) {
-					log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
-					goto next_wire;
+					results.insert(ctx, {bit, init[i]});
 				}
 			}
 		}
+	});
+	dict<SigBit, State> qbits;
+	for (std::pair<SigBit, State> &p : results) {
+		sigmap.add(p.first);
+		qbits[p.first] = p.second;
+	}
 
+	ShardedVector<RTLIL::Wire*> wire_results(subpool);
+	subpool.run([const_module, &sigmap, &qbits, &wire_results](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int j : ctx.item_range(const_module->wires_size())) {
+			RTLIL::Wire *wire = const_module->wire_at(j);
+			if (wire->attributes.count(ID::init) == 0)
+				continue;
+			Const init = wire->attributes.at(ID::init);
+
+			for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++)
+			{
+				if (init[i] == State::Sx || init[i] == State::Sz)
+					continue;
+
+				SigBit wire_bit = SigBit(wire, i);
+				SigBit mapped_wire_bit = sigmap(wire_bit);
+
+				if (wire_bit == mapped_wire_bit)
+					goto next_wire;
+
+				if (mapped_wire_bit.wire) {
+					if (qbits.count(mapped_wire_bit) == 0)
+						goto next_wire;
+
+					if (qbits.at(mapped_wire_bit) != init[i])
+						goto next_wire;
+				}
+				else {
+					if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz)
+						goto next_wire;
+
+					if (mapped_wire_bit != init[i]) {
+						log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i]));
+						goto next_wire;
+					}
+				}
+			}
+			wire_results.insert(ctx, wire);
+
+			next_wire:;
+		}
+	});
+
+	bool did_something = false;
+	for (RTLIL::Wire *wire : wire_results) {
 		if (verbose)
 			log_debug("  removing redundant init attribute on %s.\n", log_id(wire));
-
 		wire->attributes.erase(ID::init);
 		did_something = true;
-	next_wire:;
 	}
 
 	if (did_something)
@@ -595,47 +983,53 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose)
 	return did_something;
 }
 
-void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit)
+void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose)
 {
-	if (verbose)
-		log("Finding unused cells or wires in module %s..\n", module->name);
+	ShardedVector<RTLIL::Cell*> delcells(subpool);
+	ShardedVector<RTLIL::SigSig> new_connections(subpool);
+	const RTLIL::Module *const_module = module;
+	subpool.run([const_module, &delcells, &new_connections](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(const_module->cells_size())) {
+			RTLIL::Cell *cell = const_module->cell_at(i);
+			if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
+				bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec y = cell->getPort(ID::Y);
+				a.extend_u0(GetSize(y), is_signed);
 
-	std::vector<RTLIL::Cell*> delcells;
-	for (auto cell : module->cells()) {
-		if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) {
-			bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool();
-			RTLIL::SigSpec a = cell->getPort(ID::A);
-			RTLIL::SigSpec y = cell->getPort(ID::Y);
-			a.extend_u0(GetSize(y), is_signed);
-
-			if (a.has_const(State::Sz)) {
-				SigSpec new_a;
-				SigSpec new_y;
-				for (int i = 0; i < GetSize(a); ++i) {
-					SigBit b = a[i];
-					if (b == State::Sz)
-						continue;
-					new_a.append(b);
-					new_y.append(y[i]);
+				if (a.has_const(State::Sz)) {
+					RTLIL::SigSpec new_a;
+					RTLIL::SigSpec new_y;
+					for (int i = 0; i < GetSize(a); ++i) {
+						RTLIL::SigBit b = a[i];
+						if (b == State::Sz)
+							continue;
+						new_a.append(b);
+						new_y.append(y[i]);
+					}
+					a = std::move(new_a);
+					y = std::move(new_y);
 				}
-				a = std::move(new_a);
-				y = std::move(new_y);
+				if (!y.empty())
+					new_connections.insert(ctx, {y, a});
+				delcells.insert(ctx, cell);
+			} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
+				RTLIL::SigSpec a = cell->getPort(ID::A);
+				RTLIL::SigSpec b = cell->getPort(ID::B);
+				if (a.has_const() && !b.has_const())
+					std::swap(a, b);
+				new_connections.insert(ctx, {a, b});
+				delcells.insert(ctx, cell);
+			} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
+				delcells.insert(ctx, cell);
 			}
-			if (!y.empty())
-				module->connect(y, a);
-			delcells.push_back(cell);
-		} else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) {
-			RTLIL::SigSpec a = cell->getPort(ID::A);
-			RTLIL::SigSpec b = cell->getPort(ID::B);
-			if (a.has_const() && !b.has_const())
-				std::swap(a, b);
-			module->connect(a, b);
-			delcells.push_back(cell);
-		} else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) {
-			delcells.push_back(cell);
 		}
+	});
+	bool did_something = false;
+	for (RTLIL::SigSig &connection : new_connections) {
+		module->connect(connection);
 	}
-	for (auto cell : delcells) {
+	for (RTLIL::Cell *cell : delcells) {
 		if (verbose) {
 			if (cell->type == ID($connect))
 				log_debug("  removing connect cell `%s': %s <-> %s\n", cell->name,
@@ -648,17 +1042,28 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool
 						log_signal(cell->getPort(ID::Y)), log_signal(cell->getPort(ID::A)));
 		}
 		module->remove(cell);
+		did_something = true;
 	}
-	if (!delcells.empty())
+	if (did_something)
 		module->design->scratchpad_set_bool("opt.did_something", true);
-
-	rmunused_module_cells(module, verbose);
-	while (rmunused_module_signals(module, purge_mode, verbose)) { }
-
-	if (rminit && rmunused_module_init(module, verbose))
-		while (rmunused_module_signals(module, purge_mode, verbose)) { }
 }
 
+void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache) // Runs the cleanup steps below, in order, on one module.
+{
+	if (verbose)
+		log("Finding unused cells or wires in module %s..\n", module->name);
+
+	// Use no more than one worker per thousand cells, rounded down, so
+	// we only start multithreading with at least 2000 cells.
+	int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000);
+	ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads); // the same subpool is handed to every step below
+	remove_temporary_cells(module, subpool, verbose);
+	rmunused_module_cells(module, subpool, verbose, stats, keep_cache);
+	while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { } // repeat until a pass reports that it deleted no wires
+
+	if (rminit && rmunused_module_init(module, subpool, verbose)) // only when init removal is requested and actually removed something
+		while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { } // then clean signals again until nothing changes
+}
 struct OptCleanPass : public Pass {
 	OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { }
 	void help() override
@@ -695,7 +1100,15 @@ struct OptCleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		keep_cache.reset(design, purge_mode);
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_whole_modules_warn())
+			if (!module->has_processes_warn())
+				selected_modules.push_back(module);
+		int thread_pool_size = 0;
+		for (RTLIL::Module *m : selected_modules)
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
+		ParallelDispatchThreadPool thread_pool(thread_pool_size);
+		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
 
 		ct_reg.setup_internals_mem();
 		ct_reg.setup_internals_anyinit();
@@ -703,22 +1116,14 @@ struct OptCleanPass : public Pass {
 
 		ct_all.setup(design);
 
-		count_rm_cells = 0;
-		count_rm_wires = 0;
-
-		for (auto module : design->selected_whole_modules_warn()) {
-			if (module->has_processes_warn())
-				continue;
-			rmunused_module(module, purge_mode, true, true);
-		}
-
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+		RmStats stats;
+		for (auto module : selected_modules)
+			rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache);
+		stats.log();
 
 		design->optimize();
 		design->check();
 
-		keep_cache.reset();
 		ct_reg.clear();
 		ct_all.clear();
 		log_pop();
@@ -758,7 +1163,15 @@ struct CleanPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		keep_cache.reset(design);
+		std::vector<RTLIL::Module*> selected_modules;
+		for (auto module : design->selected_unboxed_whole_modules())
+			if (!module->has_processes())
+				selected_modules.push_back(module);
+		int thread_pool_size = 0;
+		for (RTLIL::Module *m : selected_modules)
+			thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000));
+		ParallelDispatchThreadPool thread_pool(thread_pool_size);
+		keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules);
 
 		ct_reg.setup_internals_mem();
 		ct_reg.setup_internals_anyinit();
@@ -766,23 +1179,16 @@ struct CleanPass : public Pass {
 
 		ct_all.setup(design);
 
-		count_rm_cells = 0;
-		count_rm_wires = 0;
-
-		for (auto module : design->selected_unboxed_whole_modules()) {
-			if (module->has_processes())
-				continue;
-			rmunused_module(module, purge_mode, ys_debug(), true);
-		}
+		RmStats stats;
+		for (auto module : selected_modules)
+			rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache);
 
 		log_suppressed();
-		if (count_rm_cells > 0 || count_rm_wires > 0)
-			log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires);
+		stats.log();
 
 		design->optimize();
 		design->check();
 
-		keep_cache.reset();
 		ct_reg.clear();
 		ct_all.clear();
 
diff --git a/tests/opt/opt_clean_init_const.ys b/tests/opt/opt_clean_init_const.ys
new file mode 100644
index 000000000..1b3d5db63
--- /dev/null
+++ b/tests/opt/opt_clean_init_const.ys
@@ -0,0 +1,9 @@
+read_rtlil << EOT
+module \top
+  attribute \init 1'0
+  wire \w
+
+  connect \w 1'0
+end
+EOT
+opt_clean
diff --git a/tests/tools/rtlil-fuzz-grammar.json b/tests/tools/rtlil-fuzz-grammar.json
index c27b160f4..96af9bde3 100644
--- a/tests/tools/rtlil-fuzz-grammar.json
+++ b/tests/tools/rtlil-fuzz-grammar.json
@@ -8,7 +8,7 @@
 			"end\n"
 		]
 	],
-	"<WIRE>": [ [ "  wire width ", "<WIDTH>", " ", "<WIRE_MODE>", " ", "<WIRE_ID>", "\n" ] ],
+	"<WIRE>": [ [ "<WIRE_ATTRIBUTES>", "  wire width ", "<WIDTH>", " ", "<WIRE_MODE>", " ", "<WIRE_ID>", "\n" ] ],
 	"<WIDTH>": [ [ "1" ], [ "2" ], [ "3" ], [ "4" ], [ "32" ], [ "128" ] ],
 	"<WIRE_MODE>": [ [ "input ", "<PORT_ID>" ], [ "output ", "<PORT_ID>" ], [ "inout ", "<PORT_ID>" ], [] ],
 	"<CELL>": [
@@ -71,6 +71,7 @@
 			"  end\n"
 		]
 	],
+	"<WIRE_ATTRIBUTE>": [ [ "  attribute \\init ", "<CONST>", "\n" ] ],
 	"<WIRE_ID>": [ [ "\\wire_a" ], [ "\\wire_b" ], [ "\\wire_c" ], [ "\\wire_d" ], [ "\\wire_e" ], [ "\\wire_f" ], [ "\\wire_g" ], [ "\\wire_h" ], [ "\\wire_i" ], [ "\\wire_j" ] ],
 	"<CELL_ID>": [ [ "\\cell_a" ], [ "\\cell_b" ], [ "\\cell_c" ], [ "\\cell_d" ], [ "\\cell_e" ], [ "\\cell_f" ], [ "\\cell_g" ], [ "\\cell_h" ], [ "\\cell_i" ], [ "\\cell_j" ] ],
 	"<BLACKBOX_CELL>": [ [ "\\bb1" ], [ "\\bb2" ] ],
@@ -97,6 +98,7 @@
 	"<CONNECT>": [ [ "  connect ", "<SIGSPEC>", " ", "<SIGSPEC>", "\n" ] ],
 
 	"<WIRES>": [ [ ], [ "<WIRE>", "<WIRES>" ] ],
+	"<WIRE_ATTRIBUTES>": [ [ ], [ "<WIRE_ATTRIBUTE>", "<WIRE_ATTRIBUTES>" ] ],
 	"<CELLS>": [ [ ], [ "<CELL>", "<CELLS>" ] ],
 	"<BITS>": [ [ ], [ "<BIT>", "<BITS>" ] ],
 	"<CONNECTS>": [ [ ], [ "<CONNECT>", "<CONNECTS>" ] ],
diff --git a/tests/unit/Makefile b/tests/unit/Makefile
index b275d7f41..3165ad97b 100644
--- a/tests/unit/Makefile
+++ b/tests/unit/Makefile
@@ -4,10 +4,10 @@ UNAME_S := $(shell uname -s)
 GTEST_PREFIX := $(shell brew --prefix googletest 2>/dev/null)
 ifeq ($(GTEST_PREFIX),)
   GTEST_CXXFLAGS :=
-  GTEST_LDFLAGS := -lgtest -lgtest_main
+  GTEST_LDFLAGS := -lgtest -lgmock -lgtest_main
 else
   GTEST_CXXFLAGS := -I$(GTEST_PREFIX)/include
-  GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgtest_main
+  GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgmock -lgtest_main
 endif
 
 ifeq ($(UNAME_S),Darwin)
diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc
new file mode 100644
index 000000000..c0bd5927f
--- /dev/null
+++ b/tests/unit/kernel/threadingTest.cc
@@ -0,0 +1,442 @@
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include "kernel/threading.h"
+
+YOSYS_NAMESPACE_BEGIN
+
+class ThreadingTest : public testing::Test { // Fixture shared by every test in this file.
+protected:
+	ThreadingTest() {
+		if (log_files.empty()) // Register stdout as a log file when none is configured yet.
+			log_files.emplace_back(stdout);
+	}
+};
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolCreate) { // Checks num_threads() for requested sizes 0, 1 and 2.
+	// A pool requested with 0 threads is expected to report 1 thread
+	ParallelDispatchThreadPool pool0(0);
+	EXPECT_EQ(pool0.num_threads(), 1);
+
+	// A pool requested with 1 thread is expected to report 1 thread
+	ParallelDispatchThreadPool pool1(1);
+	EXPECT_EQ(pool1.num_threads(), 1);
+
+	// A pool requested with 2 threads may report 1 or 2 threads:
+	ParallelDispatchThreadPool pool2(2);
+	// YOSYS_MAX_THREADS or system configuration could mean we
+	// decide to only use one thread.
+	EXPECT_GE(pool2.num_threads(), 1);
+	EXPECT_LE(pool2.num_threads(), 2);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunSimple) { // One run() call should invoke the callback once per pool thread.
+	ParallelDispatchThreadPool pool(2);
+
+	std::atomic<int> counter{0}; // incremented once by every callback invocation
+	pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+		counter.fetch_add(1, std::memory_order_relaxed);
+	});
+
+	EXPECT_EQ(counter.load(), pool.num_threads()); // compare against the actual thread count, which may be below 2
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunMultiple) { // The same pool must accept several consecutive run() calls.
+	ParallelDispatchThreadPool pool(2);
+
+	std::atomic<int> counter{0};
+	// Run multiple times to verify the pool can be reused
+	for (int i = 0; i < 5; ++i)
+		pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+			counter.fetch_add(1, std::memory_order_relaxed);
+		});
+
+	EXPECT_EQ(counter.load(), pool.num_threads() * 5); // five runs, one callback invocation per thread each
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunCtxThreadNums) { // ctx.thread_num values should cover 0..num_threads()-1.
+	ParallelDispatchThreadPool pool(4);
+
+	std::vector<int> thread_nums(pool.num_threads(), -1); // -1 marks a slot no callback has written
+	pool.run([&thread_nums](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		thread_nums[ctx.thread_num] = ctx.thread_num; // each invocation writes only the slot at its own index
+	});
+
+	// Every thread should have recorded its own thread number
+	for (int i = 0; i < pool.num_threads(); ++i)
+		EXPECT_EQ(thread_nums[i], i);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolItemRange) { // The per-thread item_range() slices should cover each item exactly once.
+	ParallelDispatchThreadPool pool(3);
+
+	const int num_items = 100;
+	std::vector<std::atomic<int>> item_counts(num_items); // one visit counter per item
+	for (std::atomic<int> &c : item_counts)
+		c.store(0);
+
+	pool.run([&item_counts](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i : ctx.item_range(num_items)) // the slice of [0, num_items) handed to this invocation
+			item_counts[i].fetch_add(1);
+	});
+
+	// Each item should have been processed exactly once
+	for (int i = 0; i < num_items; ++i)
+		EXPECT_EQ(item_counts[i].load(), 1);
+}
+
+TEST_F(ThreadingTest, ParallelDispatchThreadPoolSubpool) { // A Subpool should cap the thread count and still dispatch work.
+	ParallelDispatchThreadPool pool(4);
+
+	// Subpool limited to 2 threads
+	ParallelDispatchThreadPool::Subpool subpool(pool, 2);
+	EXPECT_LE(subpool.num_threads(), 2);
+
+	std::atomic<int> counter{0};
+	subpool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) {
+		counter.fetch_add(1, std::memory_order_relaxed);
+	});
+
+	EXPECT_EQ(counter.load(), subpool.num_threads()); // one callback invocation per subpool thread
+}
+
+TEST_F(ThreadingTest, IntRangeIteration) { // Iterating IntRange{3, 7} should yield 3..6, i.e. the end is exclusive.
+	IntRange range{3, 7};
+	std::vector<int> values;
+	for (int i : range)
+		values.push_back(i);
+	EXPECT_THAT(values, testing::ElementsAre(3, 4, 5, 6));
+}
+
+TEST_F(ThreadingTest, IntRangeEmpty) { // A range whose two bounds are equal should produce no iterations.
+	IntRange range{5, 5};
+	for (int _ : range)
+		FAIL(); // reached only if the loop body executes at all
+}
+
+TEST_F(ThreadingTest, ItemRangeForWorker) { // 10 items over 3 workers: expected slices are contiguous and sized 4, 3, 3.
+	EXPECT_EQ(item_range_for_worker(10, 0, 3), (IntRange{0, 4}));
+	EXPECT_EQ(item_range_for_worker(10, 1, 3), (IntRange{4, 7}));
+	EXPECT_EQ(item_range_for_worker(10, 2, 3), (IntRange{7, 10}));
+}
+
+TEST_F(ThreadingTest, ItemRangeForWorkerZeroThreads) { // With a thread count of 0, worker 0 is expected to get all items.
+	EXPECT_EQ(item_range_for_worker(10, 0, 0), (IntRange{0, 10}));
+}
+
+TEST_F(ThreadingTest, ShardedVectorBasic) { // Values inserted from each thread should all be visible when iterating.
+	ParallelDispatchThreadPool pool(2);
+	ShardedVector<int> vec(pool);
+	pool.run([&vec](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		vec.insert(ctx, ctx.thread_num * 10); // thread 0 inserts 0 and 1; thread 1 inserts 10 and 11
+		vec.insert(ctx, ctx.thread_num * 10 + 1);
+	});
+
+	EXPECT_FALSE(vec.empty());
+
+	// Collect the elements in iteration order
+	std::vector<int> elements;
+	for (int v : vec) {
+		elements.push_back(v);
+	}
+
+	if (pool.num_threads() == 2) // the pool may have been limited to a single thread
+		EXPECT_THAT(elements, testing::ElementsAre(0, 1, 10, 11));
+	else
+		EXPECT_THAT(elements, testing::ElementsAre(0, 1));
+}
+
+TEST_F(ThreadingTest, MonotonicFlagBasic) { // A flag should start false, become true on set(), and stay true.
+	MonotonicFlag flag;
+	EXPECT_FALSE(flag.load());
+	flag.set();
+	EXPECT_TRUE(flag.load());
+	flag.set(); // setting an already-set flag should leave it set
+	EXPECT_TRUE(flag.load());
+}
+
+TEST_F(ThreadingTest, MonotonicFlagSetAndReturnOld) { // set_and_return_old() should set the flag and report its prior value.
+	MonotonicFlag flag;
+	EXPECT_FALSE(flag.set_and_return_old()); // the flag was unset before this call
+	EXPECT_TRUE(flag.load());
+	EXPECT_TRUE(flag.set_and_return_old()); // the flag was already set by the first call
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueBasic) { // Elements should come out of pop_front() in the order they were pushed.
+	ConcurrentQueue<int> queue;
+	queue.push_back(1);
+	queue.push_back(2);
+	queue.push_back(3);
+
+	auto v1 = queue.pop_front();
+	auto v2 = queue.pop_front();
+	auto v3 = queue.pop_front();
+
+	ASSERT_TRUE(v1.has_value()); // ASSERT so the dereferences below never touch an empty result
+	ASSERT_TRUE(v2.has_value());
+	ASSERT_TRUE(v3.has_value());
+	EXPECT_EQ(*v1, 1);
+	EXPECT_EQ(*v2, 2);
+	EXPECT_EQ(*v3, 3);
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueTryPopEmpty) { // try_pop_front() on an empty queue should return no value.
+	ConcurrentQueue<int> queue;
+	auto v = queue.try_pop_front();
+	EXPECT_FALSE(v.has_value());
+}
+
+TEST_F(ThreadingTest, ConcurrentQueueClose) { // close() should keep queued elements poppable, then make pop_front() return empty.
+	ConcurrentQueue<int> queue;
+	queue.push_back(42);
+	queue.close();
+
+	// Can still pop existing elements
+	auto v1 = queue.pop_front();
+	ASSERT_TRUE(v1.has_value());
+	EXPECT_EQ(*v1, 42);
+
+	// After close and empty, pop_front returns nullopt
+	auto v2 = queue.pop_front();
+	EXPECT_FALSE(v2.has_value());
+}
+
+TEST_F(ThreadingTest, ThreadPoolCreate) { // Checks ThreadPool construction with pool sizes 0 and 1.
+	// pool_size of 0 means no worker threads
+	ThreadPool pool0(0, [](int) {});
+	EXPECT_EQ(pool0.num_threads(), 0);
+
+	// pool_size of 1 means 1 worker thread
+	std::atomic<int> counter{0};
+	{ // inner scope: pool1 is destroyed before the counter is inspected
+		ThreadPool pool1(1, [&counter](int thread_num) {
+			EXPECT_EQ(thread_num, 0);
+			counter.fetch_add(1);
+		});
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	EXPECT_EQ(counter.load(), 1); // with threads enabled the body is expected to have run once
+#else
+	EXPECT_EQ(counter.load(), 0); // without thread support the body is expected not to run
+#endif
+}
+
+TEST_F(ThreadingTest, ThreadPoolMultipleThreads) { // A pool of size 2 should run the body on at most two threads.
+	std::atomic<int> counter{0};
+	{ // inner scope: the pool is destroyed before the counter is inspected
+		ThreadPool pool(2, [&counter](int) {
+			counter.fetch_add(1);
+		});
+		EXPECT_LE(pool.num_threads(), 2);
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	EXPECT_GE(counter.load(), 1); // one or two invocations, depending on how many threads were granted
+	EXPECT_LE(counter.load(), 2);
+#else
+	EXPECT_EQ(counter.load(), 0); // without thread support the body is expected not to run
+#endif
+}
+
+// Helper types for the ShardedHashSet tests: an int wrapper used as the set's value type
+struct IntValue {
+	using Accumulated = IntValue; // NOTE(review): presumably a member type ShardedHashSet requires - confirm in kernel/threading.h
+	int value;
+	operator int() const { return value; } // implicit conversion lets tests compare an IntValue with a plain int
+};
+
+struct IntValueEquality { // Equality functor passed to ShardedHashSet alongside IntValue.
+	bool operator()(int a, int b) const { return a == b; } // IntValue arguments reach this through operator int()
+};
+
+TEST_F(ThreadingTest, ShardedHashSetBasic) { // Insert, process, build and look up on a single-thread pool.
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashSet<IntValue, IntValueEquality>;
+	HashSet::Builder builder(pool);
+
+	// Insert some values (NOTE(review): the second member of each entry is presumably its hash - confirm)
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.insert(ctx, {{10}, 10});
+		builder.insert(ctx, {{20}, 20});
+		builder.insert(ctx, {{30}, 30});
+	});
+
+	// Process the inserted entries in a second run before building the set
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	// Build the set, then look up two inserted values and one that was never inserted
+	HashSet set(builder);
+	const IntValue *found10 = set.find({{10}, 10});
+	const IntValue *found20 = set.find({{20}, 20});
+	const IntValue *found99 = set.find({{99}, 99});
+
+	ASSERT_NE(found10, nullptr); // ASSERT so the dereferences below are safe
+	ASSERT_NE(found20, nullptr);
+	EXPECT_EQ(found99, nullptr);
+	EXPECT_EQ(*found10, 10);
+	EXPECT_EQ(*found20, 20);
+}
+
+TEST_F(ThreadingTest, ShardedHashSetParallelInsert) { // Values inserted from several threads should all be findable.
+	ParallelDispatchThreadPool pool(3);
+
+	using HashSet = ShardedHashSet<IntValue, IntValueEquality>;
+	HashSet::Builder builder(pool);
+
+	// Insert values from multiple threads
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		for (int i = 0; i < 10; ++i) {
+			int val = ctx.thread_num * 100 + i; // distinct per thread: 0..9, 100..109, 200..209
+			builder.insert(ctx, {{val}, static_cast<unsigned>(val)});
+		}
+	});
+
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+
+	// Verify all values can be found
+	for (int t = 0; t < pool.num_threads(); ++t) { // only the threads the pool actually has inserted values
+		for (int i = 0; i < 10; ++i) {
+			int val = t * 100 + i;
+			const IntValue *found = set.find({{val}, static_cast<unsigned>(val)});
+			ASSERT_NE(found, nullptr) << "Value " << val << " not found";
+			EXPECT_EQ(*found, val);
+		}
+	}
+}
+
+// Helper types for the ShardedHashSet collision test: a key/value pair stored as the set's value type
+struct IntDictValue {
+	using Accumulated = IntDictValue; // NOTE(review): presumably a member type ShardedHashSet requires - confirm in kernel/threading.h
+	int key;
+	int value;
+	bool operator==(const IntDictValue &other) const { return key == other.key && value == other.value; } // full comparison, used by the test's EXPECT_EQ
+	bool operator!=(const IntDictValue &other) const { return !(*this == other); }
+};
+
+struct IntDictKeyEquality { // Equality functor that compares entries by key only, ignoring value.
+	bool operator()(const IntDictValue &a, const IntDictValue &b) const { return a.key == b.key; }
+};
+
+// Collision handler that sums values: adds the incoming entry's value into the existing entry
+struct SumCollisionHandler {
+	void operator()(IntDictValue &existing, IntDictValue &incoming) const {
+		existing.value += incoming.value; // the key is left untouched
+	}
+};
+
+TEST_F(ThreadingTest, ShardedHashSetCollision) { // Entries with equal keys should be merged by the collision handler.
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashSet<IntDictValue, IntDictKeyEquality, SumCollisionHandler>;
+	HashSet::Builder builder(pool);
+
+	// Insert duplicate keys with same hash - duplicates should collapse
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.insert(ctx, {{5, 10}, 5});
+		builder.insert(ctx, {{5, 12}, 5});  // Duplicate key/hash
+		builder.insert(ctx, {{5, 14}, 5});  // Another duplicate
+	});
+
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+	const IntDictValue *found = set.find({{5, 0}, 5}); // lookup compares keys only, so the probe's value 0 is irrelevant
+	ASSERT_NE(found, nullptr);
+	// With SumCollisionHandler, the values of the duplicates are summed: 10 + 12 + 14 = 36
+	EXPECT_EQ(*found, (IntDictValue{5, 36}));
+}
+
+TEST_F(ThreadingTest, ShardedHashSetEmpty) { // A set built from an empty builder should find nothing.
+	ParallelDispatchThreadPool pool(1);
+
+	using HashSet = ShardedHashSet<IntValue, IntValueEquality>;
+	HashSet::Builder builder(pool);
+
+	// Don't insert anything, just process
+	pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		builder.process(ctx);
+	});
+
+	HashSet set(builder);
+	const IntValue *found = set.find({{42}, 42}); // 42 was never inserted
+	EXPECT_EQ(found, nullptr);
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueSingleThread) { // Push and pop on a one-thread work queue without a thread pool.
+	ConcurrentWorkQueue<int> queue(1, 10);  // 1 thread, batch size 10
+	EXPECT_EQ(queue.num_threads(), 1);
+
+	ThreadIndex thread{0}; // identifies the calling thread to push() and pop_batch()
+
+	// Push some items (less than batch size)
+	for (int i = 0; i < 5; ++i)
+		queue.push(thread, i);
+
+	// Pop should return those items (order within the batch is not checked)
+	std::vector<int> batch = queue.pop_batch(thread);
+	EXPECT_THAT(batch, testing::UnorderedElementsAre(0, 1, 2, 3, 4));
+
+	// Next pop should return empty (all threads "waiting")
+	std::vector<int> empty_batch = queue.pop_batch(thread);
+	EXPECT_TRUE(empty_batch.empty());
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueBatching) { // Pushing more items than the batch size must not lose any of them.
+	ConcurrentWorkQueue<int> queue(1, 3);  // batch size 3
+	ThreadIndex thread{0};
+
+	queue.push(thread, 10); // five pushes in total, exceeding the batch size of 3
+	queue.push(thread, 20);
+	queue.push(thread, 30);
+	queue.push(thread, 40);
+	queue.push(thread, 50);
+
+	std::vector<int> popped;
+	while (true) { // drain the queue; an empty batch ends the loop
+		std::vector<int> batch = queue.pop_batch(thread);
+		if (batch.empty())
+			break;
+		popped.insert(popped.end(), batch.begin(), batch.end());
+	}
+	EXPECT_THAT(popped, testing::UnorderedElementsAre(10, 20, 30, 40, 50));
+}
+
+TEST_F(ThreadingTest, ConcurrentWorkQueueParallel) { // Two threads push and drain one shared queue; every item is counted once.
+	ParallelDispatchThreadPool pool(2);
+	if (pool.num_threads() < 2) {
+		// Skip test if we don't have multiple threads
+		return;
+	}
+
+	ConcurrentWorkQueue<int> queue(2, 3); // 2 threads, batch size 3
+	std::atomic<int> sum{0}; // running total of every popped item, across both threads
+
+	pool.run([&queue, &sum](const ParallelDispatchThreadPool::RunCtx &ctx) {
+		// Each thread pushes some work
+		for (int i = 0; i < 10; ++i)
+			queue.push(ctx, ctx.thread_num * 100 + i);
+
+		// Each thread processes work until done
+		while (true) {
+			std::vector<int> batch = queue.pop_batch(ctx);
+			if (batch.empty())
+				break;
+			for (int v : batch)
+				sum.fetch_add(v);
+		}
+	});
+
+	// Thread 0 pushes: 0+1+2+...+9 = 45
+	// Thread 1 pushes: 100+101+...+109 = 1045
+	// Total = 45 + 1045 = 1090
+	EXPECT_EQ(sum.load(), 1090);
+}
+
+YOSYS_NAMESPACE_END