diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml index 60fe481e7..fd25ae68e 100644 --- a/.github/actions/setup-build-env/action.yml +++ b/.github/actions/setup-build-env/action.yml @@ -58,7 +58,7 @@ runs: if: runner.os == 'Linux' && inputs.get-test-deps == 'true' uses: awalsh128/cache-apt-pkgs-action@v1.6.0 with: - packages: libgtest-dev + packages: libgtest-dev libgmock-dev version: ${{ inputs.runs-on }}-testys - name: Install macOS Dependencies diff --git a/frontends/rtlil/rtlil_frontend.cc b/frontends/rtlil/rtlil_frontend.cc index a1412d983..7e2ec5460 100644 --- a/frontends/rtlil/rtlil_frontend.cc +++ b/frontends/rtlil/rtlil_frontend.cc @@ -286,6 +286,7 @@ struct RTLILFrontendWorker { if (width > MAX_CONST_WIDTH) error("Constant width %lld out of range before `%s`.", width, error_token()); bits.reserve(width); + int start_idx = idx; while (true) { RTLIL::State bit; switch (line[idx]) { @@ -300,8 +301,9 @@ struct RTLILFrontendWorker { bits.push_back(bit); ++idx; } - done: - std::reverse(bits.begin(), bits.end()); + done: + if (start_idx < idx) + std::reverse(bits.begin(), bits.end()); if (GetSize(bits) > width) bits.resize(width); diff --git a/kernel/ffinit.h b/kernel/ffinit.h index 920fba307..8b4758f60 100644 --- a/kernel/ffinit.h +++ b/kernel/ffinit.h @@ -22,6 +22,7 @@ #include "kernel/yosys.h" #include "kernel/sigtools.h" +#include "kernel/threading.h" YOSYS_NAMESPACE_BEGIN @@ -35,34 +36,55 @@ struct FfInitVals sigmap = sigmap_; initbits.clear(); for (auto wire : module->wires()) + if (wire->attributes.count(ID::init)) + process_wire(wire); + } + + void process_wire(RTLIL::Wire *wire) + { + SigSpec wirebits = (*sigmap)(wire); + Const initval = wire->attributes.at(ID::init); + + for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++) { - if (wire->attributes.count(ID::init) == 0) + SigBit bit = wirebits[i]; + State val = initval[i]; + + if (val != State::S0 && val != State::S1 && bit.wire != nullptr) continue; - SigSpec wirebits = (*sigmap)(wire); - Const initval = wire->attributes.at(ID::init); - - for (int i = 0; i < GetSize(wirebits) && i < GetSize(initval); i++) - { - SigBit bit = wirebits[i]; - State val = initval[i]; - - if (val != State::S0 && val != State::S1 && bit.wire != nullptr) - continue; - - if (initbits.count(bit)) { - if (initbits.at(bit).first != val) - log_error("Conflicting init values for signal %s (%s = %s != %s).\n", - log_signal(bit), log_signal(SigBit(wire, i)), - log_signal(val), log_signal(initbits.at(bit).first)); - continue; - } - - initbits[bit] = std::make_pair(val,SigBit(wire,i)); + if (initbits.count(bit)) { + if (initbits.at(bit).first != val) + log_error("Conflicting init values for signal %s (%s = %s != %s).\n", + log_signal(bit), log_signal(SigBit(wire, i)), + log_signal(val), log_signal(initbits.at(bit).first)); + continue; } + + initbits[bit] = std::make_pair(val,SigBit(wire,i)); } } + void set_parallel(const SigMapView *sigmap_, ParallelDispatchThreadPool &thread_pool, RTLIL::Module *module) + { + sigmap = sigmap_; + initbits.clear(); + + const RTLIL::Module *const_module = module; + ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->wires_size(), 1000)); + ShardedVector init_wires(subpool); + subpool.run([const_module, &init_wires](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->wires_size())) { + RTLIL::Wire *wire = const_module->wire_at(i); + if (wire->attributes.count(ID::init)) + init_wires.insert(ctx, wire); + } + }); + + for (RTLIL::Wire *wire : init_wires) + process_wire(wire); + } + RTLIL::State operator()(RTLIL::SigBit bit) const { auto it = initbits.find((*sigmap)(bit)); diff --git a/kernel/log.cc b/kernel/log.cc index 018a19081..b114f1eaf 100644 --- a/kernel/log.cc +++ b/kernel/log.cc @@ -324,6 +324,14 @@ void log_formatted_file_info(std::string_view filename, int lineno, std::string log("%s:%d: Info: %s", filename, lineno, str); } +void log_suppressed() { + if (log_debug_suppressed && !log_make_debug) { + constexpr const char* format = "\n"; + logv_string(format, stringf(format, log_debug_suppressed)); + log_debug_suppressed = 0; + } +} + [[noreturn]] static void log_error_with_prefix(std::string_view prefix, std::string str) { @@ -345,7 +353,9 @@ static void log_error_with_prefix(std::string_view prefix, std::string str) } log_last_error = std::move(str); - log("%s%s", prefix, log_last_error); + std::string message(prefix); + message += log_last_error; + logv_string("%s%s", message); log_flush(); log_make_debug = bak_log_make_debug; @@ -355,7 +365,7 @@ static void log_error_with_prefix(std::string_view prefix, std::string str) item.current_count++; for (auto &[_, item] : log_expect_prefix_error) - if (std::regex_search(string(prefix) + string(log_last_error), item.pattern)) + if (std::regex_search(message, item.pattern)) item.current_count++; log_check_expected(); diff --git a/kernel/log.h b/kernel/log.h index 63faf7091..d132ba1a0 100644 --- a/kernel/log.h +++ b/kernel/log.h @@ -206,12 +206,7 @@ template log_formatted_cmd_error(fmt.format(args...)); } -static inline void log_suppressed() { - if (log_debug_suppressed && !log_make_debug) { - log("\n", log_debug_suppressed); - log_debug_suppressed = 0; - } -} +void log_suppressed(); struct LogMakeDebugHdl { bool status = false; diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc index eef1c319d..54696e000 100644 --- a/kernel/rtlil.cc +++ b/kernel/rtlil.cc @@ -22,6 +22,7 @@ #include "kernel/celltypes.h" #include "kernel/binding.h" #include "kernel/sigtools.h" +#include "kernel/threading.h" #include "frontends/verilog/verilog_frontend.h" #include "frontends/verilog/preproc.h" #include "backends/rtlil/rtlil_backend.h" @@ -142,9 +143,17 @@ static constexpr bool check_well_known_id_order() // and in sorted ascii order, as required by the ID macro. static_assert(check_well_known_id_order()); +constexpr int STATIC_ID_END = static_cast(RTLIL::StaticId::STATIC_ID_END); + struct IdStringCollector { + IdStringCollector(std::vector &live_ids) + : live_ids(live_ids) {} + void trace(IdString id) { - live.insert(id.index_); + if (id.index_ >= STATIC_ID_END) + live_ids[id.index_ - STATIC_ID_END].set(); + else if (id.index_ < 0) + live_autoidx_ids.push_back(id.index_); } template void trace(const T* v) { trace(*v); @@ -178,10 +187,6 @@ struct IdStringCollector { trace(element); } - void trace(const RTLIL::Design &design) { - trace_values(design.modules_); - trace(design.selection_vars); - } void trace(const RTLIL::Selection &selection_var) { trace(selection_var.selected_modules); trace(selection_var.selected_members); @@ -190,15 +195,6 @@ struct IdStringCollector { trace_keys(named.attributes); trace(named.name); } - void trace(const RTLIL::Module &module) { - trace_named(module); - trace_values(module.wires_); - trace_values(module.cells_); - trace(module.avail_parameters); - trace_keys(module.parameter_default_values); - trace_values(module.memories); - trace_values(module.processes); - } void trace(const RTLIL::Wire &wire) { trace_named(wire); if (wire.known_driver()) @@ -234,7 +230,8 @@ struct IdStringCollector { trace(action.memid); } - std::unordered_set live; + std::vector &live_ids; + std::vector live_autoidx_ids; }; int64_t RTLIL::OwningIdString::gc_ns; @@ -243,20 +240,55 @@ int RTLIL::OwningIdString::gc_count; void RTLIL::OwningIdString::collect_garbage() { int64_t start = PerformanceTimer::query(); - IdStringCollector collector; - for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) { - collector.trace(*design); - } - int size = GetSize(global_id_storage_); - for (int i = static_cast(StaticId::STATIC_ID_END); i < size; ++i) { - RTLIL::IdString::Storage &storage = global_id_storage_.at(i); - if (storage.buf == nullptr) - continue; - if (collector.live.find(i) != collector.live.end()) - continue; - if (global_refcount_storage_.find(i) != global_refcount_storage_.end()) - continue; + int pool_size = 0; + for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) + for (RTLIL::Module *module : design->modules()) + pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, module->cells_size(), 1000)); + ParallelDispatchThreadPool thread_pool(pool_size); + + int size = GetSize(global_id_storage_); + std::vector live_ids(size - STATIC_ID_END); + std::vector collectors; + int num_threads = thread_pool.num_threads(); + collectors.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) + collectors.emplace_back(live_ids); + + for (auto &[idx, design] : *RTLIL::Design::get_all_designs()) { + for (RTLIL::Module *module : design->modules()) { + collectors[0].trace_named(*module); + ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000)); + subpool.run([&collectors, module](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(module->cells_size())) + collectors[ctx.thread_num].trace(module->cell_at(i)); + for (int i : ctx.item_range(module->wires_size())) + collectors[ctx.thread_num].trace(module->wire_at(i)); + }); + collectors[0].trace(module->avail_parameters); + collectors[0].trace_keys(module->parameter_default_values); + collectors[0].trace_values(module->memories); + collectors[0].trace_values(module->processes); + } + collectors[0].trace(design->selection_vars); + } + + ShardedVector free_ids(thread_pool); + thread_pool.run([&live_ids, size, &free_ids](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(size - STATIC_ID_END)) { + int index = i + STATIC_ID_END; + RTLIL::IdString::Storage &storage = global_id_storage_.at(index); + if (storage.buf == nullptr) + continue; + if (live_ids[i].load()) + continue; + if (global_refcount_storage_.find(index) != global_refcount_storage_.end()) + continue; + free_ids.insert(ctx, index); + } + }); + for (int i : free_ids) { + RTLIL::IdString::Storage &storage = global_id_storage_.at(i); if (yosys_xtrace) { log("#X# Removed IdString '%s' with index %d.\n", storage.buf, i); log_backtrace("-X- ", yosys_xtrace-1); @@ -268,8 +300,13 @@ void RTLIL::OwningIdString::collect_garbage() global_free_idx_list_.push_back(i); } + std::unordered_set live_autoidx_ids; + for (IdStringCollector &collector : collectors) + for (int id : collector.live_autoidx_ids) + live_autoidx_ids.insert(id); + for (auto it = global_autoidx_id_storage_.begin(); it != global_autoidx_id_storage_.end();) { - if (collector.live.find(it->first) != collector.live.end()) { + if (live_autoidx_ids.find(it->first) != live_autoidx_ids.end()) { ++it; continue; } @@ -1466,15 +1503,21 @@ void RTLIL::Design::sort_modules() modules_.sort(sort_by_id_str()); } +void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool); + void RTLIL::Design::check() { #ifndef NDEBUG log_assert(!selection_stack.empty()); + int pool_size = 0; + for (auto &it : modules_) + pool_size = std::max(pool_size, ThreadPool::work_pool_size(0, it.second->cells_size(), 1000)); + ParallelDispatchThreadPool thread_pool(pool_size); for (auto &it : modules_) { log_assert(this == it.second->design); log_assert(it.first == it.second->name); log_assert(!it.first.empty()); - it.second->check(); + check_module(it.second, thread_pool); } #endif } @@ -1710,11 +1753,11 @@ size_t RTLIL::Module::count_id(RTLIL::IdString id) namespace { struct InternalCellChecker { - RTLIL::Module *module; + const RTLIL::Module *module; RTLIL::Cell *cell; pool expected_params, expected_ports; - InternalCellChecker(RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { } + InternalCellChecker(const RTLIL::Module *module, RTLIL::Cell *cell) : module(module), cell(cell) { } void error(int linenr) { @@ -2690,88 +2733,96 @@ void RTLIL::Module::sort() it.second->attributes.sort(sort_by_id_str()); } -void RTLIL::Module::check() +void check_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool) { #ifndef NDEBUG - std::vector ports_declared; - for (auto &it : wires_) { - log_assert(this == it.second->module); - log_assert(it.first == it.second->name); - log_assert(!it.first.empty()); - log_assert(it.second->width >= 0); - log_assert(it.second->port_id >= 0); - for (auto &it2 : it.second->attributes) - log_assert(!it2.first.empty()); - if (it.second->port_id) { - log_assert(GetSize(ports) >= it.second->port_id); - log_assert(ports.at(it.second->port_id-1) == it.first); - log_assert(it.second->port_input || it.second->port_output); - if (GetSize(ports_declared) < it.second->port_id) - ports_declared.resize(it.second->port_id); - log_assert(ports_declared[it.second->port_id-1] == false); - ports_declared[it.second->port_id-1] = true; - } else - log_assert(!it.second->port_input && !it.second->port_output); - } - for (auto port_declared : ports_declared) - log_assert(port_declared == true); - log_assert(GetSize(ports) == GetSize(ports_declared)); + ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000)); + const RTLIL::Module *const_module = module; - for (auto &it : memories) { + pool memory_strings; + for (auto &it : module->memories) { log_assert(it.first == it.second->name); log_assert(!it.first.empty()); log_assert(it.second->width >= 0); log_assert(it.second->size >= 0); for (auto &it2 : it.second->attributes) log_assert(!it2.first.empty()); + memory_strings.insert(it.second->name.str()); } - pool packed_memids; + std::vector ports_declared(GetSize(module->ports)); + ShardedVector memids(subpool); + subpool.run([const_module, &ports_declared, &memory_strings, &memids](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->cells_size())) { + auto it = *const_module->cells_.element(i); + log_assert(const_module == it.second->module); + log_assert(it.first == it.second->name); + log_assert(!it.first.empty()); + log_assert(!it.second->type.empty()); + for (auto &it2 : it.second->connections()) { + log_assert(!it2.first.empty()); + it2.second.check(const_module); + } + for (auto &it2 : it.second->attributes) + log_assert(!it2.first.empty()); + for (auto &it2 : it.second->parameters) + log_assert(!it2.first.empty()); + InternalCellChecker checker(const_module, it.second); + checker.check(); + if (it.second->has_memid()) { + log_assert(memory_strings.count(it.second->parameters.at(ID::MEMID).decode_string())); + } else if (it.second->is_mem_cell()) { + std::string memid = it.second->parameters.at(ID::MEMID).decode_string(); + log_assert(!memory_strings.count(memid)); + memids.insert(ctx, std::move(memid)); + } + auto cell_mod = const_module->design->module(it.first); + if (cell_mod != nullptr) { + // assertion check below to make sure that there are no + // cases where a cell has a blackbox attribute since + // that is deprecated + #ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" + #endif + log_assert(!it.second->get_blackbox_attribute()); + #ifdef __GNUC__ + #pragma GCC diagnostic pop + #endif + } + } - for (auto &it : cells_) { - log_assert(this == it.second->module); - log_assert(it.first == it.second->name); - log_assert(!it.first.empty()); - log_assert(!it.second->type.empty()); - for (auto &it2 : it.second->connections()) { - log_assert(!it2.first.empty()); - it2.second.check(this); + for (int i : ctx.item_range(const_module->wires_size())) { + auto it = *const_module->wires_.element(i); + log_assert(const_module == it.second->module); + log_assert(it.first == it.second->name); + log_assert(!it.first.empty()); + log_assert(it.second->width >= 0); + log_assert(it.second->port_id >= 0); + for (auto &it2 : it.second->attributes) + log_assert(!it2.first.empty()); + if (it.second->port_id) { + log_assert(GetSize(const_module->ports) >= it.second->port_id); + log_assert(const_module->ports.at(it.second->port_id-1) == it.first); + log_assert(it.second->port_input || it.second->port_output); + log_assert(it.second->port_id <= GetSize(ports_declared)); + bool previously_declared = ports_declared[it.second->port_id-1].set_and_return_old(); + log_assert(previously_declared == false); + } else + log_assert(!it.second->port_input && !it.second->port_output); } - for (auto &it2 : it.second->attributes) - log_assert(!it2.first.empty()); - for (auto &it2 : it.second->parameters) - log_assert(!it2.first.empty()); - InternalCellChecker checker(this, it.second); - checker.check(); - if (it.second->has_memid()) { - log_assert(memories.count(it.second->parameters.at(ID::MEMID).decode_string())); - } else if (it.second->is_mem_cell()) { - IdString memid = it.second->parameters.at(ID::MEMID).decode_string(); - log_assert(!memories.count(memid)); - log_assert(!packed_memids.count(memid)); - packed_memids.insert(memid); - } - auto cell_mod = design->module(it.first); - if (cell_mod != nullptr) { - // assertion check below to make sure that there are no - // cases where a cell has a blackbox attribute since - // that is deprecated - #ifdef __GNUC__ - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wdeprecated-declarations" - #endif - log_assert(!it.second->get_blackbox_attribute()); - #ifdef __GNUC__ - #pragma GCC diagnostic pop - #endif - } - } + }); + for (const MonotonicFlag &port_declared : ports_declared) + log_assert(port_declared.load() == true); + pool memids_pool; + for (std::string &memid : memids) + log_assert(memids_pool.insert(memid).second); - for (auto &it : processes) { + for (auto &it : module->processes) { log_assert(it.first == it.second->name); log_assert(!it.first.empty()); log_assert(it.second->root_case.compare.empty()); - std::vector all_cases = {&it.second->root_case}; + std::vector all_cases = {&it.second->root_case}; for (size_t i = 0; i < all_cases.size(); i++) { for (auto &switch_it : all_cases[i]->switches) { for (auto &case_it : switch_it->cases) { @@ -2784,34 +2835,41 @@ void RTLIL::Module::check() } for (auto &sync_it : it.second->syncs) { switch (sync_it->type) { - case SyncType::ST0: - case SyncType::ST1: - case SyncType::STp: - case SyncType::STn: - case SyncType::STe: + case RTLIL::SyncType::ST0: + case RTLIL::SyncType::ST1: + case RTLIL::SyncType::STp: + case RTLIL::SyncType::STn: + case RTLIL::SyncType::STe: log_assert(!sync_it->signal.empty()); break; - case SyncType::STa: - case SyncType::STg: - case SyncType::STi: + case RTLIL::SyncType::STa: + case RTLIL::SyncType::STg: + case RTLIL::SyncType::STi: log_assert(sync_it->signal.empty()); break; } } } - for (auto &it : connections_) { + for (auto &it : module->connections_) { log_assert(it.first.size() == it.second.size()); log_assert(!it.first.has_const()); - it.first.check(this); - it.second.check(this); + it.first.check(module); + it.second.check(module); } - for (auto &it : attributes) + for (auto &it : module->attributes) log_assert(!it.first.empty()); #endif } +void RTLIL::Module::check() +{ + int pool_size = ThreadPool::work_pool_size(0, cells_size(), 1000); + ParallelDispatchThreadPool thread_pool(pool_size); + check_module(this, thread_pool); +} + void RTLIL::Module::optimize() { } @@ -5470,7 +5528,7 @@ RTLIL::SigSpec RTLIL::SigSpec::repeat(int num) const } #ifndef NDEBUG -void RTLIL::SigSpec::check(Module *mod) const +void RTLIL::SigSpec::check(const Module *mod) const { if (rep_ == CHUNK) { diff --git a/kernel/rtlil.h b/kernel/rtlil.h index fea53081e..9ecee8942 100644 --- a/kernel/rtlil.h +++ b/kernel/rtlil.h @@ -275,6 +275,17 @@ struct RTLIL::IdString *out += std::to_string(-index_); } + std::string unescape() const { + if (index_ < 0) { + // Must start with "$auto$" so no unescaping required. + return str(); + } + std::string_view str = global_id_storage_.at(index_).str_view(); + if (str.size() < 2 || str[0] != '\\' || str[1] == '$' || str[1] == '\\' || (str[1] >= '0' && str[1] <= '9')) + return std::string(str); + return std::string(str.substr(1)); + } + class Substrings { std::string_view first_; int suffix_number; @@ -758,7 +769,7 @@ namespace RTLIL { } static inline std::string unescape_id(RTLIL::IdString str) { - return unescape_id(str.str()); + return str.unescape(); } static inline const char *id2cstr(RTLIL::IdString str) { @@ -1748,9 +1759,9 @@ public: } #ifndef NDEBUG - void check(Module *mod = nullptr) const; + void check(const Module *mod = nullptr) const; #else - void check(Module *mod = nullptr) const { (void)mod; } + void check(const Module *mod = nullptr) const { (void)mod; } #endif }; diff --git a/kernel/threading.cc b/kernel/threading.cc index dcc044c89..3766c4ddf 100644 --- a/kernel/threading.cc +++ b/kernel/threading.cc @@ -17,6 +17,20 @@ static int get_max_threads() return max_threads; } +static int init_work_units_per_thread_override() +{ + const char *v = getenv("YOSYS_WORK_UNITS_PER_THREAD"); + if (v == nullptr) + return 0; + return atoi(v); +} + +static int get_work_units_per_thread_override() +{ + static int work_units_per_thread = init_work_units_per_thread_override(); + return work_units_per_thread; +} + void DeferredLogs::flush() { for (auto &m : logs) @@ -37,6 +51,14 @@ int ThreadPool::pool_size(int reserved_cores, int max_worker_threads) #endif } +int ThreadPool::work_pool_size(int reserved_cores, int work_units, int work_units_per_thread) +{ + int work_units_per_thread_override = get_work_units_per_thread_override(); + if (work_units_per_thread_override > 0) + work_units_per_thread = work_units_per_thread_override; + return pool_size(reserved_cores, work_units / work_units_per_thread); +} + ThreadPool::ThreadPool(int pool_size, std::function b) : body(std::move(b)) { @@ -57,4 +79,72 @@ ThreadPool::~ThreadPool() #endif } +IntRange item_range_for_worker(int num_items, int thread_num, int num_threads) +{ + if (num_threads <= 1) { + return {0, num_items}; + } + int items_per_thread = num_items / num_threads; + int extra_items = num_items % num_threads; + // The first `extra_items` threads get one extra item. + int start = thread_num * items_per_thread + std::min(thread_num, extra_items); + int end = (thread_num + 1) * items_per_thread + std::min(thread_num + 1, extra_items); + return {start, end}; +} + +ParallelDispatchThreadPool::ParallelDispatchThreadPool(int pool_size) + : num_worker_threads_(std::max(1, pool_size) - 1) +{ +#ifdef YOSYS_ENABLE_THREADS + main_to_workers_signal.resize(num_worker_threads_, 0); +#endif + // Don't start the threads until we've constructed all our data members. + thread_pool = std::make_unique(num_worker_threads_, [this](int thread_num){ + run_worker(thread_num); + }); +} + +ParallelDispatchThreadPool::~ParallelDispatchThreadPool() +{ +#ifdef YOSYS_ENABLE_THREADS + if (num_worker_threads_ == 0) + return; + current_work = nullptr; + num_active_worker_threads_ = num_worker_threads_; + signal_workers_start(); + wait_for_workers_done(); +#endif +} + +void ParallelDispatchThreadPool::run(std::function work, int max_threads) +{ + Multithreading multithreading; + num_active_worker_threads_ = num_threads(max_threads) - 1; + if (num_active_worker_threads_ == 0) { + work({{0}, 1}); + return; + } +#ifdef YOSYS_ENABLE_THREADS + current_work = &work; + signal_workers_start(); + work({{0}, num_active_worker_threads_ + 1}); + wait_for_workers_done(); +#endif +} + +void ParallelDispatchThreadPool::run_worker(int thread_num) +{ +#ifdef YOSYS_ENABLE_THREADS + while (true) + { + worker_wait_for_start(thread_num); + if (current_work == nullptr) + break; + (*current_work)({{thread_num + 1}, num_active_worker_threads_ + 1}); + signal_worker_done(); + } + signal_worker_done(); +#endif +} + YOSYS_NAMESPACE_END diff --git a/kernel/threading.h b/kernel/threading.h index b8cd62f87..3d6495720 100644 --- a/kernel/threading.h +++ b/kernel/threading.h @@ -131,6 +131,11 @@ public: // The result may be 0. static int pool_size(int reserved_cores, int max_worker_threads); + // Computes the number of worker threads to use, by dividing work_units among threads. + // For testing purposes you can set YOSYS_WORK_UNITS_PER_THREAD to override `work_units_per_thread`. + // The result may be 0. + static int work_pool_size(int reserved_cores, int work_units, int work_units_per_thread); + // Create a pool of threads running the given closure (parameterized by thread number). // `pool_size` must be the result of a `pool_size()` call. ThreadPool(int pool_size, std::function b); @@ -154,6 +159,140 @@ private: #endif }; +// A range of integers [start_, end_) that can be iterated over with a +// C++ range-based for loop. +struct IntRange { + int start_; + int end_; + struct Int { + int v; + int operator*() const { return v; } + Int &operator++() { ++v; return *this; } + bool operator!=(const Int &other) const { return v != other.v; } + }; + Int begin() const { return {start_}; } + Int end() const { return {end_}; } + + bool operator==(const IntRange &other) const { return start_ == other.start_ && end_ == other.end_; } + bool operator!=(const IntRange &other) const { return !(*this == other); } +}; +// Divides some number of items into `num_threads` subranges and returns the +// `thread_num`'th subrange. If `num_threads` is zero, returns the whole range. +IntRange item_range_for_worker(int num_items, int thread_num, int num_threads); + +// A type that encapsulates the index of a thread in some list of threads. Useful for +// stronger typechecking and code readability. +struct ThreadIndex { + int thread_num; +}; + +// A set of threads with a `run()` API that runs a closure on all of the threads +// and wait for all those closures to complete. This is a convenient way to implement +// parallel algorithms that use barrier synchronization. +class ParallelDispatchThreadPool +{ +public: + // Create a pool of threads running the given closure (parameterized by thread number). + // `pool_size` must be the result of a `pool_size()` call. + // `pool_size` can be zero, which we treat as 1. + ParallelDispatchThreadPool(int pool_size); + ~ParallelDispatchThreadPool(); + + // For each thread running a closure, a `RunCtx` is passed to the closure. Currently + // it contains the thread index and the total number of threads. It can be passed + // directly to any APIs requiring a `ThreadIndex`. + struct RunCtx : public ThreadIndex { + int num_threads; + IntRange item_range(int num_items) const { + return item_range_for_worker(num_items, thread_num, num_threads); + } + }; + // Sometimes we only want to activate a subset of the threads in the pool. This + // class provides a way to do that. It provides the same `num_threads()` + // and `run()` APIs as a `ParallelDispatchThreadPool`. + class Subpool { + public: + Subpool(ParallelDispatchThreadPool &parent, int max_threads) + : parent(parent), max_threads(max_threads) {} + // Returns the number of threads that will be used when calling `run()`. + int num_threads() const { + return parent.num_threads(max_threads); + } + void run(std::function work) { + parent.run(std::move(work), max_threads); + } + ParallelDispatchThreadPool &thread_pool() { return parent; } + private: + ParallelDispatchThreadPool &parent; + int max_threads; + }; + + // Run the `work` function in parallel on each thread in the pool (parameterized by + // thread number). Waits for all work functions to complete. Only one `run()` can be + // active at a time. + // Uses no more than `max_threads` threads (but at least one). + void run(std::function work) { + run(std::move(work), INT_MAX); + } + + // Returns the number of threads that will be used when calling `run()`. + int num_threads() const { + return num_threads(INT_MAX); + } +private: + friend class Subpool; + + void run(std::function work, int max_threads); + int num_threads(int max_threads) const { + return std::min(num_worker_threads_ + 1, std::max(1, max_threads)); + } + void run_worker(int thread_num); + + std::unique_ptr thread_pool; + std::function *current_work = nullptr; + // Keeps a correct count even when threads are exiting. + int num_worker_threads_; + // The count of active workerthreads for the current `run()`. + int num_active_worker_threads_ = 0; + +#ifdef YOSYS_ENABLE_THREADS + // Not especially efficient for large numbers of threads. Worker wakeup could scale + // better by conceptually organising workers into a tree and having workers wake + // up their children. + std::mutex main_to_workers_signal_mutex; + std::condition_variable main_to_workers_signal_cv; + std::vector main_to_workers_signal; + void signal_workers_start() { + std::unique_lock lock(main_to_workers_signal_mutex); + std::fill(main_to_workers_signal.begin(), main_to_workers_signal.begin() + num_active_worker_threads_, 1); + // When `num_active_worker_threads_` is small compared to `num_worker_threads_`, we have a "thundering herd" + // problem here. Fixing that would add complexity so don't worry about it for now. + main_to_workers_signal_cv.notify_all(); + } + void worker_wait_for_start(int thread_num) { + std::unique_lock lock(main_to_workers_signal_mutex); + main_to_workers_signal_cv.wait(lock, [this, thread_num] { return main_to_workers_signal[thread_num] > 0; }); + main_to_workers_signal[thread_num] = 0; + } + + std::atomic done_workers = 0; + std::mutex workers_to_main_signal_mutex; + std::condition_variable workers_to_main_signal_cv; + void signal_worker_done() { + int d = done_workers.fetch_add(1, std::memory_order_release); + if (d + 1 == num_active_worker_threads_) { + std::unique_lock lock(workers_to_main_signal_mutex); + workers_to_main_signal_cv.notify_all(); + } + } + void wait_for_workers_done() { + std::unique_lock lock(workers_to_main_signal_mutex); + workers_to_main_signal_cv.wait(lock, [this] { return done_workers.load(std::memory_order_acquire) == num_active_worker_threads_; }); + done_workers.store(0, std::memory_order_relaxed); + } +#endif +}; + template class ConcurrentStack { @@ -181,6 +320,373 @@ private: std::vector contents; }; +// A vector that is sharded into buckets, one per thread. This lets multiple threads write +// efficiently to the vector without synchronization overhead. After all writers have +// finished writing, the vector can be iterated over. The iteration order is deterministic: +// all the elements written by thread 0 in the order it inserted them, followed by all elements +// written by thread 1, etc. +template +class ShardedVector { +public: + ShardedVector(const ParallelDispatchThreadPool &thread_pool) { + init(thread_pool.num_threads()); + } + ShardedVector(const ParallelDispatchThreadPool::Subpool &thread_pool) { + init(thread_pool.num_threads()); + } + + // Insert a value, passing the `ThreadIndex` of the writer thread. + // Parallel inserts with different `ThreadIndex` values are fine. + // Inserts must not run concurrently with any other methods (e.g. + // iteration or `empty()`.) + void insert(const ThreadIndex &thread, T value) { + buckets[thread.thread_num].emplace_back(std::move(value)); + } + + bool empty() const { + for (const std::vector &bucket : buckets) + if (!bucket.empty()) + return false; + return true; + } + + using Buckets = std::vector>; + class iterator { + public: + iterator(typename Buckets::iterator bucket_it, typename Buckets::iterator bucket_end) + : bucket_it(std::move(bucket_it)), bucket_end(std::move(bucket_end)) { + if (bucket_it != bucket_end) + inner_it = bucket_it->begin(); + normalize(); + } + T& operator*() const { return *inner_it.value(); } + iterator &operator++() { + ++*inner_it; + normalize(); + return *this; + } + bool operator!=(const iterator &other) const { + return bucket_it != other.bucket_it || inner_it != other.inner_it; + } + private: + void normalize() { + if (bucket_it == bucket_end) + return; + while (inner_it == bucket_it->end()) { + ++bucket_it; + if (bucket_it == bucket_end) { + inner_it.reset(); + return; + } + inner_it = bucket_it->begin(); + } + } + std::optional::iterator> inner_it; + typename Buckets::iterator bucket_it; + typename Buckets::iterator bucket_end; + }; + iterator begin() { return iterator(buckets.begin(), buckets.end()); } + iterator end() { return iterator(buckets.end(), buckets.end()); } +private: + void init(int num_threads) { + buckets.resize(num_threads); + } + Buckets buckets; +}; + +template +struct DefaultCollisionHandler { + void operator()(typename V::Accumulated &, typename V::Accumulated &) const {} +}; + +// A hashtable that can be efficiently built in parallel and then looked up concurrently. +// `V` is the type of elements that will be added to the hashtable. It must have a +// member type `Accumulated` representing the combination of multiple `V` elements. This +// can be the same as `V`, but for example `V` could contain a Wire* and `V::Accumulated` +// could contain a `pool`. `KeyEquality` is a class containing an `operator()` that +// returns true of two `V` elements have equal keys. +// `CollisionHandler` is used to reduce two `V::Accumulated` values into a single value. +// +// To use this, first construct a `Builder` and fill it in (in parallel), then construct +// a `ShardedHashSet` from the `Builder`. +template > +class ShardedHashSet { +public: + // A combination of a `V` and its hash value. + struct Value { + Value(V value, unsigned int hash) : value(std::move(value)), hash(hash) {} + Value(Value &&) = default; + Value(const Value &) = delete; + Value &operator=(const Value &) = delete; + V value; + unsigned int hash; + }; + // A combination of a `V::Accumulated` and its hash value. + struct AccumulatedValue { + AccumulatedValue(typename V::Accumulated value, unsigned int hash) : value(std::move(value)), hash(hash) {} + AccumulatedValue(AccumulatedValue &&) = default; +#if defined(_MSC_VER) + AccumulatedValue(const AccumulatedValue &) { + log_error("Copy constructor called on AccumulatedValue"); + } + AccumulatedValue &operator=(const AccumulatedValue &) { + log_error("Copy assignment called on AccumulatedValue"); + return *this; + } +#else + AccumulatedValue(const AccumulatedValue &) = delete; + AccumulatedValue &operator=(const AccumulatedValue &) = delete; +#endif + typename V::Accumulated value; + unsigned int hash; + }; + // A class containing an `operator()` that returns true of two `AccumulatedValue` + // elements have equal keys. + // Required to insert `AccumulatedValue`s into an `std::unordered_set`. + struct AccumulatedValueEquality { + KeyEquality inner; + AccumulatedValueEquality(const KeyEquality &inner) : inner(inner) {} + bool operator()(const AccumulatedValue &v1, const AccumulatedValue &v2) const { + return inner(v1.value, v2.value); + } + }; + // A class containing an `operator()` that returns the hash value of an `AccumulatedValue`. + // Required to insert `AccumulatedValue`s into an `std::unordered_set`. + struct AccumulatedValueHashOp { + size_t operator()(const AccumulatedValue &v) const { + return static_cast(v.hash); + } + }; + using Shard = std::unordered_set; + + // First construct one of these. Then populate it in parallel by calling `insert()` from many threads. + // Then do another parallel phase calling `process()` from many threads. + class Builder { + public: + Builder(const ParallelDispatchThreadPool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler()) + : collision_handler(std::move(collision_handler)) { + init(thread_pool.num_threads(), std::move(equality)); + } + Builder(const ParallelDispatchThreadPool::Subpool &thread_pool, KeyEquality equality = KeyEquality(), CollisionHandler collision_handler = CollisionHandler()) + : collision_handler(std::move(collision_handler)) { + init(thread_pool.num_threads(), std::move(equality)); + } + // First call `insert` to insert all elements. All inserts must finish + // before calling any `process()`. + void insert(const ThreadIndex &thread, Value v) { + // You might think that for the single-threaded case, we can optimize by + // inserting directly into the `std::unordered_set` here. But that slows things down + // a lot and I never got around to figuring out why. + std::vector> &buckets = all_buckets[thread.thread_num]; + size_t bucket = static_cast(v.hash) % buckets.size(); + buckets[bucket].emplace_back(std::move(v)); + } + // Then call `process` for each thread. All `process()`s must finish before using + // the `Builder` to construct a `ShardedHashSet`. + void process(const ThreadIndex &thread) { + int size = 0; + for (std::vector> &buckets : all_buckets) + size += GetSize(buckets[thread.thread_num]); + Shard &shard = shards[thread.thread_num]; + shard.reserve(size); + for (std::vector> &buckets : all_buckets) { + for (Value &value : buckets[thread.thread_num]) + accumulate(value, shard); + // Free as much memory as we can during the parallel phase. + std::vector().swap(buckets[thread.thread_num]); + } + } + private: + friend class ShardedHashSet; + void accumulate(Value &value, Shard &shard) { + // With C++20 we could make this more efficient using heterogenous lookup + AccumulatedValue accumulated_value{std::move(value.value), value.hash}; + auto [it, inserted] = shard.insert(std::move(accumulated_value)); + if (!inserted) + collision_handler(const_cast(it->value), accumulated_value.value); + } + void init(int num_threads, KeyEquality equality) { + all_buckets.resize(num_threads); + for (std::vector> &buckets : all_buckets) + buckets.resize(num_threads); + for (int i = 0; i < num_threads; ++i) + shards.emplace_back(0, AccumulatedValueHashOp(), AccumulatedValueEquality(equality)); + } + const CollisionHandler collision_handler; + std::vector>> all_buckets; + std::vector shards; + }; + + // Then finally construct the hashtable: + ShardedHashSet(Builder &builder) : shards(std::move(builder.shards)) { + // Check that all necessary 'process()' calls were made. + for (std::vector> &buckets : builder.all_buckets) + for (std::vector &bucket : buckets) + log_assert(bucket.empty()); + // Free memory. + std::vector>>().swap(builder.all_buckets); + } + ShardedHashSet(ShardedHashSet &&other) = default; + ShardedHashSet() {} + + ShardedHashSet &operator=(ShardedHashSet &&other) = default; + + // Look up by `AccumulatedValue`. If we switch to C++20 then we could use + // heterogenous lookup to support looking up by `Value` here. Returns nullptr + // if the key is not found. + const typename V::Accumulated *find(const AccumulatedValue &v) const { + size_t num_shards = shards.size(); + if (num_shards == 0) + return nullptr; + size_t shard = static_cast(v.hash) % num_shards; + auto it = shards[shard].find(v); + if (it == shards[shard].end()) + return nullptr; + return &it->value; + } + + // Insert an element into the table. The caller is responsible for ensuring this does not + // happen concurrently with any other method calls. + void insert(AccumulatedValue v) { + size_t num_shards = shards.size(); + if (num_shards == 0) + return; + size_t shard = static_cast(v.hash) % num_shards; + shards[shard].insert(v); + } + + // Call this for each shard to implement parallel destruction. For very large `ShardedHashSet`s, + // deleting all elements of all shards on a single thread can be a performance bottleneck. + void clear(const ThreadIndex &shard) { + AccumulatedValueEquality equality = shards[0].key_eq(); + shards[shard.thread_num] = Shard(0, AccumulatedValueHashOp(), equality); + } +private: + std::vector shards; +}; + +// A concurrent work-queue that can share batches of work across threads. +// Uses a naive implementation of work-stealing. +template +class ConcurrentWorkQueue { +public: + // Create a queue that supports the given number of threads and + // groups work into `batch_size` units. + ConcurrentWorkQueue(int num_threads, int batch_size = 100) + : batch_size(batch_size), thread_states(num_threads) {} + int num_threads() const { return GetSize(thread_states); } + // Push some work to do. Pushes and pops with the same `thread` must + // not happen concurrently. + void push(const ThreadIndex &thread, T work) { + ThreadState &thread_state = thread_states[thread.thread_num]; + thread_state.next_batch.emplace_back(std::move(work)); + if (GetSize(thread_state.next_batch) < batch_size) + return; + bool was_empty; + { + std::unique_lock lock(thread_state.batches_lock); + was_empty = thread_state.batches.empty(); + thread_state.batches.push_back(std::move(thread_state.next_batch)); + } + if (was_empty) { + std::unique_lock lock(waiters_lock); + if (num_waiters > 0) { + waiters_cv.notify_one(); + } + } + } + // Grab some work to do. + // If all threads enter `pop_batch()`, then instead of deadlocking the + // queue will return no work. That is the only case in which it will + // return no work. + std::vector pop_batch(const ThreadIndex &thread) { + ThreadState &thread_state = thread_states[thread.thread_num]; + if (!thread_state.next_batch.empty()) + return std::move(thread_state.next_batch); + // Empty our own work queue first. + { + std::unique_lock lock(thread_state.batches_lock); + if (!thread_state.batches.empty()) { + std::vector batch = std::move(thread_state.batches.back()); + thread_state.batches.pop_back(); + return batch; + } + } + // From here on in this function, our work queue is empty. + while (true) { + std::vector batch = try_steal(thread); + if (!batch.empty()) { + return std::move(batch); + } + // Termination: if all threads run out of work, then all of + // them will eventually enter this loop and there will be no further + // notifications on waiters_cv, so all will eventually increment + // num_waiters and wait, so num_waiters == num_threads() + // will become true. + std::unique_lock lock(waiters_lock); + ++num_waiters; + if (num_waiters == num_threads()) { + waiters_cv.notify_all(); + return {}; + } + // As above, it's possible that we'll wait here even when there + // are work batches posted by other threads. That's OK. + waiters_cv.wait(lock); + if (num_waiters == num_threads()) + return {}; + --num_waiters; + } + } +private: + std::vector try_steal(const ThreadIndex &thread) { + for (int i = 1; i < num_threads(); i++) { + int other_thread_num = (thread.thread_num + i) % num_threads(); + ThreadState &other_thread_state = thread_states[other_thread_num]; + std::unique_lock lock(other_thread_state.batches_lock); + if (!other_thread_state.batches.empty()) { + std::vector batch = std::move(other_thread_state.batches.front()); + other_thread_state.batches.pop_front(); + return batch; + } + } + return {}; + } + + int batch_size; + + struct ThreadState { + // Entirely thread-local. + std::vector next_batch; + + std::mutex batches_lock; + // Only the associated thread ever adds to this, and only at the back. + // Other threads can remove elements from the front. + std::deque> batches; + }; + std::vector thread_states; + + std::mutex waiters_lock; + std::condition_variable waiters_cv; + // Number of threads waiting for work. Their queues are empty. + int num_waiters = 0; +}; + +// A monotonic flag. Starts false, and can be set to true in a thread-safe way. +// Once `load()` returns true, it will always return true. +// Uses relaxed atomics so there are no memory ordering guarantees. Do not use this +// to guard access to shared memory. +class MonotonicFlag { +public: + MonotonicFlag() : value(false) {} + bool load() const { return value.load(std::memory_order_relaxed); } + void set() { value.store(true, std::memory_order_relaxed); } + bool set_and_return_old() { + return value.exchange(true, std::memory_order_relaxed); + } +private: + std::atomic value; +}; + YOSYS_NAMESPACE_END #endif // YOSYS_THREADING_H diff --git a/passes/opt/opt_clean.cc b/passes/opt/opt_clean.cc index f1d21435c..7c2377b10 100644 --- a/passes/opt/opt_clean.cc +++ b/passes/opt/opt_clean.cc @@ -22,6 +22,7 @@ #include "kernel/log.h" #include "kernel/celltypes.h" #include "kernel/ffinit.h" +#include "kernel/threading.h" #include #include #include @@ -33,47 +34,120 @@ using RTLIL::id2cstr; struct keep_cache_t { - Design *design; - dict cache; - bool purge_mode = false; + dict keep_modules; + bool purge_mode; - void reset(Design *design = nullptr, bool purge_mode = false) - { - this->design = design; - this->purge_mode = purge_mode; - cache.clear(); - } + keep_cache_t(bool purge_mode, ParallelDispatchThreadPool &thread_pool, const std::vector &selected_modules) + : purge_mode(purge_mode) { - bool query(Module *module) - { - log_assert(design != nullptr); - - if (module == nullptr) - return false; - - if (cache.count(module)) - return cache.at(module); - - cache[module] = true; - if (!module->get_bool_attribute(ID::keep)) { - bool found_keep = false; - for (auto cell : module->cells()) - if (query(cell, true /* ignore_specify */)) { - found_keep = true; - break; - } - for (auto wire : module->wires()) - if (wire->get_bool_attribute(ID::keep)) { - found_keep = true; - break; - } - cache[module] = found_keep; + std::vector scan_modules_worklist; + dict> dependents; + std::vector propagate_kept_modules_worklist; + for (RTLIL::Module *module : selected_modules) { + if (keep_modules.count(module)) + continue; + bool keep = scan_module(module, thread_pool, dependents, ALL_CELLS, scan_modules_worklist); + keep_modules[module] = keep; + if (keep) + propagate_kept_modules_worklist.push_back(module); } - return cache[module]; + while (!scan_modules_worklist.empty()) { + RTLIL::Module *module = scan_modules_worklist.back(); + scan_modules_worklist.pop_back(); + if (keep_modules.count(module)) + continue; + bool keep = scan_module(module, thread_pool, dependents, MINIMUM_CELLS, scan_modules_worklist); + keep_modules[module] = keep; + if (keep) + propagate_kept_modules_worklist.push_back(module); + } + + while (!propagate_kept_modules_worklist.empty()) { + RTLIL::Module *module = propagate_kept_modules_worklist.back(); + propagate_kept_modules_worklist.pop_back(); + for (RTLIL::Module *dependent : dependents[module]) { + if (keep_modules[dependent]) + continue; + keep_modules[dependent] = true; + propagate_kept_modules_worklist.push_back(dependent); + } + } } - bool query(Cell *cell, bool ignore_specify = false) + bool query(Cell *cell) const + { + if (keep_cell(cell, purge_mode)) + return true; + if (cell->type.in(ID($specify2), ID($specify3), ID($specrule))) + return true; + if (cell->module && cell->module->design) { + RTLIL::Module *cell_module = cell->module->design->module(cell->type); + return cell_module != nullptr && keep_modules.at(cell_module); + } + return false; + } + +private: + enum ScanCells { + // Scan every cell to see if it uses a module that is kept. + ALL_CELLS, + // Stop scanning cells if we determine early that this module is kept. + MINIMUM_CELLS, + }; + bool scan_module(Module *module, ParallelDispatchThreadPool &thread_pool, dict> &dependents, + ScanCells scan_cells, std::vector &worklist) const + { + MonotonicFlag keep_module; + if (module->get_bool_attribute(ID::keep)) { + if (scan_cells == MINIMUM_CELLS) + return true; + keep_module.set(); + } + + ParallelDispatchThreadPool::Subpool subpool(thread_pool, ThreadPool::work_pool_size(0, module->cells_size(), 1000)); + ShardedVector deps(subpool); + const RTLIL::Module *const_module = module; + bool purge_mode = this->purge_mode; + subpool.run([purge_mode, const_module, scan_cells, &deps, &keep_module](const ParallelDispatchThreadPool::RunCtx &ctx) { + bool keep = false; + for (int i : ctx.item_range(const_module->cells_size())) { + Cell *cell = const_module->cell_at(i); + if (keep_cell(cell, purge_mode)) { + if (scan_cells == MINIMUM_CELLS) { + keep_module.set(); + return; + } + keep = true; + } + if (const_module->design) { + RTLIL::Module *cell_module = const_module->design->module(cell->type); + if (cell_module != nullptr) + deps.insert(ctx, cell_module); + } + } + if (keep) { + keep_module.set(); + return; + } + for (int i : ctx.item_range(const_module->wires_size())) { + Wire *wire = const_module->wire_at(i); + if (wire->get_bool_attribute(ID::keep)) { + keep_module.set(); + return; + } + } + }); + if (scan_cells == MINIMUM_CELLS && keep_module.load()) + return true; + for (Module *dep : deps) { + dependents[dep].push_back(module); + worklist.push_back(dep); + } + return keep_module.load(); + } + + static bool keep_cell(Cell *cell, bool purge_mode) { if (cell->type.in(ID($assert), ID($assume), ID($live), ID($fair), ID($cover))) return true; @@ -81,9 +155,6 @@ struct keep_cache_t if (cell->type.in(ID($overwrite_tag))) return true; - if (!ignore_specify && cell->type.in(ID($specify2), ID($specify3), ID($specrule))) - return true; - if (cell->type == ID($print) || cell->type == ID($check)) return true; @@ -92,28 +163,32 @@ struct keep_cache_t if (!purge_mode && cell->type == ID($scopeinfo)) return true; - - if (cell->module && cell->module->design) - return query(cell->module->design->module(cell->type)); - return false; } }; -keep_cache_t keep_cache; CellTypes ct_reg, ct_all; -int count_rm_cells, count_rm_wires; -void rmunused_module_cells(Module *module, bool verbose) +struct RmStats { + int count_rm_cells = 0; + int count_rm_wires = 0; + + void log() + { + if (count_rm_cells > 0 || count_rm_wires > 0) + YOSYS_NAMESPACE_PREFIX log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires); + } +}; + +unsigned int hash_bit(const SigBit &bit) { + return static_cast(hash_ops::hash(bit).yield()); +} + +void rmunused_module_cells(Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose, RmStats &stats, keep_cache_t &keep_cache) { SigMap sigmap(module); - dict> mem2cells; - pool mem_unused; - pool queue, unused; - pool used_raw_bits; - dict> wire2driver; - dict> driver_driver_logs; - FfInitVals ffinit(&sigmap, module); + FfInitVals ffinit; + ffinit.set_parallel(&sigmap, subpool.thread_pool(), module); SigMap raw_sigmap; for (auto &it : module->connections_) { @@ -123,117 +198,243 @@ void rmunused_module_cells(Module *module, bool verbose) } } - for (auto &it : module->memories) { - mem_unused.insert(it.first); - } + struct WireDrivers; + struct WireDriver { + using Accumulated = WireDrivers; + SigBit bit; + int driver_cell; + }; + struct WireDrivers { + WireDrivers() : driver_cell(0) {} + WireDrivers(WireDriver driver) : bit(driver.bit), driver_cell(driver.driver_cell) {} + WireDrivers(SigBit bit) : bit(bit), driver_cell(0) {} + WireDrivers(WireDrivers &&other) = default; - for (Cell *cell : module->cells()) { - if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2))) { - IdString mem_id = cell->getParam(ID::MEMID).decode_string(); - mem2cells[mem_id].insert(cell); - } - } - - for (auto &it : module->cells_) { - Cell *cell = it.second; - for (auto &it2 : cell->connections()) { - if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first)) - continue; - for (auto raw_bit : it2.second) { - if (raw_bit.wire == nullptr) - continue; - auto bit = sigmap(raw_bit); - if (bit.wire == nullptr && ct_all.cell_known(cell->type)) - driver_driver_logs[raw_sigmap(raw_bit)].push_back(stringf("Driver-driver conflict " - "for %s between cell %s.%s and constant %s in %s: Resolved using constant.", - log_signal(raw_bit), log_id(cell), log_id(it2.first), log_signal(bit), log_id(module))); - if (bit.wire != nullptr) - wire2driver[bit].insert(cell); - } - } - if (keep_cache.query(cell)) - queue.insert(cell); - else - unused.insert(cell); - } - - for (auto &it : module->wires_) { - Wire *wire = it.second; - if (wire->port_output || wire->get_bool_attribute(ID::keep)) { - for (auto bit : sigmap(wire)) - for (auto c : wire2driver[bit]) - queue.insert(c), unused.erase(c); - for (auto raw_bit : SigSpec(wire)) - used_raw_bits.insert(raw_sigmap(raw_bit)); - } - } - - while (!queue.empty()) - { - pool bits; - pool mems; - for (auto cell : queue) { - for (auto &it : cell->connections()) - if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first)) - for (auto bit : sigmap(it.second)) - bits.insert(bit); - - if (cell->type.in(ID($memrd), ID($memrd_v2))) { - IdString mem_id = cell->getParam(ID::MEMID).decode_string(); - if (mem_unused.count(mem_id)) { - mem_unused.erase(mem_id); - mems.insert(mem_id); + class const_iterator { + public: + const_iterator(const WireDrivers &drivers, bool end) + : driver_cell(drivers.driver_cell), in_extra_cells(end) { + if (drivers.extra_driver_cells) { + if (end) { + extra_it = drivers.extra_driver_cells->end(); + } else { + extra_it = drivers.extra_driver_cells->begin(); + } } } + int operator*() const { + if (in_extra_cells) + return **extra_it; + return driver_cell; + } + const_iterator& operator++() { + if (in_extra_cells) + ++*extra_it; + else + in_extra_cells = true; + return *this; + } + bool operator!=(const const_iterator &other) const { + return !(*this == other); + } + bool operator==(const const_iterator &other) const { + return in_extra_cells == other.in_extra_cells && + extra_it == other.extra_it; + } + private: + std::optional::iterator> extra_it; + int driver_cell; + bool in_extra_cells; + }; + + const_iterator begin() const { return const_iterator(*this, false); } + const_iterator end() const { return const_iterator(*this, true); } + + SigBit bit; + int driver_cell; + std::unique_ptr> extra_driver_cells; + }; + struct WireDriversKeyEquality { + bool operator()(const WireDrivers &a, const WireDrivers &b) const { + return a.bit == b.bit; } + }; + struct WireDriversCollisionHandler { + void operator()(WireDrivers &incumbent, WireDrivers &new_value) const { + log_assert(new_value.extra_driver_cells == nullptr); + if (!incumbent.extra_driver_cells) + incumbent.extra_driver_cells.reset(new pool()); + incumbent.extra_driver_cells->insert(new_value.driver_cell); + } + }; + using Wire2Drivers = ShardedHashSet; - queue.clear(); + Wire2Drivers::Builder wire2driver_builder(subpool); + ShardedVector> mem2cells_vector(subpool); + ShardedVector> driver_driver_logs(subpool); + ShardedVector keep_wires(subpool); + const RTLIL::Module *const_module = module; + int num_threads = subpool.num_threads(); + ConcurrentWorkQueue cell_queue(num_threads); + std::vector> unused(const_module->cells_size()); + subpool.run([&sigmap, &raw_sigmap, &keep_cache, const_module, &mem2cells_vector, &driver_driver_logs, &keep_wires, &cell_queue, &wire2driver_builder, &unused](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->cells_size())) { + Cell *cell = const_module->cell_at(i); + if (cell->type.in(ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2))) + mem2cells_vector.insert(ctx, {cell->getParam(ID::MEMID).decode_string(), i}); - for (auto bit : bits) - for (auto c : wire2driver[bit]) - if (unused.count(c)) - queue.insert(c), unused.erase(c); + for (auto &it2 : cell->connections()) { + if (ct_all.cell_known(cell->type) && !ct_all.cell_output(cell->type, it2.first)) + continue; + for (auto raw_bit : it2.second) { + if (raw_bit.wire == nullptr) + continue; + auto bit = sigmap(raw_bit); + if (bit.wire == nullptr && ct_all.cell_known(cell->type)) { + std::string msg = stringf("Driver-driver conflict " + "for %s between cell %s.%s and constant %s in %s: Resolved using constant.", + log_signal(raw_bit), cell->name.unescape(), it2.first.unescape(), log_signal(bit), const_module->name.unescape()); + driver_driver_logs.insert(ctx, {raw_sigmap(raw_bit), msg}); + } + if (bit.wire != nullptr) + wire2driver_builder.insert(ctx, {{bit, i}, hash_bit(bit)}); + } + } + bool keep = keep_cache.query(cell); + unused[i].store(!keep, std::memory_order_relaxed); + if (keep) + cell_queue.push(ctx, i); + } + for (int i : ctx.item_range(const_module->wires_size())) { + Wire *wire = const_module->wire_at(i); + if (wire->port_output || wire->get_bool_attribute(ID::keep)) + keep_wires.insert(ctx, wire); + } + }); + subpool.run([&wire2driver_builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + wire2driver_builder.process(ctx); + }); + Wire2Drivers wire2driver(wire2driver_builder); - for (auto mem : mems) - for (auto c : mem2cells[mem]) - if (unused.count(c)) - queue.insert(c), unused.erase(c); + dict> mem2cells; + for (std::pair &mem2cell : mem2cells_vector) + mem2cells[mem2cell.first].insert(mem2cell.second); + + pool used_raw_bits; + int i = 0; + for (Wire *wire : keep_wires) { + for (auto bit : sigmap(wire)) { + const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)}); + if (drivers != nullptr) + for (int cell_index : *drivers) + if (unused[cell_index].exchange(false, std::memory_order_relaxed)) { + ThreadIndex fake_thread_index = {i++ % num_threads}; + cell_queue.push(fake_thread_index, cell_index); + } + } + for (auto raw_bit : SigSpec(wire)) + used_raw_bits.insert(raw_sigmap(raw_bit)); } - unused.sort(RTLIL::sort_by_name_id()); + std::vector> mem_unused(module->memories.size()); + dict mem_indices; + for (int i = 0; i < GetSize(module->memories); ++i) { + mem_indices[module->memories.element(i)->first.str()] = i; + mem_unused[i].store(true, std::memory_order_relaxed); + } - for (auto cell : unused) { + subpool.run([const_module, &sigmap, &wire2driver, &mem2cells, &unused, &cell_queue, &mem_indices, &mem_unused](const ParallelDispatchThreadPool::RunCtx &ctx) { + pool bits; + pool mems; + while (true) { + std::vector cell_indices = cell_queue.pop_batch(ctx); + if (cell_indices.empty()) + return; + for (auto cell_index : cell_indices) { + Cell *cell = const_module->cell_at(cell_index); + for (auto &it : cell->connections()) + if (!ct_all.cell_known(cell->type) || ct_all.cell_input(cell->type, it.first)) + for (auto bit : sigmap(it.second)) + bits.insert(bit); + + if (cell->type.in(ID($memrd), ID($memrd_v2))) { + std::string mem_id = cell->getParam(ID::MEMID).decode_string(); + if (mem_indices.count(mem_id)) { + int mem_index = mem_indices[mem_id]; + if (mem_unused[mem_index].exchange(false, std::memory_order_relaxed)) + mems.insert(mem_id); + } + } + } + + for (auto bit : bits) { + const WireDrivers *drivers = wire2driver.find({{bit}, hash_bit(bit)}); + if (drivers != nullptr) + for (int cell_index : *drivers) + if (unused[cell_index].exchange(false, std::memory_order_relaxed)) + cell_queue.push(ctx, cell_index); + } + bits.clear(); + + for (auto mem : mems) { + if (mem2cells.count(mem) == 0) + continue; + for (int cell_index : mem2cells.at(mem)) + if (unused[cell_index].exchange(false, std::memory_order_relaxed)) + cell_queue.push(ctx, cell_index); + } + mems.clear(); + } + }); + + ShardedVector sharded_unused_cells(subpool); + subpool.run([const_module, &unused, &sharded_unused_cells, &wire2driver](const ParallelDispatchThreadPool::RunCtx &ctx) { + // Parallel destruction of `wire2driver` + wire2driver.clear(ctx); + for (int i : ctx.item_range(const_module->cells_size())) + if (unused[i].load(std::memory_order_relaxed)) + sharded_unused_cells.insert(ctx, i); + }); + pool unused_cells; + for (int cell_index : sharded_unused_cells) + unused_cells.insert(const_module->cell_at(cell_index)); + unused_cells.sort(RTLIL::sort_by_name_id()); + + for (auto cell : unused_cells) { if (verbose) log_debug(" removing unused `%s' cell `%s'.\n", cell->type, cell->name); module->design->scratchpad_set_bool("opt.did_something", true); if (cell->is_builtin_ff()) ffinit.remove_init(cell->getPort(ID::Q)); module->remove(cell); - count_rm_cells++; + stats.count_rm_cells++; } - for (auto it : mem_unused) - { + for (const auto &it : mem_indices) { + if (!mem_unused[it.second].load(std::memory_order_relaxed)) + continue; + RTLIL::IdString id(it.first); if (verbose) - log_debug(" removing unused memory `%s'.\n", it); - delete module->memories.at(it); - module->memories.erase(it); + log_debug(" removing unused memory `%s'.\n", id.unescape()); + delete module->memories.at(id); + module->memories.erase(id); } - for (auto &it : module->cells_) { - Cell *cell = it.second; - for (auto &it2 : cell->connections()) { - if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first)) - continue; - for (auto raw_bit : raw_sigmap(it2.second)) - used_raw_bits.insert(raw_bit); + if (!driver_driver_logs.empty()) { + // We could do this in parallel but hopefully this is rare. + for (auto &it : module->cells_) { + Cell *cell = it.second; + for (auto &it2 : cell->connections()) { + if (ct_all.cell_known(cell->type) && !ct_all.cell_input(cell->type, it2.first)) + continue; + for (auto raw_bit : raw_sigmap(it2.second)) + used_raw_bits.insert(raw_bit); + } + } + for (std::pair &it : driver_driver_logs) { + if (used_raw_bits.count(it.first)) + log_warning("%s\n", it.second); } - } - - for (auto it : driver_driver_logs) { - if (used_raw_bits.count(it.first)) - for (auto msg : it.second) - log_warning("%s\n", msg); } } @@ -247,9 +448,62 @@ int count_nontrivial_wire_attrs(RTLIL::Wire *w) return count; } +struct ShardedSigBit { + using Accumulated = ShardedSigBit; + RTLIL::SigBit bit; + ShardedSigBit() = default; + ShardedSigBit(const RTLIL::SigBit &bit) : bit(bit) {} +}; +struct ShardedSigBitEquality { + bool operator()(const ShardedSigBit &b1, const ShardedSigBit &b2) const { + return b1.bit == b2.bit; + } +}; +using ShardedSigPool = ShardedHashSet; + +struct ShardedSigSpec { + using Accumulated = ShardedSigSpec; + RTLIL::SigSpec spec; + ShardedSigSpec() = default; + ShardedSigSpec(RTLIL::SigSpec spec) : spec(std::move(spec)) {} + ShardedSigSpec(ShardedSigSpec &&) = default; +}; +struct ShardedSigSpecEquality { + bool operator()(const ShardedSigSpec &s1, const ShardedSigSpec &s2) const { + return s1.spec == s2.spec; + } +}; +using ShardedSigSpecPool = ShardedHashSet; + +struct DirectWires { + const SigMap &assign_map; + const ShardedSigSpecPool &direct_sigs; + dict cache; + + DirectWires(const SigMap &assign_map, const ShardedSigSpecPool &direct_sigs) : assign_map(assign_map), direct_sigs(direct_sigs) {} + void cache_result_for_bit(const SigBit &bit) { + if (bit.wire != nullptr) + is_direct(bit.wire); + } + bool is_direct(RTLIL::Wire *wire) { + if (wire->port_input) + return true; + auto it = cache.find(wire); + if (it != cache.end()) + return it->second; + SigSpec direct_sig = assign_map(wire); + bool direct = direct_sigs.find({direct_sig, direct_sig.hash_into(Hasher()).yield()}) != nullptr; + cache.insert({wire, direct}); + return direct; + } +}; + // Should we pick `s2` over `s1` to represent a signal? -bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool ®s, SigPool &conns, pool &direct_wires) +bool compare_signals(const RTLIL::SigBit &s1, const RTLIL::SigBit &s2, const ShardedSigPool ®s, const ShardedSigPool &conns, DirectWires &direct_wires) { + if (s1 == s2) + return false; + RTLIL::Wire *w1 = s1.wire; RTLIL::Wire *w2 = s2.wire; @@ -263,12 +517,20 @@ bool compare_signals(RTLIL::SigBit &s1, RTLIL::SigBit &s2, SigPool ®s, SigPoo return !(w2->port_input && w2->port_output); if (w1->name.isPublic() && w2->name.isPublic()) { - if (regs.check(s1) != regs.check(s2)) - return regs.check(s2); - if (direct_wires.count(w1) != direct_wires.count(w2)) - return direct_wires.count(w2) != 0; - if (conns.check_any(s1) != conns.check_any(s2)) - return conns.check_any(s2); + ShardedSigPool::AccumulatedValue s1_val = {s1, s1.hash_top().yield()}; + ShardedSigPool::AccumulatedValue s2_val = {s2, s2.hash_top().yield()}; + bool regs1 = regs.find(s1_val) != nullptr; + bool regs2 = regs.find(s2_val) != nullptr; + if (regs1 != regs2) + return regs2; + bool w1_direct = direct_wires.is_direct(w1); + bool w2_direct = direct_wires.is_direct(w2); + if (w1_direct != w2_direct) + return w2_direct; + bool conns1 = conns.find(s1_val) != nullptr; + bool conns2 = conns.find(s2_val) != nullptr; + if (conns1 != conns2) + return conns2; } if (w1 == w2) @@ -301,109 +563,185 @@ bool check_public_name(RTLIL::IdString id) return true; } -bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbose) -{ - // `register_signals` and `connected_signals` will help us decide later on - // on picking representatives out of groups of connected signals - SigPool register_signals; - SigPool connected_signals; - if (!purge_mode) - for (auto &it : module->cells_) { - RTLIL::Cell *cell = it.second; - if (ct_reg.cell_known(cell->type)) { - bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic)); - for (auto &it2 : cell->connections()) - if (clk2fflogic ? it2.first == ID::D : ct_reg.cell_output(cell->type, it2.first)) - register_signals.add(it2.second); - } - for (auto &it2 : cell->connections()) - connected_signals.add(it2.second); - } +void add_spec(ShardedSigPool::Builder &builder, const ThreadIndex &thread, const RTLIL::SigSpec &spec) { + for (SigBit bit : spec) + if (bit.wire != nullptr) + builder.insert(thread, {bit, bit.hash_top().yield()}); +} +bool check_any(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) { + for (SigBit b : spec) + if (sigs.find({b, b.hash_top().yield()}) != nullptr) + return true; + return false; +} + +bool check_all(const ShardedSigPool &sigs, const RTLIL::SigSpec &spec) { + for (SigBit b : spec) + if (sigs.find({b, b.hash_top().yield()}) == nullptr) + return false; + return true; +} + +bool rmunused_module_signals(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool purge_mode, bool verbose, RmStats &stats) +{ SigMap assign_map(module); + const RTLIL::Module *const_module = module; + // `register_signals` and `connected_signals` will help us decide later on + // on picking representatives out of groups of connected signals + ShardedSigPool::Builder register_signals_builder(subpool); + ShardedSigPool::Builder connected_signals_builder(subpool); // construct a pool of wires which are directly driven by a known celltype, // this will influence our choice of representatives - pool direct_wires; - { - pool direct_sigs; - for (auto &it : module->cells_) { - RTLIL::Cell *cell = it.second; + ShardedSigSpecPool::Builder direct_sigs_builder(subpool); + subpool.run([const_module, purge_mode, &assign_map, &direct_sigs_builder, ®ister_signals_builder, &connected_signals_builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->cells_size())) { + RTLIL::Cell *cell = const_module->cell_at(i); + if (!purge_mode) { + if (ct_reg.cell_known(cell->type)) { + bool clk2fflogic = cell->get_bool_attribute(ID(clk2fflogic)); + for (auto &it2 : cell->connections()) + if (clk2fflogic ? it2.first == ID::D : ct_reg.cell_output(cell->type, it2.first)) + add_spec(register_signals_builder, ctx, it2.second); + } + for (auto &it2 : cell->connections()) + add_spec(connected_signals_builder, ctx, it2.second); + } if (ct_all.cell_known(cell->type)) for (auto &it2 : cell->connections()) - if (ct_all.cell_output(cell->type, it2.first)) - direct_sigs.insert(assign_map(it2.second)); + if (ct_all.cell_output(cell->type, it2.first)) { + RTLIL::SigSpec spec = assign_map(it2.second); + unsigned int hash = spec.hash_into(Hasher()).yield(); + direct_sigs_builder.insert(ctx, {std::move(spec), hash}); + } } - for (auto &it : module->wires_) { - if (direct_sigs.count(assign_map(it.second)) || it.second->port_input) - direct_wires.insert(it.second); - } - } + }); + subpool.run([®ister_signals_builder, &connected_signals_builder, &direct_sigs_builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + register_signals_builder.process(ctx); + connected_signals_builder.process(ctx); + direct_sigs_builder.process(ctx); + }); + ShardedSigPool register_signals(register_signals_builder); + ShardedSigPool connected_signals(connected_signals_builder); + ShardedSigSpecPool direct_sigs(direct_sigs_builder); - // weight all options for representatives with `compare_signals`, - // the one that wins will be what `assign_map` maps to - for (auto &it : module->wires_) { - RTLIL::Wire *wire = it.second; - for (int i = 0; i < wire->width; i++) { - RTLIL::SigBit s1 = RTLIL::SigBit(wire, i), s2 = assign_map(s1); - if (compare_signals(s2, s1, register_signals, connected_signals, direct_wires)) - assign_map.add(s1); + ShardedVector sigmap_canonical_candidates(subpool); + DirectWires direct_wires(assign_map, direct_sigs); + subpool.run([const_module, &assign_map, ®ister_signals, &connected_signals, &sigmap_canonical_candidates, &direct_sigs, &direct_wires](const ParallelDispatchThreadPool::RunCtx &ctx) { + std::optional local_direct_wires; + DirectWires *this_thread_direct_wires = &direct_wires; + if (ctx.thread_num > 0) { + local_direct_wires.emplace(assign_map, direct_sigs); + this_thread_direct_wires = &local_direct_wires.value(); } + for (int i : ctx.item_range(const_module->wires_size())) { + RTLIL::Wire *wire = const_module->wire_at(i); + for (int j = 0; j < wire->width; ++j) { + RTLIL::SigBit s1(wire, j); + RTLIL::SigBit s2 = assign_map(s1); + if (compare_signals(s2, s1, register_signals, connected_signals, *this_thread_direct_wires)) + sigmap_canonical_candidates.insert(ctx, s1); + } + } + }); + // Cache all the direct_wires results that we might possible need. This avoids the results + // changing when we update `assign_map` below. + for (RTLIL::SigBit candidate : sigmap_canonical_candidates) { + direct_wires.cache_result_for_bit(candidate); + direct_wires.cache_result_for_bit(assign_map(candidate)); + } + for (RTLIL::SigBit candidate : sigmap_canonical_candidates) { + RTLIL::SigBit current_canonical = assign_map(candidate); + if (compare_signals(current_canonical, candidate, register_signals, connected_signals, direct_wires)) + assign_map.add(candidate); } // we are removing all connections module->connections_.clear(); // used signals sigmapped - SigPool used_signals; + ShardedSigPool::Builder used_signals_builder(subpool); // used signals pre-sigmapped - SigPool raw_used_signals; + ShardedSigPool::Builder raw_used_signals_builder(subpool); // used signals sigmapped, ignoring drivers (we keep track of this to set `unused_bits`) - SigPool used_signals_nodrivers; - - // gather the usage information for cells - for (auto &it : module->cells_) { - RTLIL::Cell *cell = it.second; - for (auto &it2 : cell->connections_) { - assign_map.apply(it2.second); // modify the cell connection in place - raw_used_signals.add(it2.second); - used_signals.add(it2.second); - if (!ct_all.cell_output(cell->type, it2.first)) - used_signals_nodrivers.add(it2.second); - } - } - - // gather the usage information for ports, wires with `keep`, + ShardedSigPool::Builder used_signals_nodrivers_builder(subpool); + struct UpdateConnection { + RTLIL::Cell *cell; + RTLIL::IdString port; + RTLIL::SigSpec spec; + }; + ShardedVector update_connections(subpool); + ShardedVector initialized_wires(subpool); + // gather the usage information for cells and update cell connections + // also gather the usage information for ports, wires with `keep` // also gather init bits + subpool.run([const_module, ®ister_signals, &connected_signals, &direct_sigs, &assign_map, &used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder, &update_connections, &initialized_wires](const ParallelDispatchThreadPool::RunCtx &ctx) { + // Parallel destruction of these sharded structures + register_signals.clear(ctx); + connected_signals.clear(ctx); + direct_sigs.clear(ctx); + + for (int i : ctx.item_range(const_module->cells_size())) { + RTLIL::Cell *cell = const_module->cell_at(i); + for (const auto &it2 : cell->connections_) { + SigSpec spec = assign_map(it2.second); + if (spec != it2.second) + update_connections.insert(ctx, {cell, it2.first, spec}); + add_spec(raw_used_signals_builder, ctx, spec); + add_spec(used_signals_builder, ctx, spec); + if (!ct_all.cell_output(cell->type, it2.first)) + add_spec(used_signals_nodrivers_builder, ctx, spec); + } + } + for (int i : ctx.item_range(const_module->wires_size())) { + RTLIL::Wire *wire = const_module->wire_at(i); + if (wire->port_id > 0) { + RTLIL::SigSpec sig = RTLIL::SigSpec(wire); + add_spec(raw_used_signals_builder, ctx, sig); + assign_map.apply(sig); + add_spec(used_signals_builder, ctx, sig); + if (!wire->port_input) + add_spec(used_signals_nodrivers_builder, ctx, sig); + } + if (wire->get_bool_attribute(ID::keep)) { + RTLIL::SigSpec sig = RTLIL::SigSpec(wire); + assign_map.apply(sig); + add_spec(used_signals_builder, ctx, sig); + } + auto it2 = wire->attributes.find(ID::init); + if (it2 != wire->attributes.end()) + initialized_wires.insert(ctx, wire); + } + }); + subpool.run([&used_signals_builder, &raw_used_signals_builder, &used_signals_nodrivers_builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + used_signals_builder.process(ctx); + raw_used_signals_builder.process(ctx); + used_signals_nodrivers_builder.process(ctx); + }); + ShardedSigPool used_signals(used_signals_builder); + ShardedSigPool raw_used_signals(raw_used_signals_builder); + ShardedSigPool used_signals_nodrivers(used_signals_nodrivers_builder); + dict init_bits; - for (auto &it : module->wires_) { - RTLIL::Wire *wire = it.second; - if (wire->port_id > 0) { - RTLIL::SigSpec sig = RTLIL::SigSpec(wire); - raw_used_signals.add(sig); - assign_map.apply(sig); - used_signals.add(sig); - if (!wire->port_input) - used_signals_nodrivers.add(sig); - } - if (wire->get_bool_attribute(ID::keep)) { - RTLIL::SigSpec sig = RTLIL::SigSpec(wire); - assign_map.apply(sig); - used_signals.add(sig); - } - auto it2 = wire->attributes.find(ID::init); - if (it2 != wire->attributes.end()) { - RTLIL::Const &val = it2->second; - SigSpec sig = assign_map(wire); - for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++) - if (val[i] != State::Sx) - init_bits[sig[i]] = val[i]; - wire->attributes.erase(it2); - } + // The wires that appear in the keys of `init_bits` + pool init_bits_wires; + for (const UpdateConnection &update : update_connections) + update.cell->connections_.at(update.port) = std::move(update.spec); + for (RTLIL::Wire *intialized_wire : initialized_wires) { + auto it = intialized_wire->attributes.find(ID::init); + RTLIL::Const &val = it->second; + SigSpec sig = assign_map(intialized_wire); + for (int i = 0; i < GetSize(val) && i < GetSize(sig); i++) + if (val[i] != State::Sx && sig[i].wire != nullptr) { + init_bits[sig[i]] = val[i]; + init_bits_wires.insert(sig[i].wire); + } + intialized_wire->attributes.erase(it); } // set init attributes on all wires of a connected group - for (auto wire : module->wires()) { + for (RTLIL::Wire *wire : init_bits_wires) { bool found = false; Const val(State::Sx, wire->width); for (int i = 0; i < wire->width; i++) { @@ -418,81 +756,117 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos } // now decide for each wire if we should be deleting it - pool del_wires_queue; - for (auto wire : module->wires()) - { - SigSpec s1 = SigSpec(wire), s2 = assign_map(s1); - log_assert(GetSize(s1) == GetSize(s2)); + ShardedVector del_wires(subpool); + ShardedVector remove_init(subpool); + ShardedVector> set_init(subpool); + ShardedVector connections(subpool); + ShardedVector remove_unused_bits(subpool); + ShardedVector> set_unused_bits(subpool); + subpool.run([const_module, purge_mode, &assign_map, &used_signals, &raw_used_signals, &used_signals_nodrivers, &del_wires, &remove_init, &set_init, &connections, &remove_unused_bits, &set_unused_bits](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->wires_size())) { + RTLIL::Wire *wire = const_module->wire_at(i); + SigSpec s1 = SigSpec(wire), s2 = assign_map(s1); + log_assert(GetSize(s1) == GetSize(s2)); - Const initval; - if (wire->attributes.count(ID::init)) - initval = wire->attributes.at(ID::init); - if (GetSize(initval) != GetSize(wire)) - initval.resize(GetSize(wire), State::Sx); - if (initval.is_fully_undef()) - wire->attributes.erase(ID::init); + Const initval; + bool has_init_attribute = wire->attributes.count(ID::init); + bool init_changed = false; + if (has_init_attribute) + initval = wire->attributes.at(ID::init); + if (GetSize(initval) != GetSize(wire)) { + initval.resize(GetSize(wire), State::Sx); + init_changed = true; + } - if (GetSize(wire) == 0) { - // delete zero-width wires, unless they are module ports - if (wire->port_id == 0) + if (GetSize(wire) == 0) { + // delete zero-width wires, unless they are module ports + if (wire->port_id == 0) + goto delete_this_wire; + } else + if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) { + // do not delete anything with "keep" or module ports or initialized wires + } else + if (!purge_mode && check_public_name(wire->name) && (check_any(raw_used_signals, s1) || check_any(used_signals, s2) || s1 != s2)) { + // do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased + } else + if (!check_any(raw_used_signals, s1)) { + // delete wires that aren't used by anything directly goto delete_this_wire; - } else - if (wire->port_id != 0 || wire->get_bool_attribute(ID::keep) || !initval.is_fully_undef()) { - // do not delete anything with "keep" or module ports or initialized wires - } else - if (!purge_mode && check_public_name(wire->name) && (raw_used_signals.check_any(s1) || used_signals.check_any(s2) || s1 != s2)) { - // do not get rid of public names unless in purge mode or if the wire is entirely unused, not even aliased - } else - if (!raw_used_signals.check_any(s1)) { - // delete wires that aren't used by anything directly - goto delete_this_wire; - } - - if (0) - { - delete_this_wire: - del_wires_queue.insert(wire); - } - else - { - RTLIL::SigSig new_conn; - for (int i = 0; i < GetSize(s1); i++) - if (s1[i] != s2[i]) { - if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) { - s2[i] = initval[i]; - initval.set(i, State::Sx); - } - new_conn.first.append(s1[i]); - new_conn.second.append(s2[i]); - } - if (new_conn.first.size() > 0) { - if (initval.is_fully_undef()) - wire->attributes.erase(ID::init); - else - wire->attributes.at(ID::init) = initval; - module->connect(new_conn); } - if (!used_signals_nodrivers.check_all(s2)) { + if (0) + { + delete_this_wire: + del_wires.insert(ctx, wire); + } + else + { + RTLIL::SigSig new_conn; + for (int i = 0; i < GetSize(s1); i++) + if (s1[i] != s2[i]) { + if (s2[i] == State::Sx && (initval[i] == State::S0 || initval[i] == State::S1)) { + s2[i] = initval[i]; + initval.set(i, State::Sx); + init_changed = true; + } + new_conn.first.append(s1[i]); + new_conn.second.append(s2[i]); + } + if (new_conn.first.size() > 0) + connections.insert(ctx, std::move(new_conn)); + if (initval.is_fully_undef()) { + if (has_init_attribute) + remove_init.insert(ctx, wire); + } else + if (init_changed) + set_init.insert(ctx, {wire, std::move(initval)}); + std::string unused_bits; - for (int i = 0; i < GetSize(s2); i++) { - if (s2[i].wire == NULL) - continue; - if (!used_signals_nodrivers.check(s2[i])) { - if (!unused_bits.empty()) - unused_bits += " "; - unused_bits += stringf("%d", i); + if (!check_all(used_signals_nodrivers, s2)) { + for (int i = 0; i < GetSize(s2); i++) { + if (s2[i].wire == NULL) + continue; + SigBit b = s2[i]; + if (used_signals_nodrivers.find({b, b.hash_top().yield()}) == nullptr) { + if (!unused_bits.empty()) + unused_bits += " "; + unused_bits += stringf("%d", i); + } } } - if (unused_bits.empty() || wire->port_id != 0) - wire->attributes.erase(ID::unused_bits); - else - wire->attributes[ID::unused_bits] = RTLIL::Const(unused_bits); - } else { - wire->attributes.erase(ID::unused_bits); + if (unused_bits.empty() || wire->port_id != 0) { + if (wire->attributes.count(ID::unused_bits)) + remove_unused_bits.insert(ctx, wire); + } else { + RTLIL::Const unused_bits_const(std::move(unused_bits)); + if (wire->attributes.count(ID::unused_bits)) { + RTLIL::Const &unused_bits_attr = wire->attributes.at(ID::unused_bits); + if (unused_bits_attr != unused_bits_const) + set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)}); + } else + set_unused_bits.insert(ctx, {wire, std::move(unused_bits_const)}); + } } } - } + }); + pool del_wires_queue; + del_wires_queue.insert(del_wires.begin(), del_wires.end()); + for (RTLIL::Wire *wire : remove_init) + wire->attributes.erase(ID::init); + for (auto &p : set_init) + p.first->attributes[ID::init] = std::move(p.second); + for (auto &conn : connections) + module->connect(std::move(conn)); + for (RTLIL::Wire *wire : remove_unused_bits) + wire->attributes.erase(ID::unused_bits); + for (auto &p : set_unused_bits) + p.first->attributes[ID::unused_bits] = std::move(p.second); + + subpool.run([&used_signals, &raw_used_signals, &used_signals_nodrivers](const ParallelDispatchThreadPool::RunCtx &ctx) { + used_signals.clear(ctx); + raw_used_signals.clear(ctx); + used_signals_nodrivers.clear(ctx); + }); int del_temp_wires_count = 0; for (auto wire : del_wires_queue) { @@ -503,7 +877,7 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos } module->remove(del_wires_queue); - count_rm_wires += GetSize(del_wires_queue); + stats.count_rm_wires += GetSize(del_wires_queue); if (verbose && del_temp_wires_count) log_debug(" removed %d unused temporary wires.\n", del_temp_wires_count); @@ -514,79 +888,93 @@ bool rmunused_module_signals(RTLIL::Module *module, bool purge_mode, bool verbos return !del_wires_queue.empty(); } -bool rmunused_module_init(RTLIL::Module *module, bool verbose) +bool rmunused_module_init(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose) { - bool did_something = false; CellTypes fftypes; fftypes.setup_internals_mem(); SigMap sigmap(module); - dict qbits; - for (auto cell : module->cells()) - if (fftypes.cell_known(cell->type) && cell->hasPort(ID::Q)) - { - SigSpec sig = cell->getPort(ID::Q); - - for (int i = 0; i < GetSize(sig); i++) + const Module *const_module = module; + ShardedVector> results(subpool); + subpool.run([const_module, &fftypes, &results](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->cells_size())) { + RTLIL::Cell *cell = const_module->cell_at(i); + if (fftypes.cell_known(cell->type) && cell->hasPort(ID::Q)) { - SigBit bit = sig[i]; + SigSpec sig = cell->getPort(ID::Q); - if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0) - continue; + for (int i = 0; i < GetSize(sig); i++) + { + SigBit bit = sig[i]; - Const init = bit.wire->attributes.at(ID::init); + if (bit.wire == nullptr || bit.wire->attributes.count(ID::init) == 0) + continue; - if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz) - continue; + Const init = bit.wire->attributes.at(ID::init); - sigmap.add(bit); - qbits[bit] = init[i]; - } - } + if (i >= GetSize(init) || init[i] == State::Sx || init[i] == State::Sz) + continue; - for (auto wire : module->wires()) - { - if (wire->attributes.count(ID::init) == 0) - continue; - - Const init = wire->attributes.at(ID::init); - - for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++) - { - if (init[i] == State::Sx || init[i] == State::Sz) - continue; - - SigBit wire_bit = SigBit(wire, i); - SigBit mapped_wire_bit = sigmap(wire_bit); - - if (wire_bit == mapped_wire_bit) - goto next_wire; - - if (mapped_wire_bit.wire) { - if (qbits.count(mapped_wire_bit) == 0) - goto next_wire; - - if (qbits.at(mapped_wire_bit) != init[i]) - goto next_wire; - } - else { - if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz) - goto next_wire; - - if (mapped_wire_bit != init[i]) { - log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i])); - goto next_wire; + results.insert(ctx, {bit, init[i]}); } } } + }); + dict qbits; + for (std::pair &p : results) { + sigmap.add(p.first); + qbits[p.first] = p.second; + } + ShardedVector wire_results(subpool); + subpool.run([const_module, &sigmap, &qbits, &wire_results](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int j : ctx.item_range(const_module->wires_size())) { + RTLIL::Wire *wire = const_module->wire_at(j); + if (wire->attributes.count(ID::init) == 0) + continue; + Const init = wire->attributes.at(ID::init); + + for (int i = 0; i < GetSize(wire) && i < GetSize(init); i++) + { + if (init[i] == State::Sx || init[i] == State::Sz) + continue; + + SigBit wire_bit = SigBit(wire, i); + SigBit mapped_wire_bit = sigmap(wire_bit); + + if (wire_bit == mapped_wire_bit) + goto next_wire; + + if (mapped_wire_bit.wire) { + if (qbits.count(mapped_wire_bit) == 0) + goto next_wire; + + if (qbits.at(mapped_wire_bit) != init[i]) + goto next_wire; + } + else { + if (mapped_wire_bit == State::Sx || mapped_wire_bit == State::Sz) + goto next_wire; + + if (mapped_wire_bit != init[i]) { + log_warning("Initial value conflict for %s resolving to %s but with init %s.\n", log_signal(wire_bit), log_signal(mapped_wire_bit), log_signal(init[i])); + goto next_wire; + } + } + } + wire_results.insert(ctx, wire); + + next_wire:; + } + }); + + bool did_something = false; + for (RTLIL::Wire *wire : wire_results) { if (verbose) log_debug(" removing redundant init attribute on %s.\n", log_id(wire)); - wire->attributes.erase(ID::init); did_something = true; - next_wire:; } if (did_something) @@ -595,47 +983,53 @@ bool rmunused_module_init(RTLIL::Module *module, bool verbose) return did_something; } -void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool rminit) +void remove_temporary_cells(RTLIL::Module *module, ParallelDispatchThreadPool::Subpool &subpool, bool verbose) { - if (verbose) - log("Finding unused cells or wires in module %s..\n", module->name); + ShardedVector delcells(subpool); + ShardedVector new_connections(subpool); + const RTLIL::Module *const_module = module; + subpool.run([const_module, &delcells, &new_connections](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(const_module->cells_size())) { + RTLIL::Cell *cell = const_module->cell_at(i); + if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) { + bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool(); + RTLIL::SigSpec a = cell->getPort(ID::A); + RTLIL::SigSpec y = cell->getPort(ID::Y); + a.extend_u0(GetSize(y), is_signed); - std::vector delcells; - for (auto cell : module->cells()) { - if (cell->type.in(ID($pos), ID($_BUF_), ID($buf)) && !cell->has_keep_attr()) { - bool is_signed = cell->type == ID($pos) && cell->getParam(ID::A_SIGNED).as_bool(); - RTLIL::SigSpec a = cell->getPort(ID::A); - RTLIL::SigSpec y = cell->getPort(ID::Y); - a.extend_u0(GetSize(y), is_signed); - - if (a.has_const(State::Sz)) { - SigSpec new_a; - SigSpec new_y; - for (int i = 0; i < GetSize(a); ++i) { - SigBit b = a[i]; - if (b == State::Sz) - continue; - new_a.append(b); - new_y.append(y[i]); + if (a.has_const(State::Sz)) { + RTLIL::SigSpec new_a; + RTLIL::SigSpec new_y; + for (int i = 0; i < GetSize(a); ++i) { + RTLIL::SigBit b = a[i]; + if (b == State::Sz) + continue; + new_a.append(b); + new_y.append(y[i]); + } + a = std::move(new_a); + y = std::move(new_y); } - a = std::move(new_a); - y = std::move(new_y); + if (!y.empty()) + new_connections.insert(ctx, {y, a}); + delcells.insert(ctx, cell); + } else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) { + RTLIL::SigSpec a = cell->getPort(ID::A); + RTLIL::SigSpec b = cell->getPort(ID::B); + if (a.has_const() && !b.has_const()) + std::swap(a, b); + new_connections.insert(ctx, {a, b}); + delcells.insert(ctx, cell); + } else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) { + delcells.insert(ctx, cell); } - if (!y.empty()) - module->connect(y, a); - delcells.push_back(cell); - } else if (cell->type.in(ID($connect)) && !cell->has_keep_attr()) { - RTLIL::SigSpec a = cell->getPort(ID::A); - RTLIL::SigSpec b = cell->getPort(ID::B); - if (a.has_const() && !b.has_const()) - std::swap(a, b); - module->connect(a, b); - delcells.push_back(cell); - } else if (cell->type.in(ID($input_port)) && !cell->has_keep_attr()) { - delcells.push_back(cell); } + }); + bool did_something = false; + for (RTLIL::SigSig &connection : new_connections) { + module->connect(connection); } - for (auto cell : delcells) { + for (RTLIL::Cell *cell : delcells) { if (verbose) { if (cell->type == ID($connect)) log_debug(" removing connect cell `%s': %s <-> %s\n", cell->name, @@ -648,17 +1042,28 @@ void rmunused_module(RTLIL::Module *module, bool purge_mode, bool verbose, bool log_signal(cell->getPort(ID::Y)), log_signal(cell->getPort(ID::A))); } module->remove(cell); + did_something = true; } - if (!delcells.empty()) + if (did_something) module->design->scratchpad_set_bool("opt.did_something", true); - - rmunused_module_cells(module, verbose); - while (rmunused_module_signals(module, purge_mode, verbose)) { } - - if (rminit && rmunused_module_init(module, verbose)) - while (rmunused_module_signals(module, purge_mode, verbose)) { } } +void rmunused_module(RTLIL::Module *module, ParallelDispatchThreadPool &thread_pool, bool purge_mode, bool verbose, bool rminit, RmStats &stats, keep_cache_t &keep_cache) +{ + if (verbose) + log("Finding unused cells or wires in module %s..\n", module->name); + + // Use no more than one worker per thousand cells, rounded down, so + // we only start multithreading with at least 2000 cells. + int num_worker_threads = ThreadPool::work_pool_size(0, module->cells_size(), 1000); + ParallelDispatchThreadPool::Subpool subpool(thread_pool, num_worker_threads); + remove_temporary_cells(module, subpool, verbose); + rmunused_module_cells(module, subpool, verbose, stats, keep_cache); + while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { } + + if (rminit && rmunused_module_init(module, subpool, verbose)) + while (rmunused_module_signals(module, subpool, purge_mode, verbose, stats)) { } +} struct OptCleanPass : public Pass { OptCleanPass() : Pass("opt_clean", "remove unused cells and wires") { } void help() override @@ -695,7 +1100,15 @@ struct OptCleanPass : public Pass { } extra_args(args, argidx, design); - keep_cache.reset(design, purge_mode); + std::vector selected_modules; + for (auto module : design->selected_whole_modules_warn()) + if (!module->has_processes_warn()) + selected_modules.push_back(module); + int thread_pool_size = 0; + for (RTLIL::Module *m : selected_modules) + thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000)); + ParallelDispatchThreadPool thread_pool(thread_pool_size); + keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules); ct_reg.setup_internals_mem(); ct_reg.setup_internals_anyinit(); @@ -703,22 +1116,14 @@ struct OptCleanPass : public Pass { ct_all.setup(design); - count_rm_cells = 0; - count_rm_wires = 0; - - for (auto module : design->selected_whole_modules_warn()) { - if (module->has_processes_warn()) - continue; - rmunused_module(module, purge_mode, true, true); - } - - if (count_rm_cells > 0 || count_rm_wires > 0) - log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires); + RmStats stats; + for (auto module : selected_modules) + rmunused_module(module, thread_pool, purge_mode, true, true, stats, keep_cache); + stats.log(); design->optimize(); design->check(); - keep_cache.reset(); ct_reg.clear(); ct_all.clear(); log_pop(); @@ -758,7 +1163,15 @@ struct CleanPass : public Pass { } extra_args(args, argidx, design); - keep_cache.reset(design); + std::vector selected_modules; + for (auto module : design->selected_unboxed_whole_modules()) + if (!module->has_processes()) + selected_modules.push_back(module); + int thread_pool_size = 0; + for (RTLIL::Module *m : selected_modules) + thread_pool_size = std::max(thread_pool_size, ThreadPool::work_pool_size(0, m->cells_size(), 1000)); + ParallelDispatchThreadPool thread_pool(thread_pool_size); + keep_cache_t keep_cache(purge_mode, thread_pool, selected_modules); ct_reg.setup_internals_mem(); ct_reg.setup_internals_anyinit(); @@ -766,23 +1179,16 @@ struct CleanPass : public Pass { ct_all.setup(design); - count_rm_cells = 0; - count_rm_wires = 0; - - for (auto module : design->selected_unboxed_whole_modules()) { - if (module->has_processes()) - continue; - rmunused_module(module, purge_mode, ys_debug(), true); - } + RmStats stats; + for (auto module : selected_modules) + rmunused_module(module, thread_pool, purge_mode, ys_debug(), true, stats, keep_cache); log_suppressed(); - if (count_rm_cells > 0 || count_rm_wires > 0) - log("Removed %d unused cells and %d unused wires.\n", count_rm_cells, count_rm_wires); + stats.log(); design->optimize(); design->check(); - keep_cache.reset(); ct_reg.clear(); ct_all.clear(); diff --git a/tests/opt/opt_clean_init_const.ys b/tests/opt/opt_clean_init_const.ys new file mode 100644 index 000000000..1b3d5db63 --- /dev/null +++ b/tests/opt/opt_clean_init_const.ys @@ -0,0 +1,9 @@ +read_rtlil << EOT +module \top + attribute \init 1'0 + wire \w + + connect \w 1'0 +end +EOT +opt_clean diff --git a/tests/tools/rtlil-fuzz-grammar.json b/tests/tools/rtlil-fuzz-grammar.json index c27b160f4..96af9bde3 100644 --- a/tests/tools/rtlil-fuzz-grammar.json +++ b/tests/tools/rtlil-fuzz-grammar.json @@ -8,7 +8,7 @@ "end\n" ] ], - "": [ [ " wire width ", "", " ", "", " ", "", "\n" ] ], + "": [ [ "", " wire width ", "", " ", "", " ", "", "\n" ] ], "": [ [ "1" ], [ "2" ], [ "3" ], [ "4" ], [ "32" ], [ "128" ] ], "": [ [ "input ", "" ], [ "output ", "" ], [ "inout ", "" ], [] ], "": [ @@ -71,6 +71,7 @@ " end\n" ] ], + "": [ [ " attribute \\init ", "", "\n" ] ], "": [ [ "\\wire_a" ], [ "\\wire_b" ], [ "\\wire_c" ], [ "\\wire_d" ], [ "\\wire_e" ], [ "\\wire_f" ], [ "\\wire_g" ], [ "\\wire_h" ], [ "\\wire_i" ], [ "\\wire_j" ] ], "": [ [ "\\cell_a" ], [ "\\cell_b" ], [ "\\cell_c" ], [ "\\cell_d" ], [ "\\cell_e" ], [ "\\cell_f" ], [ "\\cell_g" ], [ "\\cell_h" ], [ "\\cell_i" ], [ "\\cell_j" ] ], "": [ [ "\\bb1" ], [ "\\bb2" ] ], @@ -97,6 +98,7 @@ "": [ [ " connect ", "", " ", "", "\n" ] ], "": [ [ ], [ "", "" ] ], + "": [ [ ], [ "", "" ] ], "": [ [ ], [ "", "" ] ], "": [ [ ], [ "", "" ] ], "": [ [ ], [ "", "" ] ], diff --git a/tests/unit/Makefile b/tests/unit/Makefile index b275d7f41..3165ad97b 100644 --- a/tests/unit/Makefile +++ b/tests/unit/Makefile @@ -4,10 +4,10 @@ UNAME_S := $(shell uname -s) GTEST_PREFIX := $(shell brew --prefix googletest 2>/dev/null) ifeq ($(GTEST_PREFIX),) GTEST_CXXFLAGS := - GTEST_LDFLAGS := -lgtest -lgtest_main + GTEST_LDFLAGS := -lgtest -lgmock -lgtest_main else GTEST_CXXFLAGS := -I$(GTEST_PREFIX)/include - GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgtest_main + GTEST_LDFLAGS := -L$(GTEST_PREFIX)/lib -lgtest -lgmock -lgtest_main endif ifeq ($(UNAME_S),Darwin) diff --git a/tests/unit/kernel/threadingTest.cc b/tests/unit/kernel/threadingTest.cc new file mode 100644 index 000000000..c0bd5927f --- /dev/null +++ b/tests/unit/kernel/threadingTest.cc @@ -0,0 +1,442 @@ +#include +#include +#include "kernel/threading.h" + +YOSYS_NAMESPACE_BEGIN + +class ThreadingTest : public testing::Test { +protected: + ThreadingTest() { + if (log_files.empty()) + log_files.emplace_back(stdout); + } +}; + +TEST_F(ThreadingTest, ParallelDispatchThreadPoolCreate) { + // Test creating a pool with 0 threads (treated as 1) + ParallelDispatchThreadPool pool0(0); + EXPECT_EQ(pool0.num_threads(), 1); + + // Test creating a pool with 1 thread + ParallelDispatchThreadPool pool1(1); + EXPECT_EQ(pool1.num_threads(), 1); + + // Test creating a pool with 2 threads + ParallelDispatchThreadPool pool2(2); + // YOSYS_MAX_THREADS or system configuration could mean we + // decide to only use one thread. + EXPECT_GE(pool2.num_threads(), 1); + EXPECT_LE(pool2.num_threads(), 2); +} + +TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunSimple) { + ParallelDispatchThreadPool pool(2); + + std::atomic counter{0}; + pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) { + counter.fetch_add(1, std::memory_order_relaxed); + }); + + EXPECT_EQ(counter.load(), pool.num_threads()); +} + +TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunMultiple) { + ParallelDispatchThreadPool pool(2); + + std::atomic counter{0}; + // Run multiple times to verify the pool can be reused + for (int i = 0; i < 5; ++i) + pool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) { + counter.fetch_add(1, std::memory_order_relaxed); + }); + + EXPECT_EQ(counter.load(), pool.num_threads() * 5); +} + +TEST_F(ThreadingTest, ParallelDispatchThreadPoolRunCtxThreadNums) { + ParallelDispatchThreadPool pool(4); + + std::vector thread_nums(pool.num_threads(), -1); + pool.run([&thread_nums](const ParallelDispatchThreadPool::RunCtx &ctx) { + thread_nums[ctx.thread_num] = ctx.thread_num; + }); + + // Every thread should have recorded its own thread number + for (int i = 0; i < pool.num_threads(); ++i) + EXPECT_EQ(thread_nums[i], i); +} + +TEST_F(ThreadingTest, ParallelDispatchThreadPoolItemRange) { + ParallelDispatchThreadPool pool(3); + + const int num_items = 100; + std::vector> item_counts(num_items); + for (std::atomic &c : item_counts) + c.store(0); + + pool.run([&item_counts](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i : ctx.item_range(num_items)) + item_counts[i].fetch_add(1); + }); + + // Each item should have been processed exactly once + for (int i = 0; i < num_items; ++i) + EXPECT_EQ(item_counts[i].load(), 1); +} + +TEST_F(ThreadingTest, ParallelDispatchThreadPoolSubpool) { + ParallelDispatchThreadPool pool(4); + + // Subpool limited to 2 threads + ParallelDispatchThreadPool::Subpool subpool(pool, 2); + EXPECT_LE(subpool.num_threads(), 2); + + std::atomic counter{0}; + subpool.run([&counter](const ParallelDispatchThreadPool::RunCtx &) { + counter.fetch_add(1, std::memory_order_relaxed); + }); + + EXPECT_EQ(counter.load(), subpool.num_threads()); +} + +TEST_F(ThreadingTest, IntRangeIteration) { + IntRange range{3, 7}; + std::vector values; + for (int i : range) + values.push_back(i); + EXPECT_THAT(values, testing::ElementsAre(3, 4, 5, 6)); +} + +TEST_F(ThreadingTest, IntRangeEmpty) { + IntRange range{5, 5}; + for (int _ : range) + FAIL(); +} + +TEST_F(ThreadingTest, ItemRangeForWorker) { + EXPECT_EQ(item_range_for_worker(10, 0, 3), (IntRange{0, 4})); + EXPECT_EQ(item_range_for_worker(10, 1, 3), (IntRange{4, 7})); + EXPECT_EQ(item_range_for_worker(10, 2, 3), (IntRange{7, 10})); +} + +TEST_F(ThreadingTest, ItemRangeForWorkerZeroThreads) { + EXPECT_EQ(item_range_for_worker(10, 0, 0), (IntRange{0, 10})); +} + +TEST_F(ThreadingTest, ShardedVectorBasic) { + ParallelDispatchThreadPool pool(2); + ShardedVector vec(pool); + pool.run([&vec](const ParallelDispatchThreadPool::RunCtx &ctx) { + vec.insert(ctx, ctx.thread_num * 10); + vec.insert(ctx, ctx.thread_num * 10 + 1); + }); + + EXPECT_FALSE(vec.empty()); + + // Count elements + std::vector elements; + for (int v : vec) { + elements.push_back(v); + } + + if (pool.num_threads() == 2) + EXPECT_THAT(elements, testing::ElementsAre(0, 1, 10, 11)); + else + EXPECT_THAT(elements, testing::ElementsAre(0, 1)); +} + +TEST_F(ThreadingTest, MonotonicFlagBasic) { + MonotonicFlag flag; + EXPECT_FALSE(flag.load()); + flag.set(); + EXPECT_TRUE(flag.load()); + flag.set(); + EXPECT_TRUE(flag.load()); +} + +TEST_F(ThreadingTest, MonotonicFlagSetAndReturnOld) { + MonotonicFlag flag; + EXPECT_FALSE(flag.set_and_return_old()); + EXPECT_TRUE(flag.load()); + EXPECT_TRUE(flag.set_and_return_old()); +} + +TEST_F(ThreadingTest, ConcurrentQueueBasic) { + ConcurrentQueue queue; + queue.push_back(1); + queue.push_back(2); + queue.push_back(3); + + auto v1 = queue.pop_front(); + auto v2 = queue.pop_front(); + auto v3 = queue.pop_front(); + + ASSERT_TRUE(v1.has_value()); + ASSERT_TRUE(v2.has_value()); + ASSERT_TRUE(v3.has_value()); + EXPECT_EQ(*v1, 1); + EXPECT_EQ(*v2, 2); + EXPECT_EQ(*v3, 3); +} + +TEST_F(ThreadingTest, ConcurrentQueueTryPopEmpty) { + ConcurrentQueue queue; + auto v = queue.try_pop_front(); + EXPECT_FALSE(v.has_value()); +} + +TEST_F(ThreadingTest, ConcurrentQueueClose) { + ConcurrentQueue queue; + queue.push_back(42); + queue.close(); + + // Can still pop existing elements + auto v1 = queue.pop_front(); + ASSERT_TRUE(v1.has_value()); + EXPECT_EQ(*v1, 42); + + // After close and empty, pop_front returns nullopt + auto v2 = queue.pop_front(); + EXPECT_FALSE(v2.has_value()); +} + +TEST_F(ThreadingTest, ThreadPoolCreate) { + // pool_size of 0 means no worker threads + ThreadPool pool0(0, [](int) {}); + EXPECT_EQ(pool0.num_threads(), 0); + + // pool_size of 1 means 1 worker thread + std::atomic counter{0}; + { + ThreadPool pool1(1, [&counter](int thread_num) { + EXPECT_EQ(thread_num, 0); + counter.fetch_add(1); + }); + } +#ifdef YOSYS_ENABLE_THREADS + EXPECT_EQ(counter.load(), 1); +#else + EXPECT_EQ(counter.load(), 0); +#endif +} + +TEST_F(ThreadingTest, ThreadPoolMultipleThreads) { + std::atomic counter{0}; + { + ThreadPool pool(2, [&counter](int) { + counter.fetch_add(1); + }); + EXPECT_LE(pool.num_threads(), 2); + } +#ifdef YOSYS_ENABLE_THREADS + EXPECT_GE(counter.load(), 1); + EXPECT_LE(counter.load(), 2); +#else + EXPECT_EQ(counter.load(), 0); +#endif +} + +// Helper types for ShardedHashSet tests +struct IntValue { + using Accumulated = IntValue; + int value; + operator int() const { return value; } +}; + +struct IntValueEquality { + bool operator()(int a, int b) const { return a == b; } +}; + +TEST_F(ThreadingTest, ShardedHashSetBasic) { + ParallelDispatchThreadPool pool(1); + + using HashSet = ShardedHashSet; + HashSet::Builder builder(pool); + + // Insert some values + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + builder.insert(ctx, {{10}, 10}); + builder.insert(ctx, {{20}, 20}); + builder.insert(ctx, {{30}, 30}); + }); + + // Process + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + builder.process(ctx); + }); + + // Build and lookup + HashSet set(builder); + const IntValue *found10 = set.find({{10}, 10}); + const IntValue *found20 = set.find({{20}, 20}); + const IntValue *found99 = set.find({{99}, 99}); + + ASSERT_NE(found10, nullptr); + ASSERT_NE(found20, nullptr); + EXPECT_EQ(found99, nullptr); + EXPECT_EQ(*found10, 10); + EXPECT_EQ(*found20, 20); +} + +TEST_F(ThreadingTest, ShardedHashSetParallelInsert) { + ParallelDispatchThreadPool pool(3); + + using HashSet = ShardedHashSet; + HashSet::Builder builder(pool); + + // Insert values from multiple threads + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + for (int i = 0; i < 10; ++i) { + int val = ctx.thread_num * 100 + i; + builder.insert(ctx, {{val}, static_cast(val)}); + } + }); + + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + builder.process(ctx); + }); + + HashSet set(builder); + + // Verify all values can be found + for (int t = 0; t < pool.num_threads(); ++t) { + for (int i = 0; i < 10; ++i) { + int val = t * 100 + i; + const IntValue *found = set.find({{val}, static_cast(val)}); + ASSERT_NE(found, nullptr) << "Value " << val << " not found"; + EXPECT_EQ(*found, val); + } + } +} + +// Helper types for ShardedHashSet tests +struct IntDictValue { + using Accumulated = IntDictValue; + int key; + int value; + bool operator==(const IntDictValue &other) const { return key == other.key && value == other.value; } + bool operator!=(const IntDictValue &other) const { return !(*this == other); } +}; + +struct IntDictKeyEquality { + bool operator()(const IntDictValue &a, const IntDictValue &b) const { return a.key == b.key; } +}; + +// Collision handler that sums values +struct SumCollisionHandler { + void operator()(IntDictValue &existing, IntDictValue &incoming) const { + existing.value += incoming.value; + } +}; + +TEST_F(ThreadingTest, ShardedHashSetCollision) { + ParallelDispatchThreadPool pool(1); + + using HashSet = ShardedHashSet; + HashSet::Builder builder(pool); + + // Insert duplicate keys with same hash - duplicates should collapse + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + builder.insert(ctx, {{5, 10}, 5}); + builder.insert(ctx, {{5, 12}, 5}); // Duplicate key/hash + builder.insert(ctx, {{5, 14}, 5}); // Another duplicate + }); + + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + builder.process(ctx); + }); + + HashSet set(builder); + const IntDictValue *found = set.find({{5, 0}, 5}); + ASSERT_NE(found, nullptr); + // With default collision handler, first value is kept + EXPECT_EQ(*found, (IntDictValue{5, 36})); +} + +TEST_F(ThreadingTest, ShardedHashSetEmpty) { + ParallelDispatchThreadPool pool(1); + + using HashSet = ShardedHashSet; + HashSet::Builder builder(pool); + + // Don't insert anything, just process + pool.run([&builder](const ParallelDispatchThreadPool::RunCtx &ctx) { + builder.process(ctx); + }); + + HashSet set(builder); + const IntValue *found = set.find({{42}, 42}); + EXPECT_EQ(found, nullptr); +} + +TEST_F(ThreadingTest, ConcurrentWorkQueueSingleThread) { + ConcurrentWorkQueue queue(1, 10); // 1 thread, batch size 10 + EXPECT_EQ(queue.num_threads(), 1); + + ThreadIndex thread{0}; + + // Push some items (less than batch size) + for (int i = 0; i < 5; ++i) + queue.push(thread, i); + + // Pop should return those items + std::vector batch = queue.pop_batch(thread); + EXPECT_THAT(batch, testing::UnorderedElementsAre(0, 1, 2, 3, 4)); + + // Next pop should return empty (all threads "waiting") + std::vector empty_batch = queue.pop_batch(thread); + EXPECT_TRUE(empty_batch.empty()); +} + +TEST_F(ThreadingTest, ConcurrentWorkQueueBatching) { + ConcurrentWorkQueue queue(1, 3); // batch size 3 + ThreadIndex thread{0}; + + queue.push(thread, 10); + queue.push(thread, 20); + queue.push(thread, 30); + queue.push(thread, 40); + queue.push(thread, 50); + + std::vector popped; + while (true) { + std::vector batch = queue.pop_batch(thread); + if (batch.empty()) + break; + popped.insert(popped.end(), batch.begin(), batch.end()); + } + EXPECT_THAT(popped, testing::UnorderedElementsAre(10, 20, 30, 40, 50)); +} + +TEST_F(ThreadingTest, ConcurrentWorkQueueParallel) { + ParallelDispatchThreadPool pool(2); + if (pool.num_threads() < 2) { + // Skip test if we don't have multiple threads + return; + } + + ConcurrentWorkQueue queue(2, 3); + std::atomic sum{0}; + + pool.run([&queue, &sum](const ParallelDispatchThreadPool::RunCtx &ctx) { + // Each thread pushes some work + for (int i = 0; i < 10; ++i) + queue.push(ctx, ctx.thread_num * 100 + i); + + // Each thread processes work until done + while (true) { + std::vector batch = queue.pop_batch(ctx); + if (batch.empty()) + break; + for (int v : batch) + sum.fetch_add(v); + } + }); + + // Thread 0 pushes: 0+1+2+...+9 = 45 + // Thread 1 pushes: 100+101+...+109 = 1045 + // Total = 45 + 1045 = 1090 + EXPECT_EQ(sum.load(), 1090); +} + +YOSYS_NAMESPACE_END