diff --git a/kernel/threading.cc b/kernel/threading.cc
index 8c9bfb390..3766c4ddf 100644
--- a/kernel/threading.cc
+++ b/kernel/threading.cc
@@ -92,4 +92,59 @@ IntRange item_range_for_worker(int num_items, int thread_num, int num_threads)
 	return {start, end};
 }
 
+ParallelDispatchThreadPool::ParallelDispatchThreadPool(int pool_size)
+	: num_worker_threads_(std::max(1, pool_size) - 1)
+{
+#ifdef YOSYS_ENABLE_THREADS
+	main_to_workers_signal.resize(num_worker_threads_, 0);
+#endif
+	// Don't start the threads until we've constructed all our data members.
+	thread_pool = std::make_unique<ThreadPool>(num_worker_threads_, [this](int thread_num){
+		run_worker(thread_num);
+	});
+}
+
+ParallelDispatchThreadPool::~ParallelDispatchThreadPool()
+{
+#ifdef YOSYS_ENABLE_THREADS
+	if (num_worker_threads_ == 0)
+		return;
+	current_work = nullptr;
+	num_active_worker_threads_ = num_worker_threads_;
+	signal_workers_start();
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run(std::function<void(RunCtx)> work, int max_threads)
+{
+	Multithreading multithreading;
+	num_active_worker_threads_ = num_threads(max_threads) - 1;
+	if (num_active_worker_threads_ == 0) {
+		work({{0}, 1});
+		return;
+	}
+#ifdef YOSYS_ENABLE_THREADS
+	current_work = &work;
+	signal_workers_start();
+	work({{0}, num_active_worker_threads_ + 1});
+	wait_for_workers_done();
+#endif
+}
+
+void ParallelDispatchThreadPool::run_worker(int thread_num)
+{
+#ifdef YOSYS_ENABLE_THREADS
+	while (true)
+	{
+		worker_wait_for_start(thread_num);
+		if (current_work == nullptr)
+			break;
+		(*current_work)({{thread_num + 1}, num_active_worker_threads_ + 1});
+		signal_worker_done();
+	}
+	signal_worker_done();
+#endif
+}
+
 YOSYS_NAMESPACE_END
diff --git a/kernel/threading.h b/kernel/threading.h
index eb068bb20..da21a274e 100644
--- a/kernel/threading.h
+++ b/kernel/threading.h
@@ -183,6 +183,113 @@ struct ThreadIndex {
 	int thread_num;
 };
 
+// A set of threads with a `run()` API that runs a closure on all of the threads
+// and waits for all those closures to complete. This is a convenient way to implement
+// parallel algorithms that use barrier synchronization.
+class ParallelDispatchThreadPool
+{
+public:
+	// Create a pool of threads running the given closure (parameterized by thread number).
+	// `pool_size` must be the result of a `pool_size()` call.
+	// `pool_size` can be zero, which we treat as 1.
+	ParallelDispatchThreadPool(int pool_size);
+	~ParallelDispatchThreadPool();
+
+	// For each thread running a closure, a `RunCtx` is passed to the closure. Currently
+	// it contains the thread index and the total number of threads. It can be passed
+	// directly to any APIs requiring a `ThreadIndex`.
+	struct RunCtx : public ThreadIndex {
+		int num_threads;
+		IntRange item_range(int num_items) const {
+			return item_range_for_worker(num_items, thread_num, num_threads);
+		}
+	};
+	// Sometimes we only want to activate a subset of the threads in the pool. This
+	// class provides a way to do that. It provides the same `num_threads()`
+	// and `run()` APIs as a `ParallelDispatchThreadPool`.
+	class Subpool {
+	public:
+		Subpool(ParallelDispatchThreadPool &parent, int max_threads)
+			: parent(parent), max_threads(max_threads) {}
+		// Returns the number of threads that will be used when calling `run()`.
+		int num_threads() const {
+			return parent.num_threads(max_threads);
+		}
+		void run(std::function<void(RunCtx)> work) {
+			parent.run(std::move(work), max_threads);
+		}
+		ParallelDispatchThreadPool &thread_pool() { return parent; }
+	private:
+		ParallelDispatchThreadPool &parent;
+		int max_threads;
+	};
+
+	// Run the `work` function in parallel on each thread in the pool (parameterized by
+	// thread number). Waits for all work functions to complete. Only one `run()` can be
+	// active at a time.
+	// Uses no more than `max_threads` threads (but at least one).
+	void run(std::function<void(RunCtx)> work) {
+		run(std::move(work), INT_MAX);
+	}
+
+	// Returns the number of threads that will be used when calling `run()`.
+	int num_threads() const {
+		return num_threads(INT_MAX);
+	}
+private:
+	friend class Subpool;
+
+	void run(std::function<void(RunCtx)> work, int max_threads);
+	int num_threads(int max_threads) const {
+		return std::min(num_worker_threads_ + 1, std::max(1, max_threads));
+	}
+	void run_worker(int thread_num);
+
+	std::unique_ptr<ThreadPool> thread_pool;
+	std::function<void(RunCtx)> *current_work = nullptr;
+	// Keeps a correct count even when threads are exiting.
+	int num_worker_threads_;
+	// The count of active worker threads for the current `run()`.
+	int num_active_worker_threads_ = 0;
+
+#ifdef YOSYS_ENABLE_THREADS
+	// Not especially efficient for large numbers of threads. Worker wakeup could scale
+	// better by conceptually organising workers into a tree and having workers wake
+	// up their children.
+	std::mutex main_to_workers_signal_mutex;
+	std::condition_variable main_to_workers_signal_cv;
+	std::vector<int> main_to_workers_signal;
+	void signal_workers_start() {
+		std::unique_lock<std::mutex> lock(main_to_workers_signal_mutex);
+		std::fill(main_to_workers_signal.begin(), main_to_workers_signal.begin() + num_active_worker_threads_, 1);
+		// When `num_active_worker_threads_` is small compared to `num_worker_threads_`, we have a "thundering herd"
+		// problem here. Fixing that would add complexity so don't worry about it for now.
+		main_to_workers_signal_cv.notify_all();
+	}
+	void worker_wait_for_start(int thread_num) {
+		std::unique_lock<std::mutex> lock(main_to_workers_signal_mutex);
+		main_to_workers_signal_cv.wait(lock, [this, thread_num] { return main_to_workers_signal[thread_num] > 0; });
+		main_to_workers_signal[thread_num] = 0;
+	}
+
+	std::atomic<int> done_workers = 0;
+	std::mutex workers_to_main_signal_mutex;
+	std::condition_variable workers_to_main_signal_cv;
+	void signal_worker_done() {
+		int d = done_workers.fetch_add(1, std::memory_order_release);
+		if (d + 1 == num_active_worker_threads_) {
+			std::unique_lock<std::mutex> lock(workers_to_main_signal_mutex);
+			workers_to_main_signal_cv.notify_all();
+		}
+	}
+	void wait_for_workers_done() {
+		std::unique_lock<std::mutex> lock(workers_to_main_signal_mutex);
+		workers_to_main_signal_cv.wait(lock, [this] { return done_workers.load(std::memory_order_acquire) == num_active_worker_threads_; });
+		done_workers.store(0, std::memory_order_relaxed);
+	}
+#endif
+};
+
 template <typename T>
 class ConcurrentStack
 {
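For context, here is a minimal usage sketch of the API this patch adds (not part of the diff). It relies only on `run()`, `num_threads()`, and the `RunCtx` members shown above; the function name `sum_ints`, the `partial` vector, and the strided work split are illustrative assumptions rather than anything from the patch.

// Hypothetical example: sum a vector in parallel on a ParallelDispatchThreadPool.
#include "kernel/yosys.h"
#include "kernel/threading.h"
#include <vector>

USING_YOSYS_NAMESPACE

long sum_ints(ParallelDispatchThreadPool &pool, const std::vector<int> &data)
{
	// One slot per thread so workers never write to shared state concurrently.
	std::vector<long> partial(pool.num_threads(), 0);

	// run() invokes the closure once per thread and returns only after every
	// invocation has completed (barrier semantics).
	pool.run([&](ParallelDispatchThreadPool::RunCtx ctx) {
		long sum = 0;
		// Simple strided split of the items; ctx.item_range(num_items) could be
		// used instead to get a contiguous per-thread range.
		for (size_t i = ctx.thread_num; i < data.size(); i += ctx.num_threads)
			sum += data[i];
		partial[ctx.thread_num] = sum;
	});

	// All workers are done once run() returns, so `partial` is fully populated.
	long total = 0;
	for (long s : partial)
		total += s;
	return total;
}

A pass that only wants a bounded amount of parallelism could do the same through a `Subpool(pool, max_threads)`, which forwards `num_threads()` and `run()` to the parent pool with the cap applied.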