diff --git a/kernel/wallace_tree.h b/kernel/wallace_tree.h
new file mode 100644
index 000000000..eb3513803
--- /dev/null
+++ b/kernel/wallace_tree.h
@@ -0,0 +1,133 @@
+/**
+ * Wallace tree utilities for multi-operand addition using carry-save adders
+ *
+ * Terminology:
+ * - compressor: $fa viewed as reducing 3 inputs to 2 outputs (sum + shifted carry) (3:2 compressor)
+ * - level: A stage of parallel compression operations
+ * - depth: Maximum number of 3:2 compressor levels from any input to a signal
+ *
+ * References:
+ * - "Binary Adder Architectures for Cell-Based VLSI and their Synthesis" (https://iis-people.ee.ethz.ch/~zimmi/publications/adder_arch.pdf)
+ * - "A Suggestion for a Fast Multiplier" (https://www.ece.ucdavis.edu/~vojin/CLASSES/EEC280/Web-page/papers/Arithmetic/Wallace_mult.pdf)
+ */
+
+#ifndef WALLACE_TREE_H
+#define WALLACE_TREE_H
+
+#include "kernel/sigtools.h"
+#include "kernel/yosys.h"
+
+YOSYS_NAMESPACE_BEGIN
+
+/**
+ * emit_fa() - Add one $fa cell (3:2 compressor) to a module
+ * @module: The Yosys module to which the $fa cell and its two output wires are added
+ * @a: First input operand
+ * @b: Second input operand
+ * @c: Third input operand
+ * @width: Bit-width of the created sum and carry-out wires; must be positive
+ *
+ * Return: {sum, carry}, where carry is the carry-out moved up by one bit
+ * position (constant 0 at bit 0, top carry-out bit dropped), so that both
+ * results are @width bits wide
+ */
+inline std::pair<SigSpec, SigSpec> emit_fa(Module *module, SigSpec a, SigSpec b, SigSpec c, int width)
+{
+	// cout.extract(0, width - 1) below must not be asked for a negative length
+	log_assert(width > 0);
+
+	SigSpec sum = module->addWire(NEW_ID, width);
+	SigSpec cout = module->addWire(NEW_ID, width);
+
+	module->addFa(NEW_ID, a, b, c, cout, sum);
+
+	// Move the carry-out up one bit: constant 0 at bit 0, then the low width - 1 carry bits
+	SigSpec carry;
+	carry.append(State::S0);
+	carry.append(cout.extract(0, width - 1));
+	return {sum, carry};
+}
+
+/**
+ * wallace_reduce_scheduled() - Reduce multiple operands to two using a Wallace tree
+ * @module: The Yosys module to which the compressors will be added
+ * @sigs: Vector of input signals (operands) to be reduced; every entry is
+ *        passed through extend_u0(@width) in place, so the caller's vector is modified
+ * @width: Target bit-width to which all operands will be zero-extended
+ * @compressor_count: Optional pointer to return the number of $fa cells emitted
+ *
+ * Return: The final two reduced operands, that are to be fed into an adder.
+ * With fewer than two inputs, each missing operand is returned as @width constant zeros.
+ */
+inline std::pair<SigSpec, SigSpec> wallace_reduce_scheduled(Module *module, std::vector<SigSpec> &sigs, int width, int *compressor_count = nullptr)
+{
+	struct DepthSig {
+		SigSpec sig;
+		int depth;
+	};
+
+	for (auto &s : sigs)
+		s.extend_u0(width);
+
+	std::vector<DepthSig> operands;
+	operands.reserve(sigs.size());
+	for (auto &s : sigs)
+		operands.push_back({s, 0});
+
+	// Number of $fa's emitted
+	if (compressor_count)
+		*compressor_count = 0;
+
+	// Only compress operands ready at current level
+	// NOTE(review): all operands enter at depth 0 and each level adds at most 1, so every
+	// operand is ready at every level and "waiting" stays empty for these inputs
+	for (int level = 0; operands.size() > 2; level++) {
+		// Partition operands into ready and waiting
+		std::vector<DepthSig> ready, waiting;
+		for (auto &op : operands) {
+			if (op.depth <= level)
+				ready.push_back(op);
+			else
+				waiting.push_back(op);
+		}
+
+		// Fewer than three ready operands: nothing to compress, retry at the next level
+		if (ready.size() < 3)
+			continue;
+
+		// Apply compressors to ready operands
+		std::vector<DepthSig> compressed;
+		size_t i = 0;
+		while (i + 2 < ready.size()) {
+			auto [sum, carry] = emit_fa(module, ready[i].sig, ready[i + 1].sig, ready[i + 2].sig, width);
+			int new_depth = std::max({ready[i].depth, ready[i + 1].depth, ready[i + 2].depth}) + 1;
+			compressed.push_back({sum, new_depth});
+			compressed.push_back({carry, new_depth});
+			if (compressor_count)
+				(*compressor_count)++;
+			i += 3;
+		}
+		// Uncompressed operands pass through to next level
+		for (; i < ready.size(); i++)
+			compressed.push_back(ready[i]);
+		// Merge compressed with waiting operands
+		for (auto &op : waiting)
+			compressed.push_back(op);
+
+		operands = std::move(compressed);
+	}
+
+	if (operands.size() == 0)
+		return {SigSpec(State::S0, width), SigSpec(State::S0, width)};
+	else if (operands.size() == 1)
+		return {operands[0].sig, SigSpec(State::S0, width)};
+	else {
+		log_assert(operands.size() == 2);
+		log(" Wallace tree depth: %d levels of $fa + 1 final $add\n", std::max(operands[0].depth, operands[1].depth));
+		return {operands[0].sig, operands[1].sig};
+	}
+}
+
+YOSYS_NAMESPACE_END
+
+#endif
diff --git a/passes/techmap/booth.cc b/passes/techmap/booth.cc
index 11ff71b29..c0bad784a 100644
--- a/passes/techmap/booth.cc
+++ b/passes/techmap/booth.cc
@@ -58,6 +58,7 @@ synth -top my_design -booth
 #include "kernel/sigtools.h"
 #include "kernel/yosys.h"
 #include "kernel/macc.h"
+#include "kernel/wallace_tree.h"
 
USING_YOSYS_NAMESPACE PRIVATE_NAMESPACE_BEGIN @@ -317,36 +318,6 @@ struct BoothPassWorker { } } - SigSig WallaceSum(int width, std::vector summands) - { - for (auto &s : summands) - s.extend_u0(width); - - while (summands.size() > 2) { - std::vector new_summands; - int i; - for (i = 0; i < (int) summands.size() - 2; i += 3) { - SigSpec x = module->addWire(NEW_ID, width); - SigSpec y = module->addWire(NEW_ID, width); - BuildBitwiseFa(module, NEW_ID.str(), summands[i], summands[i + 1], - summands[i + 2], x, y); - new_summands.push_back(y); - new_summands.push_back({x.extract(0, width - 1), State::S0}); - } - - new_summands.insert(new_summands.begin(), summands.begin() + i, summands.end()); - - std::swap(summands, new_summands); - } - - if (!summands.size()) - return SigSig(SigSpec(width, State::S0), SigSpec(width, State::S0)); - else if (summands.size() == 1) - return SigSig(summands[0], SigSpec(width, State::S0)); - else - return SigSig(summands[0], summands[1]); - } - /* Build Multiplier. ------------------------- @@ -415,16 +386,16 @@ struct BoothPassWorker { // Later on yosys will clean up unused constants // DebugDumpAlignPP(aligned_pp); - SigSig wtree_sum = WallaceSum(z_sz, aligned_pp); + auto [wtree_a, wtree_b] = wallace_reduce_scheduled(module, aligned_pp, z_sz); // Debug code: Dump out the csa trees // DumpCSATrees(debug_csa_trees); // Build the CPA to do the final accumulation. 
- log_assert(wtree_sum.second[0] == State::S0); + log_assert(wtree_b[0] == State::S0); if (mapped_cpa) - BuildCPA(module, wtree_sum.first, {State::S0, wtree_sum.second.extract_end(1)}, Z); + BuildCPA(module, wtree_a, {State::S0, wtree_b.extract_end(1)}, Z); /* keep the carry alignment the removed call passed to BuildCPA: constant-0 LSB dropped, MSB padded */ else - module->addAdd(NEW_ID, wtree_sum.first, {wtree_sum.second.extract_end(1), State::S0}, Z); + module->addAdd(NEW_ID, wtree_a, wtree_b, Z); } /* diff --git a/passes/techmap/csa_tree.cc b/passes/techmap/csa_tree.cc index b05586e58..4bd0b933f 100644 --- a/passes/techmap/csa_tree.cc +++ b/passes/techmap/csa_tree.cc @@ -1,9 +1,11 @@ -// Replaces chains of $add/$sub and $macc cells with carry-save adder trees, reducing multi-operand -// addition to logarithmic depth. ref. paper: Zimmermann, "Architectures for Adders" +/** + * Replaces chains of $add/$sub and $macc cells with carry-save adder trees + */ #include "kernel/yosys.h" #include "kernel/sigtools.h" #include "kernel/macc.h" +#include "kernel/wallace_tree.h" #include @@ -306,71 +308,6 @@ struct Rewriter return sig; } - std::pair emit_fa(SigSpec a, SigSpec b, SigSpec c, int width) - { - SigSpec sum = module->addWire(NEW_ID, width); - SigSpec cout = module->addWire(NEW_ID, width); - - module->addFa(NEW_ID, a, b, c, cout, sum); - - SigSpec carry; - carry.append(State::S0); - carry.append(cout.extract(0, width - 1)); - return {sum, carry}; - } - - struct DepthSig { - SigSpec sig; - int depth; - }; - - // Group ready operands into triplets and compress via full adders until two operands remain. 
- std::pair reduce_wallace(std::vector& sigs, int width, int& fa_count) - { - std::vector ops; - ops.reserve(sigs.size()); - for (auto& s : sigs) - ops.push_back({s, 0}); - - fa_count = 0; - - for (int level = 0; ops.size() > 2; level++) { - log_assert(level <= 100); - - std::vector ready, waiting; - for (auto& op : ops) { - if (op.depth <= level) - ready.push_back(op); - else - waiting.push_back(op); - } - - if (ready.size() < 3) continue; - - std::vector next; - size_t i = 0; - while (i + 2 < ready.size()) { - auto [sum, carry] = emit_fa(ready[i].sig, ready[i + 1].sig, ready[i + 2].sig, width); - int d = std::max({ready[i].depth, ready[i + 1].depth,ready[i + 2].depth}) + 1; - next.push_back({sum, d}); - next.push_back({carry, d}); - fa_count++; - i += 3; - } - for (; i < ready.size(); i++) - next.push_back(ready[i]); - for (auto& op : waiting) - next.push_back(op); - - ops = std::move(next); - } - - log_assert(ops.size() == 2); - log(" Tree depth: %d FA levels + 1 final add\n", - std::max(ops[0].depth, ops[1].depth)); - return {ops[0].sig, ops[1].sig}; - } - void replace_with_csa_tree( std::vector& operands, SigSpec result_y, @@ -392,11 +329,9 @@ struct Rewriter if (neg_compensation > 0) extended.push_back(SigSpec(neg_compensation, width)); - int fa_count; - auto [a, b] = reduce_wallace(extended, width, fa_count); - - log(" %s -> %d $fa + 1 $add (%d operands, module %s)\n", - desc, fa_count, (int)operands.size(), log_id(module)); + int compressor_count = 0; /* receives the number of $fa cells emitted by wallace_reduce_scheduled() */ + auto [a, b] = wallace_reduce_scheduled(module, extended, width, &compressor_count); + log(" %s -> %d $fa + 1 $add (%d operands, module %s)\n", desc, compressor_count, (int)operands.size(), log_id(module)); // Emit final add module->addAdd(NEW_ID, a, b, result_y, false);