diff --git a/Makefile b/Makefile index 6ee34070d..1aebcac77 100644 --- a/Makefile +++ b/Makefile @@ -671,7 +671,7 @@ OBJS += kernel/log_compat.o endif OBJS += kernel/binding.o kernel/tclapi.o OBJS += kernel/cellaigs.o kernel/celledges.o kernel/cost.o kernel/satgen.o kernel/scopeinfo.o kernel/qcsat.o kernel/mem.o kernel/ffmerge.o kernel/ff.o kernel/yw.o kernel/json.o kernel/fmt.o kernel/sexpr.o -OBJS += kernel/drivertools.o kernel/functional.o kernel/threading.o +OBJS += kernel/drivertools.o kernel/functional.o kernel/threading.o kernel/compressor_tree.o ifeq ($(ENABLE_ZLIB),1) OBJS += kernel/fstdata.o endif diff --git a/kernel/compressor_tree.cc b/kernel/compressor_tree.cc new file mode 100644 index 000000000..ee7b2a2fc --- /dev/null +++ b/kernel/compressor_tree.cc @@ -0,0 +1,334 @@ +#include "compressor_tree.h" + +YOSYS_NAMESPACE_BEGIN + +namespace CompressorTree +{ + +std::pair emit_compressor_32(Module *module, SigSpec a, SigSpec b, SigSpec c, int width) +{ + SigSpec sum = module->addWire(NEW_ID, width); + SigSpec cout = module->addWire(NEW_ID, width); + module->addFa(NEW_ID, a, b, c, cout, sum); + + SigSpec carry; + carry.append(State::S0); + carry.append(cout.extract(0, width - 1)); + return {sum, carry}; +} + +std::pair emit_compressor_42(Module *module, SigSpec a, SigSpec b, SigSpec c, SigSpec d, int width) +{ + // First FA: a + b + c -> s0 + SigSpec s0 = module->addWire(NEW_ID, width); + SigSpec cout_h_full = module->addWire(NEW_ID, width); + module->addFa(NEW_ID, a, b, c, cout_h_full, s0); + + // cin[0] = 0, cin[i] = cout_h_full[i-1] + SigSpec cin; + cin.append(State::S0); + if (width > 1) + cin.append(cout_h_full.extract(0, width - 1)); + + // Second FA: s0 + d + cin -> sum + SigSpec sum = module->addWire(NEW_ID, width); + SigSpec carry_full = module->addWire(NEW_ID, width); + module->addFa(NEW_ID, s0, d, cin, carry_full, sum); + + SigSpec carry; + carry.append(State::S0); + if (width > 1) + carry.append(carry_full.extract(0, width - 1)); + + return {sum, carry}; +} + +SigSpec normalize_to_width(SigSpec sig, bool is_signed, int width) +{ + // Zero/sign-extend to width + if (GetSize(sig) < width) { + SigBit pad; + if (is_signed && GetSize(sig) > 0) + pad = sig[GetSize(sig) - 1]; + else + pad = State::S0; + sig.append(SigSpec(pad, width - GetSize(sig))); + } + // Truncate to width + if (GetSize(sig) > width) + sig = sig.extract(0, width); + return sig; +} + +std::vector generate_partial_products(Module *module, SigSpec a, SigSpec b, bool a_signed, bool b_signed, int width) { + int width_a = GetSize(a); + int width_b = GetSize(b); + std::vector products; + products.reserve(width_a + 3); + + for (int i = 0; i < width_a; i++) { + SigBit ai = a[i]; + + // b_shifted = (0_i ## b) + SigSpec b_shifted = SigSpec(State::S0, i); + b_shifted.append(b); + b_shifted = normalize_to_width(b_shifted, false, width); + + // row = b_shifted & replicate(a[i], width) + SigSpec ai_rep = SigSpec(ai, width); + SigSpec row = module->addWire(NEW_ID, width); + module->addAnd(NEW_ID, b_shifted, ai_rep, row); + + // Apply Modified Baugh-Wooley inversions for this row + bool row_is_bottom = (i == width_a - 1); + bool any_inversion = (row_is_bottom && b_signed) || a_signed; + + if (any_inversion) { + std::vector mask(width, RTLIL::State::S0); + + for (int j = 0; j < width_b; j++) { + int col = i + j; + if (col < 0 || col >= width) + continue; + bool col_is_right = (j == width_b - 1); + // Flip masks + bool invert = (row_is_bottom && b_signed) ^ (col_is_right && a_signed); + if (invert) + mask[col] = RTLIL::State::S1; + } + + // Skip the xor entirely if the mask is all zeroes + bool nonzero = false; + for (auto s : mask) + if (s == RTLIL::State::S1) { + nonzero = true; + break; + } + if (nonzero) { + SigSpec inverted = module->addWire(NEW_ID, width); + module->addXor(NEW_ID, row, SigSpec(RTLIL::Const(mask)), inverted); + row = inverted; + } + } + + products.push_back({row, 0}); + } + + // Correction constants + auto push_one_at = [&](int col) { + if (col < 0 || col >= width) + return; + std::vector v(width, RTLIL::State::S0); + v[col] = RTLIL::State::S1; + products.push_back({SigSpec(RTLIL::Const(v)), 0}); + }; + + if (b_signed) + push_one_at(width_a - 1); + if (a_signed) + push_one_at(width_b - 1); + if (a_signed || b_signed) + push_one_at(width_a + width_b - 1); + + return products; +} + +std::pair reduce_scheduled(Module *module, std::vector operands, int width, Strategy strategy, int *compressor_count) { + int levels = 0; + int fa_count = 0; + int c42_count = 0; + int final_depth = 0; + + for (auto &op : operands) + op.sig.extend_u0(width); + + // Only compress operands ready at current level + for (int level = 0; operands.size() > 2; level++) { + // Partition operands into ready and waiting + std::vector ready; + std::vector waiting; + ready.reserve(operands.size()); + for (auto &op : operands) { + if (op.depth <= level) + ready.push_back(op); + else + waiting.push_back(op); + } + + if (ready.size() < 3) { + levels++; + continue; + } + + // Apply compressors to ready operands + std::vector compressed; + compressed.reserve(ready.size()); + size_t i = 0; + + // PREFER_42 attempts 4:2 grouping greedily (falls back to 3:2 for the residual) + // FA_ONLY skips + // DADDA = PREFER_42 (TODO: inspect column heights?) + bool try_42 = (strategy == Strategy::PREFER_42 || strategy == Strategy::DADDA); + + while (i < ready.size()) { + size_t remaining = ready.size() - i; + + if (try_42 && remaining >= 4) { + DepthSig a = ready[i + 0]; + DepthSig b = ready[i + 1]; + DepthSig c = ready[i + 2]; + DepthSig d = ready[i + 3]; + + auto [sum, carry] = emit_compressor_42(module, a.sig, b.sig, c.sig, d.sig, width); + int dmax = std::max({a.depth, a.depth, a.depth, a.depth}); + + compressed.push_back({sum, dmax + 2}); + compressed.push_back({carry, dmax + 2}); + + fa_count += 2; + c42_count += 1; + i += 4; + } else if (remaining >= 3) { + DepthSig a = ready[i + 0]; + DepthSig b = ready[i + 1]; + DepthSig c = ready[i + 2]; + + auto [sum, carry] = emit_compressor_32(module, a.sig, b.sig, c.sig, width); + int dmax = std::max({a.depth, b.depth, c.depth}); + + compressed.push_back({sum, dmax + 1}); + compressed.push_back({carry, dmax + 1}); + + fa_count += 1; + i += 3; + } else { + // Uncompressed operands pass through to next level + for (; i < ready.size(); i++) + compressed.push_back(ready[i]); + break; + } + } + + // Merge compressed with waiting operands + for (auto &op : waiting) + compressed.push_back(op); + + operands = std::move(compressed); + levels++; + } + + if(compressor_count) + *compressor_count = fa_count; + + if (operands.size() == 0) + return {SigSpec(State::S0, width), SigSpec(State::S0, width)}; + if (operands.size() == 1) + return {operands[0].sig, SigSpec(State::S0, width)}; + + final_depth = std::max(operands[0].depth, operands[1].depth); + log_assert(operands.size() == 2); + log(" CompressorTree::reduce_scheduled: %d levels, %d $fa (%d as 4:2), final depth %d\n", levels, fa_count, c42_count, final_depth); + return {operands[0].sig, operands[1].sig}; +} + +void emit_kogge_stone(Module *module, SigSpec a, SigSpec b, SigSpec y) +{ + int width = GetSize(y); + log_assert(GetSize(a) == width); + log_assert(GetSize(b) == width); + + if (width == 0) + return; + + if (width == 1) { + module->addXorGate(NEW_ID, a[0], b[0], y[0]); + return; + } + + // Bit level gen and prop + std::vector g_pre(width), p_pre(width); + for (int i = 0; i < width; i++) { + SigBit gi = module->addWire(NEW_ID); + SigBit pi = module->addWire(NEW_ID); + module->addAndGate(NEW_ID, a[i], b[i], gi); + module->addXorGate(NEW_ID, a[i], b[i], pi); + g_pre[i] = gi; + p_pre[i] = pi; + } + + // Propagate (g, p) through ceil(log2 W) levels + std::vector g = g_pre; + std::vector p = p_pre; + int num_levels = 0; + + while ((1 << num_levels) < width) + num_levels++; + + for (int k = 1; k <= num_levels; k++) { + int s = 1 << (k - 1); + std::vector g_next(width), p_next(width); + for (int i = 0; i < width; i++) { + if (i < s) { + // Nothing to do + g_next[i] = g[i]; + p_next[i] = p[i]; + } else { + // g_i^k = g_i | (p_i & g_(i-s)) + SigBit and_pg = module->addWire(NEW_ID); + module->addAndGate(NEW_ID, p[i], g[i - s], and_pg); + SigBit gnew = module->addWire(NEW_ID); + module->addOrGate(NEW_ID, g[i], and_pg, gnew); + g_next[i] = gnew; + + // p_i^k = p_i & p_(i-s) + if (k < num_levels) { + SigBit pnew = module->addWire(NEW_ID); + module->addAndGate(NEW_ID, p[i], p[i - s], pnew); + p_next[i] = pnew; + } else { + // Skip last level + p_next[i] = State::Sx; + } + } + } + + g = std::move(g_next); + p = std::move(p_next); + } + + // Sum layer, g[i] is COUT of bit i + // With CIN 0: + // sum[0] = p_pre[0] + // sum[i] = p_pre[i] ^ g[i-1] ... + module->connect(y[0], p_pre[0]); + for (int i = 1; i < width; i++) + module->addXorGate(NEW_ID, p_pre[i], g[i - 1], y[i]); +} + +Cell *emit_final_adder(Module *module, SigSpec a, SigSpec b, SigSpec y, FinalAdder choice) { + switch (choice) { + case FinalAdder::DEFAULT: + case FinalAdder::RIPPLE: { + return module->addAdd(NEW_ID, a, b, y, false); + } + case FinalAdder::PARALLEL_PREFIX: { + emit_kogge_stone(module, a, b, y); + return nullptr; + } + } + log_assert(false && "CompressorTree::emit_final_adder: invalid choice"); + return nullptr; +} + +FinalAdder pick_final_adder(int width, FinalMode mode) { + switch (mode) { + case FinalMode::RIPPLE: return FinalAdder::RIPPLE; + case FinalMode::PREFIX: return FinalAdder::PARALLEL_PREFIX; + case FinalMode::AUTO: + default: return (width < RIPPLE_PREFIX_THRESHOLD) ? FinalAdder::DEFAULT : FinalAdder::PARALLEL_PREFIX; + } +} + +} // namespace CompressorTree + +YOSYS_NAMESPACE_END diff --git a/kernel/compressor_tree.h b/kernel/compressor_tree.h index 4acc7e149..54a4d421d 100644 --- a/kernel/compressor_tree.h +++ b/kernel/compressor_tree.h @@ -55,60 +55,10 @@ enum class FinalMode { PREFIX, }; -inline std::pair emit_compressor_32(Module *module, SigSpec a, SigSpec b, SigSpec c, int width) -{ - SigSpec sum = module->addWire(NEW_ID, width); - SigSpec cout = module->addWire(NEW_ID, width); - module->addFa(NEW_ID, a, b, c, cout, sum); +std::pair emit_compressor_32(Module *module, SigSpec a, SigSpec b, SigSpec c, int width); +std::pair emit_compressor_42(Module *module, SigSpec a, SigSpec b, SigSpec c, SigSpec d, int width); - SigSpec carry; - carry.append(State::S0); - carry.append(cout.extract(0, width - 1)); - return {sum, carry}; -} - -inline std::pair emit_compressor_42(Module *module, SigSpec a, SigSpec b, SigSpec c, SigSpec d, int width) -{ - // First FA: a + b + c -> s0 - SigSpec s0 = module->addWire(NEW_ID, width); - SigSpec cout_h_full = module->addWire(NEW_ID, width); - module->addFa(NEW_ID, a, b, c, cout_h_full, s0); - - // cin[0] = 0, cin[i] = cout_h_full[i-1] - SigSpec cin; - cin.append(State::S0); - if (width > 1) - cin.append(cout_h_full.extract(0, width - 1)); - - // Second FA: s0 + d + cin -> sum - SigSpec sum = module->addWire(NEW_ID, width); - SigSpec carry_full = module->addWire(NEW_ID, width); - module->addFa(NEW_ID, s0, d, cin, carry_full, sum); - - SigSpec carry; - carry.append(State::S0); - if (width > 1) - carry.append(carry_full.extract(0, width - 1)); - - return {sum, carry}; -} - -inline SigSpec normalize_to_width(SigSpec sig, bool is_signed, int width) -{ - // Zero/sign-extend to width - if (GetSize(sig) < width) { - SigBit pad; - if (is_signed && GetSize(sig) > 0) - pad = sig[GetSize(sig) - 1]; - else - pad = State::S0; - sig.append(SigSpec(pad, width - GetSize(sig))); - } - // Truncate to width - if (GetSize(sig) > width) - sig = sig.extract(0, width); - return sig; -} +SigSpec normalize_to_width(SigSpec sig, bool is_signed, int width); /** * generate_partial_products() - Generate partial products for FMA concat @@ -121,78 +71,7 @@ inline SigSpec normalize_to_width(SigSpec sig, bool is_signed, int width) * * Return: Partial-product matrix as a set of depth-0 vectors */ -inline std::vector generate_partial_products(Module *module, SigSpec a, SigSpec b, bool a_signed, bool b_signed, int width) { - int width_a = GetSize(a); - int width_b = GetSize(b); - std::vector products; - products.reserve(width_a + 3); - - for (int i = 0; i < width_a; i++) { - SigBit ai = a[i]; - - // b_shifted = (0_i ## b) - SigSpec b_shifted = SigSpec(State::S0, i); - b_shifted.append(b); - b_shifted = normalize_to_width(b_shifted, false, width); - - // row = b_shifted & replicate(a[i], width) - SigSpec ai_rep = SigSpec(ai, width); - SigSpec row = module->addWire(NEW_ID, width); - module->addAnd(NEW_ID, b_shifted, ai_rep, row); - - // Apply Modified Baugh-Wooley inversions for this row - bool row_is_bottom = (i == width_a - 1); - bool any_inversion = (row_is_bottom && b_signed) || a_signed; - - if (any_inversion) { - std::vector mask(width, RTLIL::State::S0); - - for (int j = 0; j < width_b; j++) { - int col = i + j; - if (col < 0 || col >= width) - continue; - bool col_is_right = (j == width_b - 1); - // Flip masks - bool invert = (row_is_bottom && b_signed) ^ (col_is_right && a_signed); - if (invert) - mask[col] = RTLIL::State::S1; - } - - // Skip the xor entirely if the mask is all zeroes - bool nonzero = false; - for (auto s : mask) - if (s == RTLIL::State::S1) { - nonzero = true; - break; - } - if (nonzero) { - SigSpec inverted = module->addWire(NEW_ID, width); - module->addXor(NEW_ID, row, SigSpec(RTLIL::Const(mask)), inverted); - row = inverted; - } - } - - products.push_back({row, 0}); - } - - // Correction constants - auto push_one_at = [&](int col) { - if (col < 0 || col >= width) - return; - std::vector v(width, RTLIL::State::S0); - v[col] = RTLIL::State::S1; - products.push_back({SigSpec(RTLIL::Const(v)), 0}); - }; - - if (b_signed) - push_one_at(width_a - 1); - if (a_signed) - push_one_at(width_b - 1); - if (a_signed || b_signed) - push_one_at(width_a + width_b - 1); - - return products; -} +std::vector generate_partial_products(Module *module, SigSpec a, SigSpec b, bool a_signed, bool b_signed, int width); /** * reduce_scheduled() - Reduce multiple operands to two using a compressor tree @@ -205,103 +84,7 @@ inline std::vector generate_partial_products(Module *module, SigSpec a * * Return: The final two reduced operands, that are to be fed into an adder */ -inline std::pair reduce_scheduled(Module *module, std::vector operands, int width, Strategy strategy, int *compressor_count = nullptr) { - int levels = 0; - int fa_count = 0; - int c42_count = 0; - int final_depth = 0; - - for (auto &op : operands) - op.sig.extend_u0(width); - - // Only compress operands ready at current level - for (int level = 0; operands.size() > 2; level++) { - // Partition operands into ready and waiting - std::vector ready; - std::vector waiting; - ready.reserve(operands.size()); - for (auto &op : operands) { - if (op.depth <= level) - ready.push_back(op); - else - waiting.push_back(op); - } - - if (ready.size() < 3) { - levels++; - continue; - } - - // Apply compressors to ready operands - std::vector compressed; - compressed.reserve(ready.size()); - size_t i = 0; - - // PREFER_42 attempts 4:2 grouping greedily (falls back to 3:2 for the residual) - // FA_ONLY skips - // DADDA = PREFER_42 (TODO: inspect column heights?) - bool try_42 = (strategy == Strategy::PREFER_42 || strategy == Strategy::DADDA); - - while (i < ready.size()) { - size_t remaining = ready.size() - i; - - if (try_42 && remaining >= 4) { - DepthSig a = ready[i + 0]; - DepthSig b = ready[i + 1]; - DepthSig c = ready[i + 2]; - DepthSig d = ready[i + 3]; - - auto [sum, carry] = emit_compressor_42(module, a.sig, b.sig, c.sig, d.sig, width); - int dmax = std::max({a.depth, a.depth, a.depth, a.depth}); - - compressed.push_back({sum, dmax + 2}); - compressed.push_back({carry, dmax + 2}); - - fa_count += 2; - c42_count += 1; - i += 4; - } else if (remaining >= 3) { - DepthSig a = ready[i + 0]; - DepthSig b = ready[i + 1]; - DepthSig c = ready[i + 2]; - - auto [sum, carry] = emit_compressor_32(module, a.sig, b.sig, c.sig, width); - int dmax = std::max({a.depth, b.depth, c.depth}); - - compressed.push_back({sum, dmax + 1}); - compressed.push_back({carry, dmax + 1}); - - fa_count += 1; - i += 3; - } else { - // Uncompressed operands pass through to next level - for (; i < ready.size(); i++) - compressed.push_back(ready[i]); - break; - } - } - - // Merge compressed with waiting operands - for (auto &op : waiting) - compressed.push_back(op); - - operands = std::move(compressed); - levels++; - } - - if(compressor_count) - *compressor_count = fa_count; - - if (operands.size() == 0) - return {SigSpec(State::S0, width), SigSpec(State::S0, width)}; - if (operands.size() == 1) - return {operands[0].sig, SigSpec(State::S0, width)}; - - final_depth = std::max(operands[0].depth, operands[1].depth); - log_assert(operands.size() == 2); - log(" CompressorTree::reduce_scheduled: %d levels, %d $fa (%d as 4:2), final depth %d\n", levels, fa_count, c42_count, final_depth); - return {operands[0].sig, operands[1].sig}; -} +std::pair reduce_scheduled(Module *module, std::vector operands, int width, Strategy strategy, int *compressor_count = nullptr); /** * emit_kogge_stone() - Emit a Kogge-Stone parallel-prefix adder @@ -310,79 +93,7 @@ inline std::pair reduce_scheduled(Module *module, std::vector< * @b: Signal B * @y: Signal Y = (A + B) mod 2^W */ -inline void emit_kogge_stone(Module *module, SigSpec a, SigSpec b, SigSpec y) -{ - int width = GetSize(y); - log_assert(GetSize(a) == width); - log_assert(GetSize(b) == width); - - if (width == 0) - return; - - if (width == 1) { - module->addXorGate(NEW_ID, a[0], b[0], y[0]); - return; - } - - // Bit level gen and prop - std::vector g_pre(width), p_pre(width); - for (int i = 0; i < width; i++) { - SigBit gi = module->addWire(NEW_ID); - SigBit pi = module->addWire(NEW_ID); - module->addAndGate(NEW_ID, a[i], b[i], gi); - module->addXorGate(NEW_ID, a[i], b[i], pi); - g_pre[i] = gi; - p_pre[i] = pi; - } - - // Propagate (g, p) through ceil(log2 W) levels - std::vector g = g_pre; - std::vector p = p_pre; - int num_levels = 0; - - while ((1 << num_levels) < width) - num_levels++; - - for (int k = 1; k <= num_levels; k++) { - int s = 1 << (k - 1); - std::vector g_next(width), p_next(width); - for (int i = 0; i < width; i++) { - if (i < s) { - // Nothing to do - g_next[i] = g[i]; - p_next[i] = p[i]; - } else { - // g_i^k = g_i | (p_i & g_(i-s)) - SigBit and_pg = module->addWire(NEW_ID); - module->addAndGate(NEW_ID, p[i], g[i - s], and_pg); - SigBit gnew = module->addWire(NEW_ID); - module->addOrGate(NEW_ID, g[i], and_pg, gnew); - g_next[i] = gnew; - - // p_i^k = p_i & p_(i-s) - if (k < num_levels) { - SigBit pnew = module->addWire(NEW_ID); - module->addAndGate(NEW_ID, p[i], p[i - s], pnew); - p_next[i] = pnew; - } else { - // Skip last level - p_next[i] = State::Sx; - } - } - } - - g = std::move(g_next); - p = std::move(p_next); - } - - // Sum layer, g[i] is COUT of bit i - // With CIN 0: - // sum[0] = p_pre[0] - // sum[i] = p_pre[i] ^ g[i-1] ... - module->connect(y[0], p_pre[0]); - for (int i = 1; i < width; i++) - module->addXorGate(NEW_ID, p_pre[i], g[i - 1], y[i]); -} +void emit_kogge_stone(Module *module, SigSpec a, SigSpec b, SigSpec y); /** * emit_final_adder() - Emit the final carry-propagate addition between the two reduced vectors @@ -394,29 +105,9 @@ inline void emit_kogge_stone(Module *module, SigSpec a, SigSpec b, SigSpec y) * * Return: Cell* of the emitted instance */ -inline Cell *emit_final_adder(Module *module, SigSpec a, SigSpec b, SigSpec y, FinalAdder choice) { - switch (choice) { - case FinalAdder::DEFAULT: - case FinalAdder::RIPPLE: { - return module->addAdd(NEW_ID, a, b, y, false); - } - case FinalAdder::PARALLEL_PREFIX: { - emit_kogge_stone(module, a, b, y); - return nullptr; - } - } - log_assert(false && "CompressorTree::emit_final_adder: invalid choice"); - return nullptr; -} +Cell *emit_final_adder(Module *module, SigSpec a, SigSpec b, SigSpec y, FinalAdder choice); -inline FinalAdder pick_final_adder(int width, FinalMode mode) { - switch (mode) { - case FinalMode::RIPPLE: return FinalAdder::RIPPLE; - case FinalMode::PREFIX: return FinalAdder::PARALLEL_PREFIX; - case FinalMode::AUTO: - default: return (width < RIPPLE_PREFIX_THRESHOLD) ? FinalAdder::DEFAULT : FinalAdder::PARALLEL_PREFIX; - } -} +FinalAdder pick_final_adder(int width, FinalMode mode); } // namespace CompressorTree