From f24fb4ae82c9fb0cdaf9c954dbe6b3569c1ccda0 Mon Sep 17 00:00:00 2001 From: whitequark Date: Mon, 20 Apr 2020 16:44:51 +0000 Subject: [PATCH 01/11] cxxrtl: detect buffered comb wires, not just feedback wires. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Any buffered combinatorial wires (including, as a subset, feedback wires) will prevent the design from always converging in one delta cycle. Before this commit, only feedback wires were detected. After this commit, any buffered combinatorial wires, including feedback wires, are detected. Co-authored-by: Jean-François Nguyen --- backends/cxxrtl/cxxrtl.cc | 45 ++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index ef8335e50..196a27524 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -415,6 +415,7 @@ struct CxxrtlWorker { bool localize_internal = false; bool localize_public = false; bool run_splitnets = false; + bool max_opt_level = false; std::ostringstream f; std::string indent; @@ -1648,6 +1649,8 @@ struct CxxrtlWorker { void analyze_design(RTLIL::Design *design) { bool has_feedback_arcs = false; + bool has_buffered_wires = false; + for (auto module : design->modules()) { if (!design->selected_module(module)) continue; @@ -1844,9 +1847,8 @@ struct CxxrtlWorker { if (!feedback_wires.empty()) { has_feedback_arcs = true; log("Module `%s' contains feedback arcs through wires:\n", module->name.c_str()); - for (auto wire : feedback_wires) { + for (auto wire : feedback_wires) log(" %s\n", wire->name.c_str()); - } } for (auto wire : module->wires()) { @@ -1856,13 +1858,45 @@ struct CxxrtlWorker { if (wire->name.begins_with("$") && !localize_internal) continue; if (wire->name.begins_with("\\") && !localize_public) continue; if (sync_wires[wire]) continue; - // Outputs of FF/$memrd cells and LHS of sync actions do not end up in defs. + // Wires connected to synchronous outputs do not introduce defs. if (flow.wire_defs[wire].size() != 1) continue; localized_wires.insert(wire); } + + // For maximum performance, the state of the simulation (which is the same as the set of its double buffered + // wires, since using a singly buffered wire for any kind of state introduces a race condition) should contain + // no wires attached to combinatorial outputs. Feedback wires, by definition, make that impossible. However, + // it is possible that a design with no feedback arcs would end up with doubly buffered wires in such cases + // as a wire with multiple drivers where one of them is combinatorial and the other is synchronous. Such designs + // also require more than one delta cycle to converge. + pool buffered_wires; + for (auto wire : module->wires()) { + // Only wires connected to combinatorial outputs introduce defs. + if (flow.wire_defs[wire].size() > 0 && !elided_wires.count(wire) && !localized_wires[wire]) { + if (!feedback_wires[wire]) + buffered_wires.insert(wire); + } + } + if (!buffered_wires.empty()) { + has_buffered_wires = true; + log("Module `%s' contains buffered combinatorial wires:\n", module->name.c_str()); + for (auto wire : buffered_wires) + log(" %s\n", wire->name.c_str()); + } } - if (has_feedback_arcs) { - log("Feedback arcs require delta cycles during evaluation.\n"); + if (has_feedback_arcs || has_buffered_wires) { + // Although both non-feedback buffered combinatorial wires and apparent feedback wires may be eliminated + // by optimizing the design, if after `opt_clean -purge` there are any feedback wires remaining, it is very + // likely that these feedback wires are indicative of a true logic loop, so they get emphasized in the message. + const char *why_pessimistic = nullptr; + if (has_feedback_arcs) + why_pessimistic = "feedback wires"; + else if (has_buffered_wires) + why_pessimistic = "buffered combinatorial wires"; + log("\n"); + log_warning("Design contains %s, which require delta cycles during evaluation.\n", why_pessimistic); + if (!max_opt_level) + log("Increasing the optimization level may eliminate %s from the design.\n", why_pessimistic); } } @@ -2135,6 +2169,7 @@ struct CxxrtlBackend : public Backend { switch (opt_level) { case 5: + worker.max_opt_level = true; worker.run_splitnets = true; case 4: worker.localize_public = true; From 757cbb3c809091bf20a2f3b3a8b621010de01a8d Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 13:33:42 +0000 Subject: [PATCH 02/11] cxxrtl: localize wires with multiple comb drivers, too. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this commit, any wire that was not driven by an output port of exactly one comb cell would not be localized, even if there were no feedback arcs through that wire. This would cause the wire to become buffered and require (often quite a few) extraneous delta cycles during evaluation. To alleviate this problem, -O5 was running `splitnets -driver`. However, this solution was mistaken. Because `splitnets -driver` followed by `opt_clean -purge` would produce more nets with multiple drivers, it would have to be iterated to fixpoint. Moreover, even if this was done, it would not be sufficient because `opt_clean -purge` does not currently remove wires with the `\init` attribute (and it is not desirable to remove such wires, since they correspond to registers and may be useful for debugging). The proper solution is to consider the condition in which a wire may be localized. Specifically, if there are no feedback arcs through this wire, and no part of the wire is driven by an output of a sync cell, then the wire holds no state and is localizable. After this commit, the original condition for not localizing a wire is replaced by a check for any sync cell driving it. This makes it unnecessary to run `splitnets -driver` in the majority of cases to get a design with no buffered wires, and -O5 no longer includes that pass. As a result, Minerva SRAM SoC no longer has any buffered wires, and runs ~27% faster. In addition, this commit prepares the flow graph for introduction of sync outputs of black boxes. Co-authored-by: Jean-François Nguyen --- backends/cxxrtl/cxxrtl.cc | 63 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index 196a27524..8f2bf6836 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -225,7 +225,7 @@ struct FlowGraph { }; std::vector nodes; - dict> wire_defs, wire_uses; + dict> wire_comb_defs, wire_sync_defs, wire_uses; dict wire_def_elidable, wire_use_elidable; ~FlowGraph() @@ -234,13 +234,17 @@ struct FlowGraph { delete node; } - void add_defs(Node *node, const RTLIL::SigSpec &sig, bool elidable) + void add_defs(Node *node, const RTLIL::SigSpec &sig, bool is_sync, bool elidable) { for (auto chunk : sig.chunks()) - if (chunk.wire) - wire_defs[chunk.wire].insert(node); - // Only defs of an entire wire in the right order can be elided. - if (sig.is_wire()) + if (chunk.wire) { + if (is_sync) + wire_sync_defs[chunk.wire].insert(node); + else + wire_comb_defs[chunk.wire].insert(node); + } + // Only comb defs of an entire wire in the right order can be elided. + if (!is_sync && sig.is_wire()) wire_def_elidable[sig.as_wire()] = elidable; } @@ -268,7 +272,7 @@ struct FlowGraph { // Connections void add_connect_defs_uses(Node *node, const RTLIL::SigSig &conn) { - add_defs(node, conn.first, /*elidable=*/true); + add_defs(node, conn.first, /*is_sync=*/false, /*elidable=*/true); add_uses(node, conn.second); } @@ -288,16 +292,16 @@ struct FlowGraph { log_assert(cell->known()); for (auto conn : cell->connections()) { if (cell->output(conn.first)) { - if (is_sync_ff_cell(cell->type) || (cell->type == ID($memrd) && cell->getParam(ID::CLK_ENABLE).as_bool())) - /* non-combinatorial outputs do not introduce defs */; - else if (is_elidable_cell(cell->type)) - add_defs(node, conn.second, /*elidable=*/true); + if (is_elidable_cell(cell->type)) + add_defs(node, conn.second, /*is_sync=*/false, /*elidable=*/true); + else if (is_sync_ff_cell(cell->type) || (cell->type == ID($memrd) && cell->getParam(ID::CLK_ENABLE).as_bool())) + add_defs(node, conn.second, /*is_sync=*/true, /*elidable=*/false); else if (is_internal_cell(cell->type)) - add_defs(node, conn.second, /*elidable=*/false); + add_defs(node, conn.second, /*is_sync=*/false, /*elidable=*/false); else { // Unlike outputs of internal cells (which generate code that depends on the ability to set the output // wire bits), outputs of user cells are normal wires, and the wires connected to them can be elided. - add_defs(node, conn.second, /*elidable=*/true); + add_defs(node, conn.second, /*is_sync=*/false, /*elidable=*/true); } } if (cell->input(conn.first)) @@ -319,7 +323,7 @@ struct FlowGraph { void add_case_defs_uses(Node *node, const RTLIL::CaseRule *case_) { for (auto &action : case_->actions) { - add_defs(node, action.first, /*elidable=*/false); + add_defs(node, action.first, /*is_sync=*/false, /*elidable=*/false); add_uses(node, action.second); } for (auto sub_switch : case_->switches) { @@ -338,9 +342,9 @@ struct FlowGraph { for (auto sync : process->syncs) for (auto action : sync->actions) { if (sync->type == RTLIL::STp || sync->type == RTLIL::STn || sync->type == RTLIL::STe) - /* sync actions do not introduce feedback */; + add_defs(node, action.first, /*is_sync=*/true, /*elidable=*/false); else - add_defs(node, action.first, /*elidable=*/false); + add_defs(node, action.first, /*is_sync=*/false, /*elidable=*/false); add_uses(node, action.second); } } @@ -414,7 +418,7 @@ struct CxxrtlWorker { bool elide_public = false; bool localize_internal = false; bool localize_public = false; - bool run_splitnets = false; + bool run_opt_clean_purge = false; bool max_opt_level = false; std::ostringstream f; @@ -1792,8 +1796,8 @@ struct CxxrtlWorker { if (wire->name.begins_with("$") && !elide_internal) continue; if (wire->name.begins_with("\\") && !elide_public) continue; if (sync_wires[wire]) continue; - log_assert(flow.wire_defs[wire].size() == 1); - elided_wires[wire] = **flow.wire_defs[wire].begin(); + log_assert(flow.wire_comb_defs[wire].size() == 1); + elided_wires[wire] = **flow.wire_comb_defs[wire].begin(); } // Elided wires that are outputs of internal cells are always connected to a well known port (Y). @@ -1805,9 +1809,9 @@ struct CxxrtlWorker { cell_wire_defs[cell][conn.second.as_wire()] = conn.first; dict, hash_ptr_ops> node_defs; - for (auto wire_def : flow.wire_defs) - for (auto node : wire_def.second) - node_defs[node].insert(wire_def.first); + for (auto wire_comb_def : flow.wire_comb_defs) + for (auto node : wire_comb_def.second) + node_defs[node].insert(wire_comb_def.first); Scheduler scheduler; dict::Vertex*, hash_ptr_ops> node_map; @@ -1858,8 +1862,7 @@ struct CxxrtlWorker { if (wire->name.begins_with("$") && !localize_internal) continue; if (wire->name.begins_with("\\") && !localize_public) continue; if (sync_wires[wire]) continue; - // Wires connected to synchronous outputs do not introduce defs. - if (flow.wire_defs[wire].size() != 1) continue; + if (flow.wire_sync_defs.count(wire) > 0) continue; localized_wires.insert(wire); } @@ -1871,8 +1874,7 @@ struct CxxrtlWorker { // also require more than one delta cycle to converge. pool buffered_wires; for (auto wire : module->wires()) { - // Only wires connected to combinatorial outputs introduce defs. - if (flow.wire_defs[wire].size() > 0 && !elided_wires.count(wire) && !localized_wires[wire]) { + if (flow.wire_comb_defs[wire].size() > 0 && !elided_wires.count(wire) && !localized_wires[wire]) { if (!feedback_wires[wire]) buffered_wires.insert(wire); } @@ -1942,11 +1944,8 @@ struct CxxrtlWorker { if (has_sync_init || has_packed_mem) check_design(design, has_sync_init, has_packed_mem); log_assert(!(has_sync_init || has_packed_mem)); - - if (run_splitnets) { - Pass::call(design, "splitnets -driver"); + if (run_opt_clean_purge) Pass::call(design, "opt_clean -purge"); - } log("\n"); analyze_design(design); } @@ -2134,7 +2133,7 @@ struct CxxrtlBackend : public Backend { log(" like -O3, and localize public wires not marked (*keep*) if possible.\n"); log("\n"); log(" -O5\n"); - log(" like -O4, and run `splitnets -driver; opt_clean -purge` first.\n"); + log(" like -O4, and run `opt_clean -purge` first.\n"); log("\n"); } void execute(std::ostream *&f, std::string filename, std::vector args, RTLIL::Design *design) YS_OVERRIDE @@ -2170,7 +2169,7 @@ struct CxxrtlBackend : public Backend { switch (opt_level) { case 5: worker.max_opt_level = true; - worker.run_splitnets = true; + worker.run_opt_clean_purge = true; case 4: worker.localize_public = true; case 3: From 12c5e9275c115104896d3c53f0bbdf3fc0c6f37d Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 13:59:42 +0000 Subject: [PATCH 03/11] cxxrtl: simplify generated edge detection logic. This commit changes the way edge detectors are represented in generated code from a variable that is set in commit() and reset in eval() to a function that considers .curr and .next of the clock wire. Behavior remains the same. Besides being simpler to generate and providing more opportunities for optimization, this commit paves way for unbuffering module inputs. --- backends/cxxrtl/cxxrtl.cc | 85 +++++++++++++-------------------------- 1 file changed, 29 insertions(+), 56 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index 8f2bf6836..89b059504 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -915,7 +915,7 @@ struct CxxrtlWorker { RTLIL::SigBit clk_bit = cell->getPort(ID::CLK)[0]; clk_bit = sigmaps[clk_bit.wire->module](clk_bit); f << indent << "if (" << (cell->getParam(ID::CLK_POLARITY).as_bool() ? "posedge_" : "negedge_") - << mangle(clk_bit) << ") {\n"; + << mangle(clk_bit) << "()) {\n"; inc_indent(); if (cell->type == ID($dffe)) { f << indent << "if ("; @@ -992,7 +992,7 @@ struct CxxrtlWorker { RTLIL::SigBit clk_bit = cell->getPort(ID::CLK)[0]; clk_bit = sigmaps[clk_bit.wire->module](clk_bit); f << indent << "if (" << (cell->getParam(ID::CLK_POLARITY).as_bool() ? "posedge_" : "negedge_") - << mangle(clk_bit) << ") {\n"; + << mangle(clk_bit) << "()) {\n"; inc_indent(); } RTLIL::Memory *memory = cell->module->memories[cell->getParam(ID::MEMID).decode_string()]; @@ -1217,16 +1217,16 @@ struct CxxrtlWorker { switch (sync->type) { case RTLIL::STp: log_assert(sync_bit.wire != nullptr); - events.insert("posedge_" + mangle(sync_bit)); + events.insert("posedge_" + mangle(sync_bit) + "()"); break; case RTLIL::STn: log_assert(sync_bit.wire != nullptr); - events.insert("negedge_" + mangle(sync_bit)); + events.insert("negedge_" + mangle(sync_bit) + "()"); break; case RTLIL::STe: log_assert(sync_bit.wire != nullptr); - events.insert("posedge_" + mangle(sync_bit)); - events.insert("negedge_" + mangle(sync_bit)); + events.insert("posedge_" + mangle(sync_bit) + "()"); + events.insert("negedge_" + mangle(sync_bit) + "()"); break; case RTLIL::STa: @@ -1290,10 +1290,23 @@ struct CxxrtlWorker { if (sync_wires[wire]) { for (auto sync_type : sync_types) { if (sync_type.first.wire == wire) { - if (sync_type.second != RTLIL::STn) - f << indent << "bool posedge_" << mangle(sync_type.first) << " = false;\n"; - if (sync_type.second != RTLIL::STp) - f << indent << "bool negedge_" << mangle(sync_type.first) << " = false;\n"; + if (sync_type.second != RTLIL::STn) { + f << indent << "bool posedge_" << mangle(sync_type.first) << "() const {\n"; + inc_indent(); + f << indent << "return "; + f << "!" << mangle(sync_type.first.wire) << ".curr.slice<" << sync_type.first.offset << ">().val() && "; + f << mangle(sync_type.first.wire) << ".next.slice<" << sync_type.first.offset << ">().val();\n"; + dec_indent(); + f << indent << "}\n"; + } else { + f << indent << "bool negedge_" << mangle(sync_type.first) << "() const {\n"; + inc_indent(); + f << indent << "return "; + f << mangle(sync_type.first.wire) << ".curr.slice<" << sync_type.first.offset << ">().val() && "; + f << "!" << mangle(sync_type.first.wire) << ".next.slice<" << sync_type.first.offset << ">().val();\n"; + dec_indent(); + f << indent << "}\n"; + } } } } @@ -1365,14 +1378,6 @@ struct CxxrtlWorker { } } } - for (auto sync_type : sync_types) { - if (sync_type.first.wire->module == module) { - if (sync_type.second != RTLIL::STn) - f << indent << "posedge_" << mangle(sync_type.first) << " = false;\n"; - if (sync_type.second != RTLIL::STp) - f << indent << "negedge_" << mangle(sync_type.first) << " = false;\n"; - } - } dec_indent(); } @@ -1383,39 +1388,8 @@ struct CxxrtlWorker { for (auto wire : module->wires()) { if (elided_wires.count(wire) || localized_wires.count(wire)) continue; - if (sync_wires[wire]) { - std::string wire_prev = mangle(wire) + "_prev"; - std::string wire_curr = mangle(wire) + ".curr"; - std::string wire_edge = mangle(wire) + "_edge"; - f << indent << "value<" << wire->width << "> " << wire_prev << " = " << wire_curr << ";\n"; - f << indent << "if (" << mangle(wire) << ".commit()) {\n"; - inc_indent(); - f << indent << "value<" << wire->width << "> " << wire_edge << " = " - << wire_prev << ".bit_xor(" << wire_curr << ");\n"; - for (auto sync_type : sync_types) { - if (sync_type.first.wire != wire) - continue; - if (sync_type.second != RTLIL::STn) { - f << indent << "if (" << wire_edge << ".slice<" << sync_type.first.offset << ">().val() && " - << wire_curr << ".slice<" << sync_type.first.offset << ">().val())\n"; - inc_indent(); - f << indent << "posedge_" << mangle(sync_type.first) << " = true;\n"; - dec_indent(); - } - if (sync_type.second != RTLIL::STp) { - f << indent << "if (" << wire_edge << ".slice<" << sync_type.first.offset << ">().val() && " - << "!" << wire_curr << ".slice<" << sync_type.first.offset << ">().val())\n"; - inc_indent(); - f << indent << "negedge_" << mangle(sync_type.first) << " = true;\n"; - dec_indent(); - } - f << indent << "changed = true;\n"; - } - dec_indent(); - f << indent << "}\n"; - } else if (!module->get_bool_attribute(ID(cxxrtl.blackbox)) || wire->port_id != 0) { + if (!module->get_bool_attribute(ID(cxxrtl.blackbox)) || wire->port_id != 0) f << indent << "changed |= " << mangle(wire) << ".commit();\n"; - } } if (!module->get_bool_attribute(ID(cxxrtl.blackbox))) { for (auto memory : module->memories) { @@ -2005,7 +1979,7 @@ struct CxxrtlBackend : public Backend { log("\n"); log(" struct bb_p_debug : public module {\n"); log(" wire<1> p_clk;\n"); - log(" bool posedge_p_clk = false;\n"); + log(" bool posedge_p_clk() const { /* ... */ }\n"); log(" wire<1> p_en;\n"); log(" wire<8> p_data;\n"); log("\n"); @@ -2023,7 +1997,7 @@ struct CxxrtlBackend : public Backend { log("\n"); log(" struct stderr_debug : public bb_p_debug {\n"); log(" void eval() override {\n"); - log(" if (posedge_p_clk && p_en.curr)\n"); + log(" if (posedge_p_clk() && p_en.curr)\n"); log(" fprintf(stderr, \"debug: %%02x\\n\", p_data.curr.data[0]);\n"); log(" bb_p_debug::eval();\n"); log(" }\n"); @@ -2086,10 +2060,9 @@ struct CxxrtlBackend : public Backend { log("\n"); log(" cxxrtl.edge\n"); log(" only valid on inputs of black boxes. must be one of \"p\", \"n\", \"a\".\n"); - log(" if specified on signal `clk`, the generated code includes boolean fields\n"); - log(" `posedge_p_clk` (if \"p\"), `negedge_p_clk` (if \"n\"), or both (if \"a\"),\n"); - log(" as well as edge detection logic, simplifying implementation of clocked\n"); - log(" black boxes.\n"); + log(" if specified on signal `clk`, the generated code includes edge detectors\n"); + log(" `posedge_p_clk()` (if \"p\"), `negedge_p_clk()` (if \"n\"), or both (if\n"); + log(" \"a\"), simplifying implementation of clocked black boxes.\n"); log("\n"); log(" cxxrtl.template\n"); log(" only valid on black boxes. must contain a space separated sequence of\n"); From 06985c3afdba579d0bba266bb6daba0101358ad5 Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 14:49:36 +0000 Subject: [PATCH 04/11] cxxrtl: unbuffer module input wires. Module input wires are never set by the module, so it is unnecessary to buffer them. Although important for all inputs, this is especially critical for clocks, since after this commit, hierarchy levels no longer add delta cycles. As a result, Minerva SRAM SoC runs ~73% faster when flattened, and ~264% (!!) faster when hierarchical. --- backends/cxxrtl/cxxrtl.cc | 92 ++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index 89b059504..f5a8e18b5 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -360,6 +360,11 @@ struct FlowGraph { } }; +bool is_input_wire(const RTLIL::Wire *wire) +{ + return wire->port_input && !wire->port_output; +} + bool is_cxxrtl_blackbox_cell(const RTLIL::Cell *cell) { RTLIL::Module *cell_module = cell->module->design->module(cell->type); @@ -685,7 +690,7 @@ struct CxxrtlWorker { default: log_assert(false); } - } else if (localized_wires[chunk.wire]) { + } else if (localized_wires[chunk.wire] || is_input_wire(chunk.wire)) { f << mangle(chunk.wire); } else { f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr"); @@ -1093,7 +1098,11 @@ struct CxxrtlWorker { log_assert(cell->known()); const char *access = is_cxxrtl_blackbox_cell(cell) ? "->" : "."; for (auto conn : cell->connections()) - if (cell->input(conn.first)) { + if (cell->input(conn.first) && !cell->output(conn.first)) { + f << indent << mangle(cell) << access << mangle_wire_name(conn.first) << " = "; + dump_sigspec_rhs(conn.second); + f << ";\n"; + } else if (cell->input(conn.first)) { f << indent << mangle(cell) << access << mangle_wire_name(conn.first) << ".next = "; dump_sigspec_rhs(conn.second); f << ";\n"; @@ -1258,21 +1267,17 @@ struct CxxrtlWorker { } } - void dump_wire(const RTLIL::Wire *wire, bool is_local) + void dump_wire(const RTLIL::Wire *wire, bool is_local_context) { if (elided_wires.count(wire)) return; + if (localized_wires.count(wire) != is_local_context) + return; - if (is_local) { - if (!localized_wires.count(wire)) - return; - + if (is_local_context) { dump_attrs(wire); f << indent << "value<" << wire->width << "> " << mangle(wire) << ";\n"; } else { - if (localized_wires.count(wire)) - return; - std::string width; if (wire->module->has_attribute(ID(cxxrtl.blackbox)) && wire->has_attribute(ID(cxxrtl.width))) { width = wire->get_string_attribute(ID(cxxrtl.width)); @@ -1281,29 +1286,44 @@ struct CxxrtlWorker { } dump_attrs(wire); - f << indent << "wire<" << width << "> " << mangle(wire); + f << indent << (is_input_wire(wire) ? "value" : "wire") << "<" << width << "> " << mangle(wire); if (wire->has_attribute(ID::init)) { f << " "; dump_const_init(wire->attributes.at(ID::init)); } f << ";\n"; if (sync_wires[wire]) { + if (is_input_wire(wire)) { + f << indent << "value<" << width << "> prev_" << mangle(wire); + if (wire->has_attribute(ID::init)) { + f << " "; + dump_const_init(wire->attributes.at(ID::init)); + } + f << ";\n"; + } for (auto sync_type : sync_types) { if (sync_type.first.wire == wire) { + std::string prev, next; + if (is_input_wire(wire)) { + prev = "prev_" + mangle(sync_type.first.wire); + next = mangle(sync_type.first.wire); + } else { + prev = mangle(sync_type.first.wire) + ".curr"; + next = mangle(sync_type.first.wire) + ".next"; + } + prev += ".slice<" + std::to_string(sync_type.first.offset) + ">().val()"; + next += ".slice<" + std::to_string(sync_type.first.offset) + ">().val()"; if (sync_type.second != RTLIL::STn) { f << indent << "bool posedge_" << mangle(sync_type.first) << "() const {\n"; inc_indent(); - f << indent << "return "; - f << "!" << mangle(sync_type.first.wire) << ".curr.slice<" << sync_type.first.offset << ">().val() && "; - f << mangle(sync_type.first.wire) << ".next.slice<" << sync_type.first.offset << ">().val();\n"; + f << indent << "return !" << prev << " && " << next << ";\n"; dec_indent(); f << indent << "}\n"; - } else { + } + if (sync_type.second != RTLIL::STp) { f << indent << "bool negedge_" << mangle(sync_type.first) << "() const {\n"; inc_indent(); - f << indent << "return "; - f << mangle(sync_type.first.wire) << ".curr.slice<" << sync_type.first.offset << ">().val() && "; - f << "!" << mangle(sync_type.first.wire) << ".next.slice<" << sync_type.first.offset << ">().val();\n"; + f << indent << "return " << prev << " && !" << next << ";\n"; dec_indent(); f << indent << "}\n"; } @@ -1363,7 +1383,7 @@ struct CxxrtlWorker { inc_indent(); if (!module->get_bool_attribute(ID(cxxrtl.blackbox))) { for (auto wire : module->wires()) - dump_wire(wire, /*is_local=*/true); + dump_wire(wire, /*is_local_context=*/true); for (auto node : schedule[module]) { switch (node.type) { case FlowGraph::Node::Type::CONNECT: @@ -1388,6 +1408,11 @@ struct CxxrtlWorker { for (auto wire : module->wires()) { if (elided_wires.count(wire) || localized_wires.count(wire)) continue; + if (is_input_wire(wire)) { + if (sync_wires[wire]) + f << indent << "prev_" << mangle(wire) << " = " << mangle(wire) << ";\n"; + continue; + } if (!module->get_bool_attribute(ID(cxxrtl.blackbox)) || wire->port_id != 0) f << indent << "changed |= " << mangle(wire) << ".commit();\n"; } @@ -1445,7 +1470,7 @@ struct CxxrtlWorker { inc_indent(); for (auto wire : module->wires()) { if (wire->port_id != 0) - dump_wire(wire, /*is_local=*/false); + dump_wire(wire, /*is_local_context=*/false); } f << "\n"; f << indent << "void eval() override {\n"; @@ -1485,7 +1510,7 @@ struct CxxrtlWorker { f << indent << "struct " << mangle(module) << " : public module {\n"; inc_indent(); for (auto wire : module->wires()) - dump_wire(wire, /*is_local=*/false); + dump_wire(wire, /*is_local_context=*/false); f << "\n"; bool has_memories = false; for (auto memory : module->memories) { @@ -1948,9 +1973,9 @@ struct CxxrtlBackend : public Backend { log(" top.step();\n"); log(" while (1) {\n"); log(" /* user logic */\n"); - log(" top.p_clk.next = value<1> {0u};\n"); + log(" top.p_clk = value<1> {0u};\n"); log(" top.step();\n"); - log(" top.p_clk.next = value<1> {1u};\n"); + log(" top.p_clk = value<1> {1u};\n"); log(" top.step();\n"); log(" }\n"); log(" }\n"); @@ -1972,16 +1997,18 @@ struct CxxrtlBackend : public Backend { log(" module debug(...);\n"); log(" (* cxxrtl.edge = \"p\" *) input clk;\n"); log(" input en;\n"); - log(" input [7:0] data;\n"); + log(" input [7:0] i_data;\n"); + log(" output [7:0] o_data;\n"); log(" endmodule\n"); log("\n"); log("For this HDL interface, this backend will generate the following C++ interface:\n"); log("\n"); log(" struct bb_p_debug : public module {\n"); - log(" wire<1> p_clk;\n"); + log(" value<1> p_clk;\n"); log(" bool posedge_p_clk() const { /* ... */ }\n"); - log(" wire<1> p_en;\n"); - log(" wire<8> p_data;\n"); + log(" value<1> p_en;\n"); + log(" value<8> p_i_data;\n"); + log(" wire<8> p_o_data;\n"); log("\n"); log(" void eval() override;\n"); log(" bool commit() override;\n"); @@ -1997,8 +2024,9 @@ struct CxxrtlBackend : public Backend { log("\n"); log(" struct stderr_debug : public bb_p_debug {\n"); log(" void eval() override {\n"); - log(" if (posedge_p_clk() && p_en.curr)\n"); - log(" fprintf(stderr, \"debug: %%02x\\n\", p_data.curr.data[0]);\n"); + log(" if (posedge_p_clk() && p_en)\n"); + log(" fprintf(stderr, \"debug: %%02x\\n\", p_i_data.data[0]);\n"); + log(" p_o_data.next = p_i_data;\n"); log(" bb_p_debug::eval();\n"); log(" }\n"); log(" };\n"); @@ -2020,7 +2048,8 @@ struct CxxrtlBackend : public Backend { log(" parameter WIDTH = 8;\n"); log(" (* cxxrtl.edge = \"p\" *) input clk;\n"); log(" input en;\n"); - log(" (* cxxrtl.width = \"WIDTH\" *) input [WIDTH - 1:0] data;\n"); + log(" (* cxxrtl.width = \"WIDTH\" *) input [WIDTH - 1:0] i_data;\n"); + log(" (* cxxrtl.width = \"WIDTH\" *) output [WIDTH - 1:0] o_data;\n"); log(" endmodule\n"); log("\n"); log("For this parametric HDL interface, this backend will generate the following C++\n"); @@ -2029,7 +2058,8 @@ struct CxxrtlBackend : public Backend { log(" template\n"); log(" struct bb_p_debug : public module {\n"); log(" // ...\n"); - log(" wire p_data;\n"); + log(" value p_i_data;\n"); + log(" wire p_o_data;\n"); log(" // ...\n"); log(" static std::unique_ptr>\n"); log(" create(std::string name, metadata_map parameters, metadata_map attributes);\n"); From 7f5313e6c3c932a82a0fac7719d7c7100342ec77 Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 15:33:12 +0000 Subject: [PATCH 05/11] cxxrtl: add -O6, a shortcut for running `proc; flatten`. People judge a compiler backend by the first impression, and the metric they judge it for is speed. -O6 does severely impact debuggability, but it provides equally massive gains in performance, so use it by default. --- backends/cxxrtl/cxxrtl.cc | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index f5a8e18b5..eb769ac3f 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -424,6 +424,7 @@ struct CxxrtlWorker { bool localize_internal = false; bool localize_public = false; bool run_opt_clean_purge = false; + bool run_proc_flatten = false; bool max_opt_level = false; std::ostringstream f; @@ -1929,8 +1930,12 @@ struct CxxrtlWorker { void prepare_design(RTLIL::Design *design) { bool has_sync_init, has_packed_mem; + log_push(); check_design(design, has_sync_init, has_packed_mem); - if (has_sync_init) { + if (run_proc_flatten) { + Pass::call(design, "proc"); + Pass::call(design, "flatten"); + } else if (has_sync_init) { // We're only interested in proc_init, but it depends on proc_prune and proc_clean, so call those // in case they weren't already. (This allows `yosys foo.v -o foo.cc` to work.) Pass::call(design, "proc_prune"); @@ -1945,13 +1950,13 @@ struct CxxrtlWorker { log_assert(!(has_sync_init || has_packed_mem)); if (run_opt_clean_purge) Pass::call(design, "opt_clean -purge"); - log("\n"); + log_pop(); analyze_design(design); } }; struct CxxrtlBackend : public Backend { - static const int DEFAULT_OPT_LEVEL = 5; + static const int DEFAULT_OPT_LEVEL = 6; CxxrtlBackend() : Backend("cxxrtl", "convert design to C++ RTL simulation") { } void help() YS_OVERRIDE @@ -2138,6 +2143,9 @@ struct CxxrtlBackend : public Backend { log(" -O5\n"); log(" like -O4, and run `opt_clean -purge` first.\n"); log("\n"); + log(" -O6\n"); + log(" like -O5, and run `proc; flatten` first.\n"); + log("\n"); } void execute(std::ostream *&f, std::string filename, std::vector args, RTLIL::Design *design) YS_OVERRIDE { @@ -2170,8 +2178,10 @@ struct CxxrtlBackend : public Backend { extra_args(f, filename, args, argidx); switch (opt_level) { - case 5: + case 6: worker.max_opt_level = true; + worker.run_proc_flatten = true; + case 5: worker.run_opt_clean_purge = true; case 4: worker.localize_public = true; From 4aa0f450f52b595ba1327632a19dc3a989bf2438 Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 15:51:09 +0000 Subject: [PATCH 06/11] cxxrtl: use one delta cycle for immediately converging netlists. If it is statically known that eval() will converge in one delta cycle (that is, the second commit() will always return `false`) because the design contains no feedback or buffered wires, then there is no need to run the second delta cycle at all. After this commit, the case where eval() always converges immediately is detected and the second delta cycle is omitted. As a result, Minerva SRAM SoC runs ~25% faster. --- backends/cxxrtl/cxxrtl.cc | 25 +++++++++++++++++-------- backends/cxxrtl/cxxrtl.h | 7 ++++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index eb769ac3f..843c09b91 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -441,6 +441,7 @@ struct CxxrtlWorker { dict> schedule; pool localized_wires; dict> blackbox_specializations; + dict eval_converges; void inc_indent() { indent += "\t"; @@ -1108,7 +1109,7 @@ struct CxxrtlWorker { dump_sigspec_rhs(conn.second); f << ";\n"; } - f << indent << mangle(cell) << access << "eval();\n"; + f << indent << "converged &= " << mangle(cell) << access << "eval();\n"; for (auto conn : cell->connections()) { if (conn.second.is_wire()) { RTLIL::Wire *wire = conn.second.as_wire(); @@ -1382,6 +1383,7 @@ struct CxxrtlWorker { void dump_eval_method(RTLIL::Module *module) { inc_indent(); + f << indent << "bool converged = " << (eval_converges.at(module) ? "true" : "false") << ";\n"; if (!module->get_bool_attribute(ID(cxxrtl.blackbox))) { for (auto wire : module->wires()) dump_wire(wire, /*is_local_context=*/true); @@ -1399,6 +1401,7 @@ struct CxxrtlWorker { } } } + f << indent << "return converged;\n"; dec_indent(); } @@ -1474,7 +1477,7 @@ struct CxxrtlWorker { dump_wire(wire, /*is_local_context=*/false); } f << "\n"; - f << indent << "void eval() override {\n"; + f << indent << "bool eval() override {\n"; dump_eval_method(module); f << indent << "}\n"; f << "\n"; @@ -1542,7 +1545,7 @@ struct CxxrtlWorker { } if (has_cells) f << "\n"; - f << indent << "void eval() override;\n"; + f << indent << "bool eval() override;\n"; f << indent << "bool commit() override;\n"; dec_indent(); f << indent << "}; // struct " << mangle(module) << "\n"; @@ -1554,7 +1557,7 @@ struct CxxrtlWorker { { if (module->get_bool_attribute(ID(cxxrtl.blackbox))) return; - f << indent << "void " << mangle(module) << "::eval() {\n"; + f << indent << "bool " << mangle(module) << "::eval() {\n"; dump_eval_method(module); f << indent << "}\n"; f << "\n"; @@ -1687,6 +1690,10 @@ struct CxxrtlWorker { } } } + + // Black boxes converge by default, since their implementations are quite unlikely to require + // internal propagation of comb signals. + eval_converges[module] = true; continue; } @@ -1872,7 +1879,7 @@ struct CxxrtlWorker { // it is possible that a design with no feedback arcs would end up with doubly buffered wires in such cases // as a wire with multiple drivers where one of them is combinatorial and the other is synchronous. Such designs // also require more than one delta cycle to converge. - pool buffered_wires; + pool buffered_wires; for (auto wire : module->wires()) { if (flow.wire_comb_defs[wire].size() > 0 && !elided_wires.count(wire) && !localized_wires[wire]) { if (!feedback_wires[wire]) @@ -1885,6 +1892,8 @@ struct CxxrtlWorker { for (auto wire : buffered_wires) log(" %s\n", wire->name.c_str()); } + + eval_converges[module] = feedback_wires.empty() && buffered_wires.empty(); } if (has_feedback_arcs || has_buffered_wires) { // Although both non-feedback buffered combinatorial wires and apparent feedback wires may be eliminated @@ -2015,7 +2024,7 @@ struct CxxrtlBackend : public Backend { log(" value<8> p_i_data;\n"); log(" wire<8> p_o_data;\n"); log("\n"); - log(" void eval() override;\n"); + log(" bool eval() override;\n"); log(" bool commit() override;\n"); log("\n"); log(" static std::unique_ptr\n"); @@ -2028,11 +2037,11 @@ struct CxxrtlBackend : public Backend { log(" namespace cxxrtl_design {\n"); log("\n"); log(" struct stderr_debug : public bb_p_debug {\n"); - log(" void eval() override {\n"); + log(" bool eval() override {\n"); log(" if (posedge_p_clk() && p_en)\n"); log(" fprintf(stderr, \"debug: %%02x\\n\", p_i_data.data[0]);\n"); log(" p_o_data.next = p_i_data;\n"); - log(" bb_p_debug::eval();\n"); + log(" return bb_p_debug::eval();\n"); log(" }\n"); log(" };\n"); log("\n"); diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h index 41e6290d1..b79bbbc72 100644 --- a/backends/cxxrtl/cxxrtl.h +++ b/backends/cxxrtl/cxxrtl.h @@ -717,15 +717,16 @@ struct module { module(const module &) = delete; module &operator=(const module &) = delete; - virtual void eval() = 0; + virtual bool eval() = 0; virtual bool commit() = 0; size_t step() { size_t deltas = 0; + bool converged = false; do { - eval(); + converged = eval(); deltas++; - } while (commit()); + } while (commit() && !converged); return deltas; } }; From 164b0746d2111be17bd33b04852070fc4b65e48c Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 18:46:36 +0000 Subject: [PATCH 07/11] cxxrtl: s/sync_{wire,type}/edge_{wire,type}/. NFC. The attribute for this is called (* cxxrtl.edge *), and there is a planned attribute (* cxxrtl.sync *) that would cause blackbox cell outputs to be added to sync defs rather than comb defs. Rename the edge detector related stuff to avoid confusion. --- backends/cxxrtl/cxxrtl.cc | 46 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index 843c09b91..39c21ecd3 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -432,8 +432,8 @@ struct CxxrtlWorker { int temporary = 0; dict sigmaps; - pool sync_wires; - dict sync_types; + pool edge_wires; + dict edge_types; pool writable_memories; dict> transparent_for; dict> cell_wire_defs; @@ -1294,7 +1294,7 @@ struct CxxrtlWorker { dump_const_init(wire->attributes.at(ID::init)); } f << ";\n"; - if (sync_wires[wire]) { + if (edge_wires[wire]) { if (is_input_wire(wire)) { f << indent << "value<" << width << "> prev_" << mangle(wire); if (wire->has_attribute(ID::init)) { @@ -1303,27 +1303,27 @@ struct CxxrtlWorker { } f << ";\n"; } - for (auto sync_type : sync_types) { - if (sync_type.first.wire == wire) { + for (auto edge_type : edge_types) { + if (edge_type.first.wire == wire) { std::string prev, next; if (is_input_wire(wire)) { - prev = "prev_" + mangle(sync_type.first.wire); - next = mangle(sync_type.first.wire); + prev = "prev_" + mangle(edge_type.first.wire); + next = mangle(edge_type.first.wire); } else { - prev = mangle(sync_type.first.wire) + ".curr"; - next = mangle(sync_type.first.wire) + ".next"; + prev = mangle(edge_type.first.wire) + ".curr"; + next = mangle(edge_type.first.wire) + ".next"; } - prev += ".slice<" + std::to_string(sync_type.first.offset) + ">().val()"; - next += ".slice<" + std::to_string(sync_type.first.offset) + ">().val()"; - if (sync_type.second != RTLIL::STn) { - f << indent << "bool posedge_" << mangle(sync_type.first) << "() const {\n"; + prev += ".slice<" + std::to_string(edge_type.first.offset) + ">().val()"; + next += ".slice<" + std::to_string(edge_type.first.offset) + ">().val()"; + if (edge_type.second != RTLIL::STn) { + f << indent << "bool posedge_" << mangle(edge_type.first) << "() const {\n"; inc_indent(); f << indent << "return !" << prev << " && " << next << ";\n"; dec_indent(); f << indent << "}\n"; } - if (sync_type.second != RTLIL::STp) { - f << indent << "bool negedge_" << mangle(sync_type.first) << "() const {\n"; + if (edge_type.second != RTLIL::STp) { + f << indent << "bool negedge_" << mangle(edge_type.first) << "() const {\n"; inc_indent(); f << indent << "return " << prev << " && !" << next << ";\n"; dec_indent(); @@ -1413,7 +1413,7 @@ struct CxxrtlWorker { if (elided_wires.count(wire) || localized_wires.count(wire)) continue; if (is_input_wire(wire)) { - if (sync_wires[wire]) + if (edge_wires[wire]) f << indent << "prev_" << mangle(wire) << " = " << mangle(wire) << ";\n"; continue; } @@ -1646,11 +1646,11 @@ struct CxxrtlWorker { log_assert(type == RTLIL::STp || type == RTLIL::STn || type == RTLIL::STe); RTLIL::SigBit sigbit = signal[0]; - if (!sync_types.count(sigbit)) - sync_types[sigbit] = type; - else if (sync_types[sigbit] != type) - sync_types[sigbit] = RTLIL::STe; - sync_wires.insert(signal.as_wire()); + if (!edge_types.count(sigbit)) + edge_types[sigbit] = type; + else if (edge_types[sigbit] != type) + edge_types[sigbit] = RTLIL::STe; + edge_wires.insert(signal.as_wire()); } void analyze_design(RTLIL::Design *design) @@ -1802,7 +1802,7 @@ struct CxxrtlWorker { if (wire->get_bool_attribute(ID::keep)) continue; if (wire->name.begins_with("$") && !elide_internal) continue; if (wire->name.begins_with("\\") && !elide_public) continue; - if (sync_wires[wire]) continue; + if (edge_wires[wire]) continue; log_assert(flow.wire_comb_defs[wire].size() == 1); elided_wires[wire] = **flow.wire_comb_defs[wire].begin(); } @@ -1868,7 +1868,7 @@ struct CxxrtlWorker { if (wire->get_bool_attribute(ID::keep)) continue; if (wire->name.begins_with("$") && !localize_internal) continue; if (wire->name.begins_with("\\") && !localize_public) continue; - if (sync_wires[wire]) continue; + if (edge_wires[wire]) continue; if (flow.wire_sync_defs.count(wire) > 0) continue; localized_wires.insert(wire); } From d22a8d157da855ca70ff89ead8406dcd629642fb Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 21:48:17 +0000 Subject: [PATCH 08/11] cxxrtl: add (*cxxrtl.{comb,sync}*) annotations on black box outputs. If the annotations are not used, this commit does not alter semantics at all, other than removing elision of outputs of black box cells. (Elision of such outputs is expected to be too rare to have any noticeable benefit, and the implementation was somewhat of a hack.) The (* cxxrtl.comb *) annotation alters the semantics of the output of the black box it is applied to such that, if the black box converges immediately, no additional delta cycle is necessary to propagate the computed combinatorial value upwards in hierarchy. The (* cxxrtl.sync *) annotation alters the semantics of the output of the black box it is applied to such as to remove any uses of the black box by the wires connected to this output, and break false feedback arcs arising from conservative modeling of dependencies of the black box. Although currently these attributes are only recognized on black boxes, if separate compilation is added in the future, it could also emit and consume them. --- backends/cxxrtl/cxxrtl.cc | 251 ++++++++++++++++++++++++++++---------- 1 file changed, 186 insertions(+), 65 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index 39c21ecd3..ab8888e08 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -171,6 +171,11 @@ struct Scheduler { } }; +bool is_input_wire(const RTLIL::Wire *wire) +{ + return wire->port_input && !wire->port_output; +} + bool is_unary_cell(RTLIL::IdString type) { return type.in( @@ -210,11 +215,54 @@ bool is_internal_cell(RTLIL::IdString type) return type[0] == '$' && !type.begins_with("$paramod\\"); } +bool is_cxxrtl_blackbox_cell(const RTLIL::Cell *cell) +{ + RTLIL::Module *cell_module = cell->module->design->module(cell->type); + log_assert(cell_module != nullptr); + return cell_module->get_bool_attribute(ID(cxxrtl.blackbox)); +} + +enum class CxxrtlPortType { + UNKNOWN = 0, // or mixed comb/sync + COMB = 1, + SYNC = 2, +}; + +CxxrtlPortType cxxrtl_port_type(const RTLIL::Cell *cell, RTLIL::IdString port) +{ + RTLIL::Module *cell_module = cell->module->design->module(cell->type); + if (cell_module == nullptr || !cell_module->get_bool_attribute(ID(cxxrtl.blackbox))) + return CxxrtlPortType::UNKNOWN; + RTLIL::Wire *cell_output_wire = cell_module->wire(port); + log_assert(cell_output_wire != nullptr); + bool is_comb = cell_output_wire->get_bool_attribute(ID(cxxrtl.comb)); + bool is_sync = cell_output_wire->get_bool_attribute(ID(cxxrtl.sync)); + if (is_comb && is_sync) + log_cmd_error("Port `%s.%s' is marked as both `cxxrtl.comb` and `cxxrtl.sync`.\n", + log_id(cell_module), log_signal(cell_output_wire)); + else if (is_comb) + return CxxrtlPortType::COMB; + else if (is_sync) + return CxxrtlPortType::SYNC; + return CxxrtlPortType::UNKNOWN; +} + +bool is_cxxrtl_comb_port(const RTLIL::Cell *cell, RTLIL::IdString port) +{ + return cxxrtl_port_type(cell, port) == CxxrtlPortType::COMB; +} + +bool is_cxxrtl_sync_port(const RTLIL::Cell *cell, RTLIL::IdString port) +{ + return cxxrtl_port_type(cell, port) == CxxrtlPortType::SYNC; +} + struct FlowGraph { struct Node { enum class Type { CONNECT, - CELL, + CELL_SYNC, + CELL_EVAL, PROCESS }; @@ -234,17 +282,17 @@ struct FlowGraph { delete node; } - void add_defs(Node *node, const RTLIL::SigSpec &sig, bool is_sync, bool elidable) + void add_defs(Node *node, const RTLIL::SigSpec &sig, bool fully_sync, bool elidable) { for (auto chunk : sig.chunks()) if (chunk.wire) { - if (is_sync) + if (fully_sync) wire_sync_defs[chunk.wire].insert(node); else wire_comb_defs[chunk.wire].insert(node); } // Only comb defs of an entire wire in the right order can be elided. - if (!is_sync && sig.is_wire()) + if (!fully_sync && sig.is_wire()) wire_def_elidable[sig.as_wire()] = elidable; } @@ -272,7 +320,7 @@ struct FlowGraph { // Connections void add_connect_defs_uses(Node *node, const RTLIL::SigSig &conn) { - add_defs(node, conn.first, /*is_sync=*/false, /*elidable=*/true); + add_defs(node, conn.first, /*fully_sync=*/false, /*elidable=*/true); add_uses(node, conn.second); } @@ -287,21 +335,59 @@ struct FlowGraph { } // Cells - void add_cell_defs_uses(Node *node, const RTLIL::Cell *cell) + void add_cell_sync_defs(Node *node, const RTLIL::Cell *cell) + { + // To understand why this node type is necessary and why it produces comb defs, consider a cell + // with input \i and sync output \o, used in a design such that \i is connected to \o. This does + // not result in a feedback arc because the output is synchronous. However, a naive implementation + // of code generation for cells that assigns to inputs, evaluates cells, assigns from outputs + // would not be able to immediately converge... + // + // wire<1> i_tmp; + // cell->p_i = i_tmp.curr; + // cell->eval(); + // i_tmp.next = cell->p_o.curr; + // + // ... since the wire connecting the input and output ports would not be localizable. To solve + // this, the cell is split into two scheduling nodes; one exclusively for sync outputs, and + // another for inputs and all non-sync outputs. This way the generated code can be rearranged... + // + // value<1> i_tmp; + // i_tmp = cell->p_o.curr; + // cell->p_i = i_tmp; + // cell->eval(); + // + // eliminating the unnecessary delta cycle. Conceptually, the CELL_SYNC node type is a series of + // connections of the form `connect \lhs \cell.\sync_output`; the right-hand side of these is not + // as a wire in RTLIL. If it was expressible, then `\cell.\sync_output` would have a sync def, + // and this node would be an ordinary CONNECT node, with `\lhs` having a comb def. Because it isn't, + // a special node type is used, the right-hand side does not appear anywhere, and the left-hand + // side has a comb def. + for (auto conn : cell->connections()) + if (cell->output(conn.first)) + if (is_cxxrtl_sync_port(cell, conn.first)) { + // See note regarding elidability below. + add_defs(node, conn.second, /*fully_sync=*/false, /*elidable=*/false); + } + } + + void add_cell_eval_defs_uses(Node *node, const RTLIL::Cell *cell) { - log_assert(cell->known()); for (auto conn : cell->connections()) { if (cell->output(conn.first)) { if (is_elidable_cell(cell->type)) - add_defs(node, conn.second, /*is_sync=*/false, /*elidable=*/true); + add_defs(node, conn.second, /*fully_sync=*/false, /*elidable=*/true); else if (is_sync_ff_cell(cell->type) || (cell->type == ID($memrd) && cell->getParam(ID::CLK_ENABLE).as_bool())) - add_defs(node, conn.second, /*is_sync=*/true, /*elidable=*/false); + add_defs(node, conn.second, /*fully_sync=*/true, /*elidable=*/false); else if (is_internal_cell(cell->type)) - add_defs(node, conn.second, /*is_sync=*/false, /*elidable=*/false); - else { - // Unlike outputs of internal cells (which generate code that depends on the ability to set the output - // wire bits), outputs of user cells are normal wires, and the wires connected to them can be elided. - add_defs(node, conn.second, /*is_sync=*/false, /*elidable=*/true); + add_defs(node, conn.second, /*fully_sync=*/false, /*elidable=*/false); + else if (!is_cxxrtl_sync_port(cell, conn.first)) { + // Although at first it looks like outputs of user-defined cells may always be elided, the reality is + // more complex. Fully sync outputs produce no defs and so don't participate in elision. Fully comb + // outputs are assigned in a different way depending on whether the cell's eval() immediately converged. + // Unknown/mixed outputs could be elided, but should be rare in practical designs and don't justify + // the infrastructure required to elide outputs of cells with many of them. + add_defs(node, conn.second, /*fully_sync=*/false, /*elidable=*/false); } } if (cell->input(conn.first)) @@ -311,11 +397,27 @@ struct FlowGraph { Node *add_node(const RTLIL::Cell *cell) { + log_assert(cell->known()); + + bool has_fully_sync_outputs = false; + for (auto conn : cell->connections()) + if (cell->output(conn.first) && is_cxxrtl_sync_port(cell, conn.first)) { + has_fully_sync_outputs = true; + break; + } + if (has_fully_sync_outputs) { + Node *node = new Node; + node->type = Node::Type::CELL_SYNC; + node->cell = cell; + nodes.push_back(node); + add_cell_sync_defs(node, cell); + } + Node *node = new Node; - node->type = Node::Type::CELL; + node->type = Node::Type::CELL_EVAL; node->cell = cell; nodes.push_back(node); - add_cell_defs_uses(node, cell); + add_cell_eval_defs_uses(node, cell); return node; } @@ -360,18 +462,6 @@ struct FlowGraph { } }; -bool is_input_wire(const RTLIL::Wire *wire) -{ - return wire->port_input && !wire->port_output; -} - -bool is_cxxrtl_blackbox_cell(const RTLIL::Cell *cell) -{ - RTLIL::Module *cell_module = cell->module->design->module(cell->type); - log_assert(cell_module != nullptr); - return cell_module->get_bool_attribute(ID(cxxrtl.blackbox)); -} - std::vector split_by(const std::string &str, const std::string &sep) { std::vector result; @@ -436,7 +526,6 @@ struct CxxrtlWorker { dict edge_types; pool writable_memories; dict> transparent_for; - dict> cell_wire_defs; dict elided_wires; dict> schedule; pool localized_wires; @@ -681,13 +770,9 @@ struct CxxrtlWorker { case FlowGraph::Node::Type::CONNECT: dump_connect_elided(node.connect); break; - case FlowGraph::Node::Type::CELL: - if (is_elidable_cell(node.cell->type)) { - dump_cell_elided(node.cell); - } else { - const char *access = is_cxxrtl_blackbox_cell(node.cell) ? "->" : "."; - f << mangle(node.cell) << access << mangle_wire_name(cell_wire_defs[node.cell][chunk.wire]) << ".curr"; - } + case FlowGraph::Node::Type::CELL_EVAL: + log_assert(is_elidable_cell(node.cell->type)); + dump_cell_elided(node.cell); break; default: log_assert(false); @@ -752,8 +837,8 @@ struct CxxrtlWorker { case FlowGraph::Node::Type::CONNECT: collect_connect(node.connect, cells); break; - case FlowGraph::Node::Type::CELL: - collect_cell(node.cell, cells); + case FlowGraph::Node::Type::CELL_EVAL: + collect_cell_eval(node.cell, cells); break; default: log_assert(false); @@ -792,6 +877,19 @@ struct CxxrtlWorker { f << ";\n"; } + void dump_cell_sync(const RTLIL::Cell *cell) + { + const char *access = is_cxxrtl_blackbox_cell(cell) ? "->" : "."; + f << indent << "// cell " << cell->name.str() << " syncs\n"; + for (auto conn : cell->connections()) + if (cell->output(conn.first)) + if (is_cxxrtl_sync_port(cell, conn.first)) { + f << indent; + dump_sigspec_lhs(conn.second); + f << " = " << mangle(cell) << access << mangle_wire_name(conn.first) << ".curr;\n"; + } + } + void dump_cell_elided(const RTLIL::Cell *cell) { // Unary cells @@ -845,7 +943,7 @@ struct CxxrtlWorker { elided_wires.count(cell->getPort(ID::Y).as_wire()); } - void collect_cell(const RTLIL::Cell *cell, std::vector &cells) + void collect_cell_eval(const RTLIL::Cell *cell, std::vector &cells) { if (!is_cell_elided(cell)) return; @@ -856,7 +954,7 @@ struct CxxrtlWorker { collect_sigspec_rhs(port.second, cells); } - void dump_cell(const RTLIL::Cell *cell) + void dump_cell_eval(const RTLIL::Cell *cell) { if (is_cell_elided(cell)) return; @@ -1109,21 +1207,42 @@ struct CxxrtlWorker { dump_sigspec_rhs(conn.second); f << ";\n"; } - f << indent << "converged &= " << mangle(cell) << access << "eval();\n"; - for (auto conn : cell->connections()) { - if (conn.second.is_wire()) { - RTLIL::Wire *wire = conn.second.as_wire(); - if (elided_wires.count(wire) && cell_wire_defs[cell].count(wire)) - continue; + auto assign_from_outputs = [&](bool cell_converged) { + for (auto conn : cell->connections()) { + if (cell->output(conn.first)) { + if (conn.second.empty()) + continue; // ignore disconnected ports + if (is_cxxrtl_sync_port(cell, conn.first)) + continue; // fully sync ports are handled in CELL_SYNC nodes + f << indent; + dump_sigspec_lhs(conn.second); + f << " = " << mangle(cell) << access << mangle_wire_name(conn.first); + // Similarly to how there is no purpose to buffering cell inputs, there is also no purpose to buffering + // combinatorial cell outputs in case the cell converges within one cycle. (To convince yourself that + // this optimization is valid, consider that, since the cell converged within one cycle, it would not + // have any buffered wires if they were not output ports. Imagine inlining the cell's eval() function, + // and consider the fate of the localized wires that used to be output ports.) + // + // Unlike cell inputs (which are never buffered), it is not possible to know apriori whether the cell + // (which may be late bound) will converge immediately. Because of this, the choice between using .curr + // (appropriate for buffered outputs) and .next (appropriate for unbuffered outputs) is made at runtime. + if (cell_converged && is_cxxrtl_comb_port(cell, conn.first)) + f << ".next;\n"; + else + f << ".curr;\n"; + } } - if (cell->output(conn.first)) { - if (conn.second.empty()) - continue; // ignore disconnected ports - f << indent; - dump_sigspec_lhs(conn.second); - f << " = " << mangle(cell) << access << mangle_wire_name(conn.first) << ".curr;\n"; - } - } + }; + f << indent << "if (" << mangle(cell) << access << "eval()) {\n"; + inc_indent(); + assign_from_outputs(/*cell_converged=*/true); + dec_indent(); + f << indent << "} else {\n"; + inc_indent(); + f << indent << "converged = false;\n"; + assign_from_outputs(/*cell_converged=*/false); + dec_indent(); + f << indent << "}\n"; } } @@ -1392,8 +1511,11 @@ struct CxxrtlWorker { case FlowGraph::Node::Type::CONNECT: dump_connect(node.connect); break; - case FlowGraph::Node::Type::CELL: - dump_cell(node.cell); + case FlowGraph::Node::Type::CELL_SYNC: + dump_cell_sync(node.cell); + break; + case FlowGraph::Node::Type::CELL_EVAL: + dump_cell_eval(node.cell); break; case FlowGraph::Node::Type::PROCESS: dump_process(node.process); @@ -1807,14 +1929,6 @@ struct CxxrtlWorker { elided_wires[wire] = **flow.wire_comb_defs[wire].begin(); } - // Elided wires that are outputs of internal cells are always connected to a well known port (Y). - // For user cells, there could be multiple of them, and we need a way to look up the port name - // knowing only the wire. - for (auto cell : module->cells()) - for (auto conn : cell->connections()) - if (conn.second.is_wire() && elided_wires.count(conn.second.as_wire())) - cell_wire_defs[cell][conn.second.as_wire()] = conn.first; - dict, hash_ptr_ops> node_defs; for (auto wire_comb_def : flow.wire_comb_defs) for (auto node : wire_comb_def.second) @@ -2012,7 +2126,7 @@ struct CxxrtlBackend : public Backend { log(" (* cxxrtl.edge = \"p\" *) input clk;\n"); log(" input en;\n"); log(" input [7:0] i_data;\n"); - log(" output [7:0] o_data;\n"); + log(" (* cxxrtl.sync *) output [7:0] o_data;\n"); log(" endmodule\n"); log("\n"); log("For this HDL interface, this backend will generate the following C++ interface:\n"); @@ -2117,6 +2231,13 @@ struct CxxrtlBackend : public Backend { log(" only valid on ports of black boxes. must be a constant expression, which\n"); log(" is directly inserted into generated code.\n"); log("\n"); + log(" cxxrtl.comb, cxxrtl.sync\n"); + log(" only valid on outputs of black boxes. if specified, indicates that every\n"); + log(" bit of the output port is driven, correspondingly, by combinatorial or\n"); + log(" synchronous logic. this knowledge is used for scheduling optimizations.\n"); + log(" if neither is specified, the output will be pessimistically treated as\n"); + log(" driven by both combinatorial and synchronous logic.\n"); + log("\n"); log("The following options are supported by this backend:\n"); log("\n"); log(" -header\n"); From 5f17e0ced552afb0a27bd244727ce58c24f45e95 Mon Sep 17 00:00:00 2001 From: whitequark Date: Tue, 21 Apr 2020 23:42:56 +0000 Subject: [PATCH 09/11] cxxrtl: use log_id() where appropriate. NFC. --- backends/cxxrtl/cxxrtl.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index ab8888e08..3cdc5ab6b 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -1971,9 +1971,9 @@ struct CxxrtlWorker { if (!feedback_wires.empty()) { has_feedback_arcs = true; - log("Module `%s' contains feedback arcs through wires:\n", module->name.c_str()); + log("Module `%s' contains feedback arcs through wires:\n", log_id(module)); for (auto wire : feedback_wires) - log(" %s\n", wire->name.c_str()); + log(" %s\n", log_id(wire)); } for (auto wire : module->wires()) { @@ -2002,9 +2002,9 @@ struct CxxrtlWorker { } if (!buffered_wires.empty()) { has_buffered_wires = true; - log("Module `%s' contains buffered combinatorial wires:\n", module->name.c_str()); + log("Module `%s' contains buffered combinatorial wires:\n", log_id(module)); for (auto wire : buffered_wires) - log(" %s\n", wire->name.c_str()); + log(" %s\n", log_id(wire)); } eval_converges[module] = feedback_wires.empty() && buffered_wires.empty(); From 1d5b6ac253a397d173b92549d420d898f80e577f Mon Sep 17 00:00:00 2001 From: whitequark Date: Wed, 22 Apr 2020 01:15:27 +0000 Subject: [PATCH 10/11] cxxrtl: add an unsupported knob for manipulating clock trees. This is quite possibly the worst way to implement this, but it does work for a subset of well-behaved designs, and can be used to measure how much performance is lost simulating the inactive edge of a clock. It should be replaced with a clock tree analyzer generating safe code once it is clear how should such a thing look like. --- backends/cxxrtl/cxxrtl.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index 3cdc5ab6b..bb473cefb 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -1202,6 +1202,24 @@ struct CxxrtlWorker { f << indent << mangle(cell) << access << mangle_wire_name(conn.first) << " = "; dump_sigspec_rhs(conn.second); f << ";\n"; + if (getenv("CXXRTL_VOID_MY_WARRANTY")) { + // Until we have proper clock tree detection, this really awful hack that opportunistically + // propagates prev_* values for clocks can be used to estimate how much faster a design could + // be if only one clock edge was simulated by replacing: + // top.p_clk = value<1>{0u}; top.step(); + // top.p_clk = value<1>{1u}; top.step(); + // with: + // top.prev_p_clk = value<1>{0u}; top.p_clk = value<1>{1u}; top.step(); + // Don't rely on this; it will be removed without warning. + RTLIL::Module *cell_module = cell->module->design->module(cell->type); + if (cell_module != nullptr && cell_module->wire(conn.first) && conn.second.is_wire()) { + RTLIL::Wire *cell_module_wire = cell_module->wire(conn.first); + if (edge_wires[conn.second.as_wire()] && edge_wires[cell_module_wire]) { + f << indent << mangle(cell) << access << "prev_" << mangle(cell_module_wire) << " = "; + f << "prev_" << mangle(conn.second.as_wire()) << ";\n"; + } + } + } } else if (cell->input(conn.first)) { f << indent << mangle(cell) << access << mangle_wire_name(conn.first) << ".next = "; dump_sigspec_rhs(conn.second); From 93288b8eaea3e346275082352edeea5cfb4ac38a Mon Sep 17 00:00:00 2001 From: whitequark Date: Wed, 22 Apr 2020 12:45:19 +0000 Subject: [PATCH 11/11] cxxrtl: run edge detectors only once in eval(). As a result, Minerva SRAM SoC runs ~15% faster. --- backends/cxxrtl/cxxrtl.cc | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl.cc index bb473cefb..237700b29 100644 --- a/backends/cxxrtl/cxxrtl.cc +++ b/backends/cxxrtl/cxxrtl.cc @@ -1020,7 +1020,7 @@ struct CxxrtlWorker { RTLIL::SigBit clk_bit = cell->getPort(ID::CLK)[0]; clk_bit = sigmaps[clk_bit.wire->module](clk_bit); f << indent << "if (" << (cell->getParam(ID::CLK_POLARITY).as_bool() ? "posedge_" : "negedge_") - << mangle(clk_bit) << "()) {\n"; + << mangle(clk_bit) << ") {\n"; inc_indent(); if (cell->type == ID($dffe)) { f << indent << "if ("; @@ -1097,7 +1097,7 @@ struct CxxrtlWorker { RTLIL::SigBit clk_bit = cell->getPort(ID::CLK)[0]; clk_bit = sigmaps[clk_bit.wire->module](clk_bit); f << indent << "if (" << (cell->getParam(ID::CLK_POLARITY).as_bool() ? "posedge_" : "negedge_") - << mangle(clk_bit) << "()) {\n"; + << mangle(clk_bit) << ") {\n"; inc_indent(); } RTLIL::Memory *memory = cell->module->memories[cell->getParam(ID::MEMID).decode_string()]; @@ -1365,16 +1365,16 @@ struct CxxrtlWorker { switch (sync->type) { case RTLIL::STp: log_assert(sync_bit.wire != nullptr); - events.insert("posedge_" + mangle(sync_bit) + "()"); + events.insert("posedge_" + mangle(sync_bit)); break; case RTLIL::STn: log_assert(sync_bit.wire != nullptr); - events.insert("negedge_" + mangle(sync_bit) + "()"); + events.insert("negedge_" + mangle(sync_bit)); break; case RTLIL::STe: log_assert(sync_bit.wire != nullptr); - events.insert("posedge_" + mangle(sync_bit) + "()"); - events.insert("negedge_" + mangle(sync_bit) + "()"); + events.insert("posedge_" + mangle(sync_bit)); + events.insert("negedge_" + mangle(sync_bit)); break; case RTLIL::STa: @@ -1522,6 +1522,22 @@ struct CxxrtlWorker { inc_indent(); f << indent << "bool converged = " << (eval_converges.at(module) ? "true" : "false") << ";\n"; if (!module->get_bool_attribute(ID(cxxrtl.blackbox))) { + for (auto wire : module->wires()) { + if (edge_wires[wire]) { + for (auto edge_type : edge_types) { + if (edge_type.first.wire == wire) { + if (edge_type.second != RTLIL::STn) { + f << indent << "bool posedge_" << mangle(edge_type.first) << " = "; + f << "this->posedge_" << mangle(edge_type.first) << "();\n"; + } + if (edge_type.second != RTLIL::STp) { + f << indent << "bool negedge_" << mangle(edge_type.first) << " = "; + f << "this->negedge_" << mangle(edge_type.first) << "();\n"; + } + } + } + } + } for (auto wire : module->wires()) dump_wire(wire, /*is_local_context=*/true); for (auto node : schedule[module]) {