add PolarFire FPGA support

2025-08-15 23:35:28 +00:00 · 2024-07-02 12:44:30 -07:00 · 2024-07-02 12:44:30 -07:00 · acddc36389
commit acddc36389
parent a739e21a5f
59 changed files with 5389 additions and 0 deletions
--- a/passes/pmgen/Makefile.inc
+++ b/passes/pmgen/Makefile.inc
@ -37,6 +37,17 @@ $(eval $(call add_extra_objs,passes/pmgen/xilinx_dsp_cascade_pm.h))

 # --------------------------------------

+# mchp_dsp pass: the object file plus the three pmgen-generated headers
+# (main, CREG and cascade pattern matchers) that mchp_dsp.cc includes
+OBJS += passes/pmgen/mchp_dsp.o
+GENFILES += passes/pmgen/mchp_dsp_pm.h
+GENFILES += passes/pmgen/mchp_dsp_CREG_pm.h
+GENFILES += passes/pmgen/mchp_dsp_cascade_pm.h
+passes/pmgen/mchp_dsp.o: passes/pmgen/mchp_dsp_pm.h passes/pmgen/mchp_dsp_CREG_pm.h passes/pmgen/mchp_dsp_cascade_pm.h
+$(eval $(call add_extra_objs,passes/pmgen/mchp_dsp_pm.h))
+$(eval $(call add_extra_objs,passes/pmgen/mchp_dsp_CREG_pm.h))
+$(eval $(call add_extra_objs,passes/pmgen/mchp_dsp_cascade_pm.h))
+
+# --------------------------------------
+
 OBJS += passes/pmgen/peepopt.o
 GENFILES += passes/pmgen/peepopt_pm.h
 passes/pmgen/peepopt.o: passes/pmgen/peepopt_pm.h
--- a/passes/pmgen/mchp_dsp.cc
+++ b/passes/pmgen/mchp_dsp.cc
@ -0,0 +1,373 @@
+/*
+ISC License
+
+Copyright (C) 2024 Microchip Technology Inc. and its subsidiaries
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#include "kernel/yosys.h"
+#include "kernel/sigtools.h"
+#include <deque>
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+#include "passes/pmgen/mchp_dsp_pm.h"
+#include "passes/pmgen/mchp_dsp_CREG_pm.h"
+#include "passes/pmgen/mchp_dsp_cascade_pm.h"
+
+// Callback for the `mchp_dsp_pack` pattern. Rewrites the matched MACC_PA cell
+//   (st.dsp) so that it absorbs whatever the matcher captured:
+//     - a static pre-adder (ports B/D and bit 0 of PASUB),
+//     - a post-adder (bit 0 of SUB, plus either port C or CDIN_FDBK_SEL when
+//       the accumulator feedback was detected),
+//     - registers on A, B, D and P (enable, active-low reset and *_BYPASS
+//       ports of the cell).
+//   Absorbed adders are auto-removed; flops are left in place (see the NOTE
+//   further down). The cell is blacklisted at the end.
+void mchp_dsp_pack(mchp_dsp_pm &pm)
+{
+	auto &st = pm.st_mchp_dsp_pack;
+
+	log("Analysing %s.%s for MCHP MACC_PA packing.\n", log_id(pm.module), log_id(st.dsp));
+
+	Cell *cell = st.dsp;
+	//pack pre-adder
+	if (st.preAdderStatic) {
+		log("  static PASUB preadder %s (%s)\n", log_id(st.preAdderStatic), log_id(st.preAdderStatic->type));
+		// The matcher routes adder port A to DSP port B and adder port B to DSP
+		//   port D, hence the crossed signedness parameters.
+		bool D_SIGNED = st.preAdderStatic->getParam(ID::B_SIGNED).as_bool();
+		bool B_SIGNED = st.preAdderStatic->getParam(ID::A_SIGNED).as_bool();
+		st.sigB.extend_u0(18, B_SIGNED);
+		st.sigD.extend_u0(18, D_SIGNED);
+		if (st.moveBtoA)
+		{
+			cell->setPort(ID::A, st.sigA); // if pre-adder feeds into A, original sigB will be moved to port A
+		}
+		cell->setPort(ID::B, st.sigB);
+		cell->setPort(ID::D, st.sigD);
+
+		// Fetch PASUB only now: setPort() may insert a new entry into
+		//   connections_, which can invalidate a reference obtained earlier.
+		SigSpec &pasub = cell->connections_.at(ID(PASUB));
+		// MACC_PA supports both addition and subtraction with the pre-adder.
+		//   Affects the sign of the 'D' port.
+		if (st.preAdderStatic->type == ID($add))
+			pasub[0] = State::S0;
+		else if (st.preAdderStatic->type == ID($sub))
+			pasub[0] = State::S1;
+		else
+			log_assert(!"strange pre-adder type");
+
+		pm.autoremove(st.preAdderStatic);
+	}
+	//pack post-adder
+	if (st.postAdderStatic) {
+		log("  postadder %s (%s)\n", log_id(st.postAdderStatic), log_id(st.postAdderStatic->type));
+		// Used immediately, before any further setPort() call on this cell.
+		SigSpec &sub = cell->connections_.at(ID(SUB));
+		// Post-adder in MACC_PA also supports subtraction
+		//   Determines the sign of the output from the multiplier.
+		if (st.postAdderStatic->type == ID($add))
+			sub[0] = State::S0;
+		else if (st.postAdderStatic->type == ID($sub))
+			sub[0] = State::S1;
+		else
+			log_assert(!"strange post-adder type");
+
+		if (st.useFeedBack) {
+			// accumulator: PREG feeds the post-adder, so port C is not needed
+			cell->setPort(ID(CDIN_FDBK_SEL), {State::S0, State::S1});
+		} else {
+			st.sigC.extend_u0(48, st.postAdderStatic->getParam(ID::A_SIGNED).as_bool());
+			cell->setPort(ID::C, st.sigC);
+		}
+
+		pm.autoremove(st.postAdderStatic);
+	}
+
+	// pack registers
+	if (st.clock != SigBit())
+	{
+		cell->setPort(ID::CLK, st.clock);
+
+		// Absorb register `ff` into the DSP cell.
+		//   sig     - DSP-side signal in which ff's Q bits are replaced by its D
+		//             bits (left untouched when empty)
+		//   ceport  - DSP clock-enable port (active high)
+		//   rstport - DSP reset port (active low); skipped when empty
+		//   bypass  - DSP bypass port, driven to 0
+		//   The init attribute bits of the absorbed Q wires are set to 'x'.
+		auto absorb_ff = [&pm,cell](SigSpec &sig, Cell* ff, IdString ceport, IdString rstport, IdString bypass) {
+
+			// input/output ports
+			SigSpec D = ff->getPort(ID::D);
+			SigSpec Q = pm.sigmap(ff->getPort(ID::Q));
+
+			if (!sig.empty())
+				sig.replace(Q, D);
+			if (rstport != IdString()) {
+				if (ff->type.in(ID($sdff), ID($sdffe))) {
+					SigSpec srst = ff->getPort(ID::SRST);
+					bool rstpol_n = !ff->getParam(ID::SRST_POLARITY).as_bool();
+					// active low sync rst
+					cell->setPort(rstport, rstpol_n ? srst : pm.module->Not(NEW_ID, srst));
+				} else if (ff->type.in(ID($adff), ID($adffe))) {
+					SigSpec arst = ff->getPort(ID::ARST);
+					bool rstpol_n = !ff->getParam(ID::ARST_POLARITY).as_bool();
+					// active low async rst
+					cell->setPort(rstport, rstpol_n ? arst : pm.module->Not(NEW_ID, arst));
+				} else {
+					// active low async/sync rst
+					cell->setPort(rstport, State::S1);
+				}
+			}
+			if (ff->type.in(ID($dffe), ID($sdffe), ID($adffe))) {
+				SigSpec ce = ff->getPort(ID::EN);
+				bool cepol = ff->getParam(ID::EN_POLARITY).as_bool();
+				// enables are all active high
+				cell->setPort(ceport, cepol ? ce : pm.module->Not(NEW_ID, ce));
+			}
+			else {
+				// enables are all active high
+				cell->setPort(ceport, State::S1);
+			}
+
+			// bypass set to 0
+			cell->setPort(bypass, State::S0);
+
+			// The matcher only accepts init values of 0 or x; turn them into x
+			for (auto c : Q.chunks()) {
+				auto it = c.wire->attributes.find(ID::init);
+				if (it == c.wire->attributes.end())
+					continue;
+				for (int i = c.offset; i < c.offset+c.width; i++) {
+					log_assert(it->second[i] == State::S0 || it->second[i] == State::Sx);
+					it->second[i] = State::Sx;
+				}
+			}
+		};
+
+		// NOTE: flops are not autoremoved because it is possible that they
+		//       are only partially absorbed into DSP, or have fanouts.
+		if (st.ffA) {
+			SigSpec A = cell->getPort(ID::A);
+			absorb_ff(A, st.ffA, ID(A_EN), ID(A_SRST_N), ID(A_BYPASS));
+			pm.add_siguser(A, cell);
+			cell->setPort(ID::A, A);
+		}
+		if (st.ffB) {
+			SigSpec B = cell->getPort(ID::B);
+			absorb_ff(B, st.ffB, ID(B_EN), ID(B_SRST_N), ID(B_BYPASS));
+			pm.add_siguser(B, cell);
+			cell->setPort(ID::B, B);
+		}
+		if (st.ffD) {
+			SigSpec D = cell->getPort(ID::D);
+			// REG_D may be an async-reset flop; pick the matching reset port
+			if (st.ffD->type.in(ID($adff), ID($adffe))) {
+				absorb_ff(D, st.ffD, ID(D_EN), ID(D_ARST_N), ID(D_BYPASS));
+			} else {
+				absorb_ff(D, st.ffD, ID(D_EN), ID(D_SRST_N), ID(D_BYPASS));
+			}
+
+			pm.add_siguser(D, cell);
+			cell->setPort(ID::D, D);
+		}
+		if (st.ffP) {
+			SigSpec P; // unused
+			absorb_ff(P, st.ffP, ID(P_EN), ID(P_SRST_N), ID(P_BYPASS));
+			// Detach the flop's Q from sigP; the DSP 'P' port drives sigP below
+			st.ffP->connections_.at(ID::Q).replace(st.sigP, pm.module->addWire(NEW_ID, GetSize(st.sigP)));
+		}
+
+		log("  clock: %s (%s)\n", log_signal(st.clock), "posedge");
+
+		if (st.ffA)
+			log(" \t ffA:%s\n", log_id(st.ffA));
+		if (st.ffB)
+			log(" \t ffB:%s\n", log_id(st.ffB));
+		if (st.ffD)
+			log(" \t ffD:%s\n", log_id(st.ffD));
+		if (st.ffP)
+			log(" \t ffP:%s\n", log_id(st.ffP));
+	}
+	log("\n");
+
+	// Pad 'P' with fresh (unused) wire bits up to the 48-bit port width
+	SigSpec P = st.sigP;
+	if (GetSize(P) < 48)
+		P.append(pm.module->addWire(NEW_ID, 48-GetSize(P)));
+	cell->setPort(ID::P, P);
+
+	pm.blacklist(cell);
+}
+
+// Callback for the `mchp_dsp_packC` pattern: absorbs the register that drives
+//   the 'C' input of the matched MACC_PA cell (CREG) into that cell.
+void mchp_dsp_packC(mchp_dsp_CREG_pm &pm)
+{
+	auto &st = pm.st_mchp_dsp_packC;
+	Cell *cell = st.dsp;
+
+	log_debug("Analysing %s.%s for MCHP DSP packing (REG_C).\n", log_id(pm.module), log_id(st.dsp));
+	log_debug("ffC:        %s\n", log_id(st.ffC, "--"));
+
+	// Without a clock there is nothing to absorb; only blacklist the cell
+	if (st.clock == SigBit()) {
+		pm.blacklist(cell);
+		return;
+	}
+
+	cell->setPort(ID::CLK, st.clock);
+
+	// Absorb register `ff` into the DSP cell: replaces ff's Q bits in `sig` by
+	//   its D bits, wires up the (active-low) reset, the (active-high) enable
+	//   and the bypass port, then sets the init bits of the Q wires to 'x'.
+	auto absorb_ff = [&pm, cell](SigSpec &sig, Cell *ff, IdString ceport, IdString rstport, IdString bypass) {
+		const SigSpec ff_d = ff->getPort(ID::D);
+		const SigSpec ff_q = pm.sigmap(ff->getPort(ID::Q));
+
+		if (!sig.empty())
+			sig.replace(ff_q, ff_d);
+
+		if (rstport != IdString()) {
+			const bool sync_rst = ff->type.in(ID($sdff), ID($sdffe));
+			const bool async_rst = ff->type.in(ID($adff), ID($adffe));
+			if (sync_rst || async_rst) {
+				SigSpec rst = ff->getPort(sync_rst ? ID::SRST : ID::ARST);
+				bool active_high = ff->getParam(sync_rst ? ID::SRST_POLARITY : ID::ARST_POLARITY).as_bool();
+				// the DSP reset is active low: invert an active-high flop reset
+				if (active_high)
+					cell->setPort(rstport, pm.module->Not(NEW_ID, rst));
+				else
+					cell->setPort(rstport, rst);
+			} else {
+				// flop has no reset: tie the active-low reset inactive
+				cell->setPort(rstport, State::S1);
+			}
+		}
+
+		// the DSP enable is active high
+		if (ff->type.in(ID($dffe), ID($sdffe), ID($adffe))) {
+			SigSpec en = ff->getPort(ID::EN);
+			if (ff->getParam(ID::EN_POLARITY).as_bool())
+				cell->setPort(ceport, en);
+			else
+				cell->setPort(ceport, pm.module->Not(NEW_ID, en));
+		} else {
+			cell->setPort(ceport, State::S1);
+		}
+
+		// register in use: bypass set to 0
+		cell->setPort(bypass, State::S0);
+
+		// init bits of the absorbed Q wires must be 0 or x; make them x
+		for (auto chunk : ff_q.chunks()) {
+			auto init = chunk.wire->attributes.find(ID::init);
+			if (init == chunk.wire->attributes.end())
+				continue;
+			const int stop = chunk.offset + chunk.width;
+			for (int bit = chunk.offset; bit < stop; bit++) {
+				log_assert(init->second[bit] == State::S0 || init->second[bit] == State::Sx);
+				init->second[bit] = State::Sx;
+			}
+		}
+	};
+
+	if (st.ffC) {
+		SigSpec C = cell->getPort(ID::C);
+
+		// async-reset flops use C_ARST_N, everything else C_SRST_N
+		if (st.ffC->type.in(ID($adff), ID($adffe)))
+			absorb_ff(C, st.ffC, ID(C_EN), ID(C_ARST_N), ID(C_BYPASS));
+		else
+			absorb_ff(C, st.ffC, ID(C_EN), ID(C_SRST_N), ID(C_BYPASS));
+
+		pm.add_siguser(C, cell);
+		cell->setPort(ID::C, C);
+	}
+
+	log("  clock: %s (%s)", log_signal(st.clock), "posedge");
+	if (st.ffC)
+		log(" ffC:%s", log_id(st.ffC));
+	log("\n");
+
+	pm.blacklist(cell);
+}
+
+// The `mchp_dsp` pass. For every selected module it runs, in order, the main
+//   packing matcher, the CREG packing matcher and the cascade matcher.
+struct MchpDspPass : public Pass {
+	MchpDspPass() : Pass("mchp_dsp", "MCHP: pack resources into DSPs") { }
+	void help() override
+	{
+		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
+		log("\n");
+		log("    mchp_dsp [options] [selection]\n");
+		log("\n");
+		log("Pack input registers 'A', 'B', 'C', and 'D' (with optional enable/reset),\n");
+		log("output register 'P' (with optional enable/reset), pre-adder and/or post-adder into\n");
+		log("MCHP DSP resources.\n");
+		log("\n");
+		log("Multiply-accumulate operations using the post-adder with feedback on the 'C'\n");
+		log("input will be folded into the DSP. In this scenario only, the 'C' input can be\n");
+		log("used to override the current accumulation result with a new value. This will\n");
+		log("be added to the multiplier result to form the next accumulation result.\n");
+		log("\n");
+		log("Use of the dedicated 'PCOUT' -> 'PCIN' cascade path is detected for 'P' -> 'C'\n");
+		log("connections (optionally, where 'P' is right-shifted by 17-bits and used as an\n");
+		log("input to the post-adder. This pattern is common for summing partial products to\n");
+		log("implement wide multipliers). Cascade chains are limited to a maximum length\n");
+		log("of 24 cells, corresponding to PolarFire (pf) devices.\n");
+		log("\n");
+		log("This pass is a no-op if the scratchpad variable 'mchp_dsp.multonly' is set\n");
+		log("to 1.\n");
+		log("\n");
+		log("\n");
+		log("    -family {pf}\n");
+		log("        select the family to target\n");
+		log("        default: pf\n");
+		log("\n");
+	}
+	void execute(std::vector<std::string> args, RTLIL::Design *design) override
+	{
+		log_header(design, "Executing MCHP_DSP pass (pack resources into DSPs).\n");
+
+		// NOTE: `family` is parsed for command-line compatibility but is not
+		//   consulted anywhere below; 'pf' is the only documented value.
+		std::string family = "pf";
+		size_t argidx;
+		for (argidx = 1; argidx < args.size(); argidx++)
+		{
+			if ((args[argidx] == "-family") && argidx+1 < args.size()) {
+				family = args[++argidx];
+				continue;
+			}
+			break;
+		}
+		extra_args(args, argidx, design);
+
+		for (auto module : design->selected_modules()) {
+
+			// Pass is a no-op when only multiplier inference was requested
+			if (design->scratchpad_get_bool("mchp_dsp.multonly"))
+				continue;
+
+			{
+				// For more details on PolarFire MACC_PA, consult
+				//   the "PolarFire FPGA Macro Library Guide"
+
+				// Main pattern matching step to capture a DSP cell.
+				//   Match for pre-adder, post-adder, as well as
+				//   registers 'A', 'B', 'D', and 'P'. Additionally,
+				//   check for an accumulator pattern based on whether
+				//   a post-adder and PREG are both present AND
+				//   if PREG feeds into this post-adder.
+				mchp_dsp_pm pm(module, module->selected_cells());
+				pm.run_mchp_dsp_pack(mchp_dsp_pack);
+			}
+
+			// Separating out CREG packing is necessary since there
+			//   is no guarantee that the cell ordering corresponds
+			//   to the "expected" case (i.e. the order in which
+			//   they appear in the source). There existed the possibility
+			//   where a register got packed as a CREG into a
+			//   downstream DSP that should have otherwise been a
+			//   PREG of an upstream DSP that had not been visited
+			//   yet
+			{
+				mchp_dsp_CREG_pm pm(module, module->selected_cells());
+				pm.run_mchp_dsp_packC(mchp_dsp_packC);
+			}
+
+			// Lastly, identify and utilise PCOUT -> PCIN chains
+			{
+				mchp_dsp_cascade_pm pm(module, module->selected_cells());
+				pm.run_mchp_dsp_cascade();
+			}
+
+		}
+	}
+} MchpDspPass;
+
+PRIVATE_NAMESPACE_END
--- a/passes/pmgen/mchp_dsp.pmg
+++ b/passes/pmgen/mchp_dsp.pmg
@ -0,0 +1,440 @@
+// ISC License
+// 
+// Copyright (C) 2024 Microchip Technology Inc. and its subsidiaries
+// 
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+
+// This file describes the main pattern matcher setup (of three total) that
+//   forms the `mchp_dsp` pass described in mchp_dsp.cc - version for
+//   the MACC_PA DSP primitive of Microchip PolarFire FPGAs.
+// At a high level, it works as follows:
+//   ( 1) Starting from a DSP cell. Capture DSP configurations as states
+//   ( 2) Match for pre-adder
+//   ( 3) Match for post-adder
+//   ( 4) Match register 'A', 'B', 'D', 'P' 
+//   ( 5) If post-adder and PREG both present, check if PREG feeds into post-adder.
+//        This indicates an accumulator situation like the ASCII diagram below:
+//             +--------------------------------+
+//             |_________                       |
+//                       | /-------\   +----+   |
+//            +----+     +-| post- |___|PREG|---+ 'P'
+//            |MULT|------ | adder |   +----+
+//            +----+       \-------/
+
+pattern mchp_dsp_pack
+
+state <SigBit> clock
+state <SigSpec> sigA sigB sigC sigD sigP
+state <Cell*> ffA ffB ffD ffP
+state <Cell*> preAdderStatic postAdderStatic
+state <bool> moveBtoA useFeedBack
+
+// static ports, used to detect dsp configuration
+state <SigSpec> bypassA bypassB bypassC bypassD bypassP
+state <SigSpec> bypassPASUB
+
+// Variables used for subpatterns
+state <SigSpec> argQ argD
+udata <bool> allowAsync
+udata <SigSpec> dffD dffQ
+udata <SigBit> dffclock
+udata <Cell*> dff
+udata <Cell*> u_preAdderStatic u_postAdderStatic
+udata <IdString> u_postAddAB
+state <IdString> postAddAB
+
+// (1) Starting from a DSP cell
+match dsp
+	select dsp->type.in(\MACC_PA)
+endmatch
+
+// detect existing signals connected to DSP
+// detect configuration ports
+code sigA sigB sigC sigD clock sigP
+	//helper function to remove unused bits
+	auto unextend = [](const SigSpec &sig) {
+		int i;
+		for (i = GetSize(sig)-1; i > 0; i--)
+			if (sig[i] != sig[i-1])
+				break;
+		// Do not remove non-const sign bit
+		if (sig[i].wire)
+			++i;
+		return sig.extract(0, i);
+	};
+
+	//unextend to remove unused bits
+	sigA = unextend(port(dsp, \A));
+	sigB = unextend(port(dsp, \B));
+
+	//update signals
+	sigC = port(dsp, \C, SigSpec());
+	sigD = port(dsp, \D, SigSpec());
+
+
+	SigSpec P = port(dsp, \P);
+	// Only care about bits that are used
+	int i;
+	for (i = GetSize(P)-1; i >= 0; i--)
+		if (nusers(P[i]) > 1)
+			break;
+	i++;
+	log_assert(nusers(P.extract_end(i)) <= 1);
+	// This sigP could have no users if downstream sinks (e.g. $add) is
+	//   narrower than $mul result, for example
+	if (i == 0)
+		reject;
+	sigP = P.extract(0, i);
+	clock = port(dsp, \CLK, SigBit());
+
+endcode
+
+// capture static configuration ports
+code bypassA bypassB bypassC bypassD bypassPASUB bypassP
+	bypassA = port(dsp, \A_BYPASS, SigSpec());
+	bypassB = port(dsp, \B_BYPASS, SigSpec());
+	bypassC = port(dsp, \C_BYPASS, SigSpec());
+	bypassD = port(dsp, \D_BYPASS, SigSpec());
+	bypassPASUB = port(dsp, \PASUB_BYPASS, SigSpec());
+	bypassP = port(dsp, \P_BYPASS, SigSpec());
+endcode
+
+// (2) Match for pre-adder
+// 
+code sigA sigB sigD preAdderStatic moveBtoA
+	subpattern(preAddMatching);
+	preAdderStatic = u_preAdderStatic;
+	moveBtoA = false;
+
+	if (preAdderStatic) {
+		
+		if (port(preAdderStatic, \Y) == sigA)
+		{
+			//used for packing
+			moveBtoA = true;
+
+			// sigA should be the input to the multiplier without the preAdd. sigB and sigD should be 
+			//the preAdd inputs. If our "A" input into the multiplier is from the preAdd (not sigA), then 
+			// we basically swap it.
+			sigA = port(dsp, \B);
+		}
+
+		// port B of preAdderStatic must be mapped to port D of DSP for subtraction
+		sigD = port(preAdderStatic, \B);
+		sigB = port(preAdderStatic, \A);
+	}
+endcode
+
+//  (3) Match for post-adder
+//
+code postAdderStatic sigP sigC
+	u_postAdderStatic = nullptr;
+	subpattern(postAddMatching);
+	postAdderStatic = u_postAdderStatic;
+
+	if (postAdderStatic) {
+		//sigC will be whichever input to the postAdder that is NOT from the multiplier
+		// u_postAddAB is the input to the postAdder from the multiplier 
+		sigC = port(postAdderStatic, u_postAddAB == \A ? \B : \A);
+		sigP = port(postAdderStatic, \Y);
+	}
+endcode
+
+
+// (4) Matching registers
+//
+// 'A' input for REG_A
+code argQ bypassA sigA clock ffA
+	if (bypassA.is_fully_ones()){
+		argQ = sigA;
+		allowAsync = false;
+		subpattern(in_dffe);
+		if (dff) {
+			ffA = dff;
+			clock = dffclock;
+			sigA = dffD;
+		}
+	}
+endcode
+
+// 'B' input for REG_B
+code argQ bypassB sigB clock ffB
+	if (bypassB.is_fully_ones()){
+		argQ = sigB;
+		allowAsync = false;
+		subpattern(in_dffe);
+		if (dff) {
+			ffB = dff;
+			clock = dffclock;
+			sigB = dffD;
+		}
+	}
+endcode
+
+// 'D' input for REG_D
+//   Only attempted while the D_BYPASS port is fully ones. Unlike REG_A and
+//   REG_B, flops with an async reset are accepted here (allowAsync = true).
+code argQ bypassD sigD clock ffD
+	if (bypassD.is_fully_ones()){
+		argQ = sigD;
+		allowAsync = true;
+		subpattern(in_dffe);
+		if (dff) {
+			ffD = dff;
+			clock = dffclock;
+			sigD = dffD;
+		}
+	}
+endcode
+
+// 'P' output for REG_P
+code argD ffP sigP clock bypassP
+	if (bypassP.is_fully_ones() && nusers(sigP) == 2) {
+		argD = sigP;
+		allowAsync = false;
+		subpattern(out_dffe);
+		if (dff) {
+			ffP = dff;
+			clock = dffclock;
+			sigP = dffQ;
+		}
+	}
+endcode
+
+// (5) If post-adder and PREG both present, check if PREG feeds into post-adder via port C.
+//        This indicates an accumulator situation. Port C can be freed
+//             +--------------------------------+
+//             |_________                       |
+//                       | /-------\   +----+   |
+//            +----+     +-| post- |___|PREG|---+ 'P'
+//            |MULT|------ | adder |   +----+
+//            +----+       \-------/
+code useFeedBack
+	useFeedBack = false;
+	if (postAdderStatic && ffP)	{
+		if (sigC == sigP) {
+			useFeedBack = true;
+		}
+	}
+
+endcode
+
+// if any cells are absorbed, invoke the callback function
+code
+	if (preAdderStatic || postAdderStatic)
+		accept;
+	if (ffA || ffB || ffD || ffP)
+		accept;
+endcode
+
+
+// #######################
+// Subpattern for matching against post-adder
+//   Match 'P' output that exclusively drives one of two inputs to an $add
+//   cell (post-adder).
+//   The other input to the adder is assumed to come in from the 'C' input
+
+subpattern postAddMatching
+arg sigP
+
+match postAdd
+
+	select postAdd->type.in($add, $sub)
+	select GetSize(port(postAdd, \Y)) <= 48
+
+	// AB is the port that connects MUL to ADD
+	choice <IdString> AB {\A, \B}
+	select nusers(port(postAdd, AB)) == 2
+
+	// has one input coming from multiplier
+	index <SigBit> port(postAdd, AB)[0] === sigP[0]
+	filter GetSize(port(postAdd, AB)) >= GetSize(sigP)
+	filter port(postAdd, AB).extract(0, GetSize(sigP)) == sigP
+	// Check that remainder of AB is a sign- or zero-extension
+	filter port(postAdd, AB).extract_end(GetSize(sigP)) == SigSpec(sigP[GetSize(sigP)-1], GetSize(port(postAdd, AB))-GetSize(sigP)) || port(postAdd, AB).extract_end(GetSize(sigP)) == SigSpec(State::S0, GetSize(port(postAdd, AB))-GetSize(sigP))
+
+	set postAddAB AB
+	// optional
+endmatch
+
+code
+	if (postAdd)
+	{
+		if (postAdd->type.in(ID($sub)) && postAddAB == \A) {
+			// if $sub, the multiplier output must match to $sub.B, otherwise no match	
+		} else {
+			u_postAddAB = postAddAB;
+			u_postAdderStatic = postAdd;
+		}
+
+	}
+endcode
+
+
+// #######################
+// Subpattern for matching against pre-adder
+//		support static PASUB only
+
+subpattern preAddMatching
+arg sigA sigB sigD bypassB bypassD bypassPASUB
+
+code 
+	u_preAdderStatic = nullptr;
+
+	// Ensure that preAdder not already used
+	// Assume we can inspect port D to see if its all zeros. 
+	if (!(sigD.empty() || sigD.is_fully_zero())) reject;
+	if (!bypassB.is_fully_ones()) reject;
+	if (!bypassD.is_fully_ones()) reject;
+	if (!bypassPASUB.is_fully_ones()) reject;
+endcode
+
+match preAdd
+
+	// can handle add or sub
+	select preAdd->type.in($add, $sub)
+
+	// Output has to be 18 bits or less, and only has single fanout
+	select GetSize(port(preAdd, \Y)) <= 18
+	select nusers(port(preAdd, \Y)) == 2
+
+	// Adder inputs must be 18 bits or less
+	select GetSize(port(preAdd, \A)) <= 18
+	select GetSize(port(preAdd, \B)) <= 18
+
+	// Output feeds into one of multiplier input
+	filter port(preAdd, \Y) == sigB || port(preAdd, \Y) == sigA
+
+	// optional
+endmatch
+
+code
+	if (preAdd)
+	{
+		u_preAdderStatic = preAdd;
+	}
+endcode
+
+// #######################
+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input.
+subpattern in_dffe
+arg argQ clock
+
+code
+	dff = nullptr;
+	if (argQ.empty())
+		reject;
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
+		if (!c.wire)
+			reject;
+		// Abandon matches when 'Q' has the keep attribute set
+		if (c.wire->get_bool_attribute(\keep))
+			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
+	}
+endcode
+
+match ff
+	// reg D has async rst
+	// reg A, B has sync rst
+	select ff->type.in($dff, $dffe, $sdff, $sdffe, $adff, $adffe)
+	// does not support clock inversion
+	select param(ff, \CLK_POLARITY).as_bool()
+
+	// it is possible that only part of a dff output matches argQ
+	slice offset GetSize(port(ff, \D))
+	index <SigBit> port(ff, \Q)[offset] === argQ[0]
+
+	// Check that the rest of argQ is present
+	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
+	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
+
+	// only consider async rst flops when flag is set
+	filter !ff->type.in($adff, $adffe) || allowAsync
+
+	// clock must be consistent
+	filter clock == SigBit() || port(ff, \CLK) == clock
+endmatch
+
+code argQ
+	// Check that reset value, if present, is fully 0.
+	bool noResetFlop = ff->type.in($dff, $dffe);
+	bool srstZero = ff->type.in($sdff, $sdffe) && param(ff, \SRST_VALUE).is_fully_zero();
+	bool arstZero = ff->type.in($adff, $adffe) && param(ff, \ARST_VALUE).is_fully_zero();
+	bool resetLegal = noResetFlop || srstZero || arstZero;
+	if (resetLegal)
+	{
+		SigSpec Q = port(ff, \Q);
+		dff = ff;
+		dffclock = port(ff, \CLK);
+		dffD = argQ;
+		SigSpec D = port(ff, \D);
+		argQ = Q;
+		dffD.replace(argQ, D);
+	}
+
+endcode
+// #######################
+
+
+subpattern out_dffe
+arg argD argQ clock
+
+code
+	dff = nullptr;
+	for (auto c : argD.chunks())
+		// Abandon matches when 'D' has the keep attribute set
+		if (c.wire->get_bool_attribute(\keep))
+			reject;
+endcode
+
+match ff
+	select ff->type.in($dff, $dffe, $sdff, $sdffe)
+	// does not support clock inversion
+	select param(ff, \CLK_POLARITY).as_bool()
+
+	slice offset GetSize(port(ff, \D))
+	index <SigBit> port(ff, \D)[offset] === argD[0]
+
+	// Check that the rest of argD is present
+	filter GetSize(port(ff, \D)) >= offset + GetSize(argD)
+	filter port(ff, \D).extract(offset, GetSize(argD)) == argD
+
+	filter clock == SigBit() || port(ff, \CLK) == clock
+endmatch
+
+code argQ
+	SigSpec D = port(ff, \D);
+	SigSpec Q = port(ff, \Q);
+	argQ = argD;
+	argQ.replace(D, Q);
+
+	// Abandon matches when 'Q' has a non-zero init attribute set
+	for (auto c : argQ.chunks()) {
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
+	}
+
+	dff = ff;
+	dffQ = argQ;
+	dffclock = port(ff, \CLK);
+endcode
--- a/passes/pmgen/mchp_dsp_CREG.pmg
+++ b/passes/pmgen/mchp_dsp_CREG.pmg
@ -0,0 +1,169 @@
+// ISC License
+// 
+// Copyright (C) 2024 Microchip Technology Inc. and its subsidiaries
+// 
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+
+// This file describes the second of three pattern matcher setups that
+//   forms the `mchp_dsp` pass described in mchp_dsp.cc
+// At a high level, it works as follows:
+//   (1) Starting from a DSP cell that (a) doesn't have a CREG already,
+//       and (b) uses the 'C' port
+//   (2) Match the driver of the 'C' input to a possible $dff cell (CREG)
+//       (optionally with clock-enable and/or reset, i.e. one of $dff, $dffe,
+//        $sdff, $sdffe, $adff, $adffe, using a subpattern discussed below)
+// Notes:
+//   - Running CREG packing after mchp_dsp_pack is necessary since there is no
+//     guarantee that the cell ordering corresponds to the "expected" case (i.e.
+//     the order in which they appear in the source) thus the possibility existed
+//     where a register got packed as a CREG into a downstream DSP, while it should
+//     have otherwise been a PREG of an upstream DSP that had not been visited
+//     yet.
+//   - The reason this is separated out from the mchp_dsp.pmg file is
+//     for efficiency --- each *.pmg file creates a class of the same basename,
+//     which when constructed, creates a custom database tailored to the
+//     pattern(s) contained within. Since the pattern in this file must be
+//     executed after the pattern contained in mchp_dsp.pmg, it is necessary
+//     to reconstruct this database. Separating the two patterns into
+//     independent files causes two smaller, more specific, databases.
+
+pattern mchp_dsp_packC
+
+udata <std::function<SigSpec(const SigSpec&)>> unextend
+state <SigBit> clock
+state <SigSpec> sigC sigP
+state <Cell*> ffC
+
+// Variables used for subpatterns
+state <SigSpec> argQ argD
+state <int> ffoffset
+udata <SigSpec> dffD dffQ
+udata <SigBit> dffclock
+udata <Cell*> dff
+
+// (1) Starting from a DSP cell that (a) doesn't have a CREG already,
+//     and (b) uses the 'C' port
+match dsp
+	select dsp->type.in(\MACC_PA)
+    select port(dsp, \C_BYPASS, SigSpec()).is_fully_ones()
+	select nusers(port(dsp, \C, SigSpec())) > 1
+endmatch
+
+code sigC sigP clock
+	//helper function to remove unused bits
+	unextend = [](const SigSpec &sig) {
+		int i;
+		for (i = GetSize(sig)-1; i > 0; i--)
+			if (sig[i] != sig[i-1])
+				break;
+		// Do not remove non-const sign bit
+		if (sig[i].wire)
+			++i;
+		return sig.extract(0, i);
+	};
+	sigC = unextend(port(dsp, \C, SigSpec()));
+
+	SigSpec P = port(dsp, \P);
+
+    // Only care about those bits that are used
+    int i;
+    for (i = GetSize(P)-1; i >= 0; i--)
+        if (nusers(P[i]) > 1)
+            break;
+    i++;
+    log_assert(nusers(P.extract_end(i)) <= 1);
+    sigP = P.extract(0, i);
+
+	clock = port(dsp, \CLK, SigBit());
+endcode
+
+// (2) Match the driver of the 'C' input to a possible $dff cell (CREG)
+//     (optionally with clock-enable and/or reset, matched directly as a
+//      $dffe/$sdff/$sdffe/$adff/$adffe cell by the in_dffe subpattern)
+code argQ ffC sigC clock
+	argQ = sigC;
+	subpattern(in_dffe);
+	if (dff) {
+		ffC = dff;
+		clock = dffclock;
+		sigC = dffD;
+	}
+endcode
+
+code
+	if (ffC)
+		accept;
+endcode
+
+// #######################
+
+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input.
+subpattern in_dffe
+arg argQ clock
+
+code
+	dff = nullptr;
+	if (argQ.empty())
+		reject;
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
+		if (!c.wire)
+			reject;
+		// Abandon matches when 'Q' has the keep attribute set
+		if (c.wire->get_bool_attribute(\keep))
+			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (only flops whose init is zero or undefined are absorbed)
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
+	}
+endcode
+
+match ff
+	select ff->type.in($dff, $dffe, $sdff, $sdffe, $adff, $adffe)
+	// does not support clock inversion
+	select param(ff, \CLK_POLARITY).as_bool()
+
+	slice offset GetSize(port(ff, \D))
+	index <SigBit> port(ff, \Q)[offset] === argQ[0]
+
+	// Check that the rest of argQ is present
+	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
+	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
+
+	filter clock == SigBit() || port(ff, \CLK) == clock
+endmatch
+
+code argQ
+    // Check that reset value, if present, is fully 0.
+	bool noResetFlop = ff->type.in($dff, $dffe);
+	bool srstZero = ff->type.in($sdff, $sdffe) && param(ff, \SRST_VALUE).is_fully_zero();
+	bool arstZero = ff->type.in($adff, $adffe) && param(ff, \ARST_VALUE).is_fully_zero();
+	bool resetLegal = noResetFlop || srstZero || arstZero;
+	if (resetLegal)
+    {
+        SigSpec Q = port(ff, \Q);
+        dff = ff;
+        dffclock = port(ff, \CLK);
+        dffD = argQ;
+        SigSpec D = port(ff, \D);
+        argQ = Q;
+        dffD.replace(argQ, D);
+    }
+	
+endcode
--- a/passes/pmgen/mchp_dsp_cascade.pmg
+++ b/passes/pmgen/mchp_dsp_cascade.pmg
@ -0,0 +1,238 @@
+// ISC License
+// 
+// Copyright (C) 2024 Microchip Technology Inc. and its subsidiaries
+// 
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+
+// This file describes the third of three pattern matcher setups that
+//   form the `mchp_dsp` pass described in mchp_dsp.cc
+// At a high level, it works as follows:
+//   (1) Starting from a DSP cell that
+//         (a) has CDIN_FDBK_SEL set to its default "00"
+//         (b) doesn't already use the 'CDOUT' port
+//   (2) Match another DSP cell that
+//         (a) does not have the CREG enabled,
+//         (b) has its 'C' port driven by the 'P' output of the previous DSP cell
+//         (c) has its 'PCIN' port unused
+//   (3) Recursively go to (2) until no more matches are possible, keeping
+//       track of the longest possible chain found
+//   (4) The longest chain is then divided into chunks of no more than
+//       MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
+//       height of a DSP column) with each DSP in each chunk being rewritten
+//       to use CDOUT -> CDIN cascading
+
+pattern mchp_dsp_cascade
+
+// Helper that trims redundant upper bits from a signal; it is assigned in a
+// code block further down.
+udata <std::function<SigSpec(const SigSpec&)>> unextend
+// chain:         the cascade currently being explored
+// longest_chain: the longest cascade recorded so far for the current head
+// Each entry pairs a DSP cell with the bit offset (0 or 17) at which the
+// previous cell's P output feeds its C input; the head entry carries -1.
+udata <vector<std::tuple<Cell*,int>>> chain longest_chain
+// Cells that are already part of the chain being explored; used to stop the
+// recursion from revisiting a cell.
+udata <std::set<Cell*>> visited
+// The cell appended to the chain by the current recursion step (may be null)
+state <Cell*> next
+// NOTE(review): 'clock' is not referenced anywhere else in this pattern file.
+state <SigSpec> clock
+
+// Variables used for subpatterns
+// NOTE(review): none of the variables below is referenced elsewhere in this
+// pattern file (the only subpattern here, 'tail', does not use them);
+// presumably they were carried over from another pattern -- confirm before
+// removing.
+state <SigSpec> argQ argD
+state <int> ffoffset
+udata <SigSpec> dffD dffQ
+udata <SigBit> dffclock
+udata <Cell*> dff
+
+// Maximum of 24 cascaded blocks
+// The limit is consumed by a different code block further down (when the
+// longest chain is split into chunks); presumably it is a preprocessor macro
+// so that it stays visible there -- confirm before turning it into a scoped
+// constant.
+code
+#define MAX_DSP_CASCADE 24
+endcode
+
+// NOTE: Chain vector
+//  +--------+      +--------+
+//  | first  |----> |  next  | ----> ...
+//  +--------+      +--------+
+//  first.CDOUT cascades to next.CDIN, and so on along the chain
+
+// Helper function to remove unused bits
+code
+	// unextend(sig) strips the run of identical bits at the top of 'sig':
+	//   - if the repeated bit is a constant, all copies are removed;
+	//   - if it is a wire bit (a sign extension), one copy is kept.
+	// An empty signal is returned unchanged.
+	unextend = [](const SigSpec &sig) {
+		// Without this guard the loop below would leave i == -1 and the
+		// sig[i] access that follows would be out of range.
+		if (sig.empty())
+			return SigSpec();
+
+		// Walk down from the MSB while each bit equals its lower neighbour
+		int i;
+		for (i = GetSize(sig)-1; i > 0; i--)
+			if (sig[i] != sig[i-1])
+				break;
+		// Do not remove non-const sign bit
+		if (sig[i].wire)
+			++i;
+		return sig.extract(0, i);
+	};
+endcode
+
+// (1) Starting from a DSP cell that
+//     (a) CDIN_FDBK_SEL is set to default "00"
+//     (b) doesn't already use the 'CDOUT' port
+match first
+	// MACC_PA cells only; a cell without a CDIN_FDBK_SEL connection is
+	// treated as if the port were "00"
+	select first->type.in(\MACC_PA) && port(first, \CDIN_FDBK_SEL, Const(0, 2)) == Const::from_string("00")
+	// CDOUT (if connected at all) must have at most one user
+	select nusers(port(first, \CDOUT, SigSpec())) <= 1
+endmatch
+
+// (4) The longest chain is then divided into chunks of no more than
+//     MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
+//     height of a DSP column) with each DSP in each chunk being rewritten
+//     to use CDOUT -> CDIN cascading
+code
+	// Seed the search: 'first' becomes the head of the chain (its shift
+	// entry is -1 because nothing feeds it), then the 'tail' subpattern
+	// explores every chain that can be grown from it.
+	visited.clear();
+	visited.insert(first);
+
+	longest_chain.clear();
+	chain.emplace_back(first, -1);
+	subpattern(tail);
+finally
+	// The search is over: longest_chain holds the longest cascade found
+	// with 'first' as its head. Drop the head from the working chain and
+	// rewrite the cells of longest_chain.
+	chain.pop_back();
+	visited.clear();
+	log_assert(chain.empty());
+
+	// A chain of one cell has nothing to cascade
+	if (GetSize(longest_chain) > 1) {
+		// 'dsp' is the driving cell of the link currently being processed
+		Cell *dsp = std::get<0>(longest_chain.front());
+
+		for (int i = 1; i < GetSize(longest_chain); i++) {
+			log_assert(dsp->type.in(\MACC_PA));
+
+			// The receiving cell of this link and the P-to-C bit offset
+			// recorded for it while the chain was built
+			Cell *dsp_cdin = nullptr;
+			int shift = -1;
+			std::tie(dsp_cdin, shift) = longest_chain[i];
+
+			// Every link is turned into a cascade except the ones at a
+			// multiple of MAX_DSP_CASCADE; leaving those untouched is what
+			// splits an over-long chain into chunks.
+			if (i % MAX_DSP_CASCADE > 0) {
+				Wire *cascade = module->addWire(NEW_ID, 48);
+
+				// zero port C and move wire to cascade
+				dsp_cdin->setPort(ID(C), Const(0, 48));
+				dsp_cdin->setPort(ID(CDIN), cascade);
+				dsp->setPort(ID(CDOUT), cascade);
+
+				// Register both cells as users of the new cascade wire
+				add_siguser(cascade, dsp_cdin);
+				add_siguser(cascade, dsp);
+
+				// configure mux to use cascade for signal E
+				SigSpec cdin_fdbk_sel = port(dsp_cdin, \CDIN_FDBK_SEL, Const(0, 2));
+				cdin_fdbk_sel[1] = State::S1;
+				dsp_cdin->setPort(\CDIN_FDBK_SEL, cdin_fdbk_sel);
+
+				// check if shifting is required for wide multiplier implementation
+				if (shift == 17)
+					dsp_cdin->setPort(\ARSHFT17, State::S1);
+
+				log_debug("CDOUT -> CDIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_cdin));
+			} else {
+				// Chain length exceeds the maximum cascade length here, so
+				// this link stays as it was
+				log_debug("  Blocking %s -> %s cascade (exceeds max: %d)\n", log_id(dsp), log_id(dsp_cdin), MAX_DSP_CASCADE);
+			}
+
+			dsp = dsp_cdin;
+		}
+
+		accept;
+	}
+endcode
+
+// ------------------------------------------------------------------
+
+subpattern tail
+arg first
+arg next
+
+// (2) Match another DSP cell that
+//          (a) does not have the CREG enabled,
+//          (b) 'C' port is driven by the 'P' output of the previous DSP cell
+//          (c) has its cascade input unused
+// The match is optional, so the blocks that follow also run with
+// nextP == nullptr.
+match nextP
+	// find candidates where nextP.C port is driven (maybe partially) by chain's tail DSP.P port
+	//      and with no registers in between (since cascade path cannot be pipelined)
+
+	// must be same DSP type (checked first so that the port lookups below
+	// are only evaluated for MACC_PA cells)
+	select nextP->type.in(\MACC_PA)
+
+	// reg C must not be used
+	select port(nextP, \C_BYPASS, SigSpec()).is_fully_ones()
+
+	// port C should be driven by something
+	select nusers(port(nextP, \C, SigSpec())) > 1
+
+	// CIN must be unused
+	// NOTE(review): the rewrite step connects the cascade through the port
+	// named CDIN, while this check looks at a port named PCIN; if MACC_PA
+	// has no PCIN port the check always passes -- confirm which port is
+	// intended.
+	select nusers(port(nextP, \PCIN, SigSpec())) == 0
+
+	// should not have internal feedback connection
+	select port(nextP, \CDIN_FDBK_SEL, SigSpec()).is_fully_zero()
+
+	// SHIFT should be unused
+	// Defaults are supplied so that a cell lacking one of these connections
+	// no longer aborts the pass on a missing-port lookup: a missing
+	// ARSHFT17_BYPASS is conservatively treated as "not bypassed" (the cell
+	// is skipped), a missing ARSHFT17 as "shift not requested".
+	select port(nextP, \ARSHFT17_BYPASS, Const(0, 1)).is_fully_ones()
+	select port(nextP, \ARSHFT17, SigSpec()).is_fully_zero()
+	select nusers(port(nextP, \ARSHFT17, SigSpec())) == 0
+
+	// current DSP cell can be cascaded with the back of the cascade chain:
+	// bit 0 of its C port must be bit 0 or bit 17 of the tail's P port
+	// index <SigBit> port(nextP, \C)[0] === port(std::get<0>(chain.back()), \P)[0] || port(nextP, \C)[0] === port(std::get<0>(chain.back()), \P)[17]
+	filter port(nextP, \C)[0] == port(std::get<0>(chain.back()), \P)[0] || port(nextP, \C)[0] == port(std::get<0>(chain.back()), \P)[17]
+
+	// semioptional
+
+	optional
+endmatch
+
+code next
+	// Take over the cell picked by the preceding match (null when the
+	// optional match selected nothing).
+	next = nextP;
+
+	// keep DSP type consistent in the chain
+	// (with a single DSP type this condition never triggers at present)
+	if (next != nullptr) {
+		if (next->type != first->type)
+			reject;
+	}
+
+	// a cell that is already on the chain must not be visited again,
+	// otherwise a combinational loop would recurse forever
+	if (visited.find(next) != visited.end())
+		reject;
+endcode
+
+// (3) Recursively go to (2) until no more matches possible, recording the
+//     longest possible chain
+code
+	if (next == nullptr) {
+		// End of this branch of the search: remember the chain if it is
+		// longer than the best one seen so far.
+		if (GetSize(longest_chain) < GetSize(chain))
+			longest_chain = chain;
+	} else {
+		// P output of the cell currently at the end of the chain
+		const SigSpec tail_P = port(std::get<0>(chain.back()), \P);
+
+		// C[0] of the new cell is fed either from P[17] (shift of 17)
+		// or from P[0] (no shift)
+		const int shift = (port(next, \C)[0] == tail_P[17]) ? 17 : 0;
+
+		chain.emplace_back(next, shift);
+		visited.insert(next);
+
+		// Make sure driverDSP.P === DSP.C over every significant bit of C
+		// before growing the chain further
+		const SigSpec sigC = unextend(port(next, \C));
+		const bool fits = GetSize(sigC) + shift <= GetSize(tail_P);
+		if (fits && tail_P.extract(shift, GetSize(sigC)) == sigC)
+			subpattern(tail);
+	}
+finally
+	// Undo the bookkeeping done above when backtracking
+	if (next != nullptr) {
+		chain.pop_back();
+		visited.erase(next);
+	}
+endcode