diff --git a/techlibs/xilinx/cells_map.v b/techlibs/xilinx/cells_map.v index c1b911b0b..253f13b7b 100644 --- a/techlibs/xilinx/cells_map.v +++ b/techlibs/xilinx/cells_map.v @@ -195,20 +195,18 @@ module \$__XILINX_SHIFTX (A, B, Y); else if (A_WIDTH < `MIN_MUX_INPUTS) begin wire _TECHMAP_FAIL_ = 1; end - else if (A_WIDTH == 2) begin - MUXF7 fpga_hard_mux (.I0(A[0]), .I1(A[1]), .S(B[0]), .O(Y)); + else if (A_WIDTH == 1) begin + assign Y = A[0]; end + // Use one LUT3 instead of a MUXF7 because a MUXF7 gets its inputs from 2 LUT6.O6 anyway + else if (A_WIDTH == 2) begin + assign Y = B[0] ? A[1] : A[0]; + end + // Use one LUT6 instead of 2 LUT3 + MUXF7 because a MUXF7 gets its inputs from 2 LUT6.O6 anyway else if (A_WIDTH <= 4) begin - wire [4-1:0] Ax; - if (A_WIDTH == 4) - assign Ax = A; - else - // Rather than extend with 1'bx which gets flattened to 1'b0 - // causing the "don't care" status to get lost, extend with - // the same driver of F7B.I0 so that we can optimise F7B away - // later - assign Ax = {A[1], A}; - \$__XILINX_MUXF78 fpga_hard_mux (.I0(Ax[0]), .I1(Ax[2]), .I2(Ax[1]), .I3(Ax[3]), .S0(B[1]), .S1(B[0]), .O(Y)); + wire [4-1:0] Ax = {{(4-A_WIDTH){1'bx}}, A}; + assign Y = B[1] ? (B[0] ? Ax[3] : Ax[2]) + : (B[0] ? Ax[1] : Ax[0]); end // Note that the following decompositions are 'backwards' in that // the LSBs are placed on the hard resources, and the soft resources @@ -232,15 +230,46 @@ module \$__XILINX_SHIFTX (A, B, Y); // but that the 'backwards' mapping (left) is more delay efficient // since smaller LUTs are faster than wider ones. else if (A_WIDTH <= 8) begin - wire [8-1:0] Ax = {{{8-A_WIDTH}{1'bx}}, A}; - wire T0 = B[2] ? Ax[4] : Ax[0]; - wire T1 = B[2] ? Ax[5] : Ax[1]; - wire T2 = B[2] ? Ax[6] : Ax[2]; - wire T3 = B[2] ? 
Ax[7] : Ax[3]; - \$__XILINX_MUXF78 fpga_hard_mux (.I0(T0), .I1(T2), .I2(T1), .I3(T3), .S0(B[1]), .S1(B[0]), .O(Y)); + wire [8-1:0] Ax = {{(8-A_WIDTH){1'bx}}, A}; + // For 5-8 inputs, there are 2 possible implementations: + // - Using 4 LUT3 + 2 MUXF7 + MUXF8 + // The MUXF7 inputs come from LUT6.O6 outputs so at best this would mean I5=1, inputs I4-I3-I2 for the MUX, leaving only I1-I0 available for other logic + // This means only 4 other LUT2 operations can be mapped to the slice (or larger ones assuming input sharing) + // - Using 2 LUT6 + MUXF7 + // This leaves 2 LUT6 + 2 inputs Cx and Dx available for other logic + // The solution used here is 2 LUT6 + MUXF7 for the following reasons: + // - Area report is much closer to what a user would expect + // - The rest of the slice is probably easier to use by place and route tools + // - Delay should be rather similar + wire T0 = B[1] ? (B[2] ? Ax[6] : Ax[2]) + : (B[2] ? Ax[4] : Ax[0]); + wire T1 = B[1] ? (B[2] ? Ax[7] : Ax[3]) + : (B[2] ? Ax[5] : Ax[1]); + MUXF7 fpga_hard_mux (.I0(T0), .I1(T1), .S(B[0]), .O(Y)); + end + else if (A_WIDTH <= 12) begin + wire [12-1:0] Ax = {{(12-A_WIDTH){1'bx}}, A}; + // For 9-12 inputs, only 3 LUT6 are needed + // Note that an explicit user objective of optimization for delay might make the mux16 below preferable + // (not a binary decision though: the overall design would be larger, which is not good for wire delay) + wire T0 = B[1] ? (B[0] ? Ax[ 3] : Ax[ 2]) + : (B[0] ? Ax[ 1] : Ax[ 0]); + wire T1 = B[1] ? (B[0] ? Ax[ 7] : Ax[ 6]) + : (B[0] ? Ax[ 5] : Ax[ 4]); + wire T2 = B[1] ? (B[0] ? Ax[11] : Ax[10]) + : (B[0] ? 
Ax[ 9] : Ax[ 8]); + // Set parameters _TECHMAP_* to indicate that I2===I3 so that the upper MUXF7 is bypassed (assuming this is well handled by pnr tools) + \$__XILINX_MUXF78 #( + ._TECHMAP_BITS_CONNMAP_(2), + ._TECHMAP_CONNMAP_I0_(2'd0), + ._TECHMAP_CONNMAP_I1_(2'd1), + ._TECHMAP_CONNMAP_I2_(2'd2), + ._TECHMAP_CONNMAP_I3_(2'd2) + ) fpga_hard_mux (.I0(T0), .I1(T1), .I2(T2), .I3(T2), .S0(B[2]), .S1(B[3]), .O(Y)); end else if (A_WIDTH <= 16) begin - wire [16-1:0] Ax = {{{16-A_WIDTH}{1'bx}}, A}; + // For 13-16 inputs, use the full slice with 'backwards' decomposition described above + wire [16-1:0] Ax = {{(16-A_WIDTH){1'bx}}, A}; wire T0 = B[2] ? B[3] ? Ax[12] : Ax[4] : B[3] ? Ax[ 8] : Ax[0]; wire T1 = B[2] ? B[3] ? Ax[13] : Ax[5] @@ -252,32 +281,36 @@ module \$__XILINX_SHIFTX (A, B, Y); \$__XILINX_MUXF78 fpga_hard_mux (.I0(T0), .I1(T2), .I2(T1), .I3(T3), .S0(B[1]), .S1(B[0]), .O(Y)); end else begin + // For more than 16 inputs, recursively split into sub-multiplexers of size at most 16 localparam num_mux16 = (A_WIDTH+15) / 16; localparam clog2_num_mux16 = $clog2(num_mux16); wire [num_mux16-1:0] T; - wire [num_mux16*16-1:0] Ax = {{(num_mux16*16-A_WIDTH){1'bx}}, A}; - for (i = 0; i < num_mux16; i++) + for (i = 0; i < num_mux16; i++) begin + localparam local_num_in = (A_WIDTH-i*16 < 16) ? 
A_WIDTH-i*16 : 16; + localparam clog2_num_in = $clog2(local_num_in); \$__XILINX_SHIFTX #( .A_SIGNED(A_SIGNED), .B_SIGNED(B_SIGNED), - .A_WIDTH(16), - .B_WIDTH(4), + .A_WIDTH(local_num_in), + .B_WIDTH(clog2_num_in), .Y_WIDTH(Y_WIDTH) ) fpga_mux ( - .A(Ax[i*16+:16]), - .B(B[3:0]), + .A(A[i*16+:local_num_in]), + .B(B[clog2_num_in-1:0]), .Y(T[i]) ); + end \$__XILINX_SHIFTX #( - .A_SIGNED(A_SIGNED), - .B_SIGNED(B_SIGNED), - .A_WIDTH(num_mux16), - .B_WIDTH(clog2_num_mux16), - .Y_WIDTH(Y_WIDTH) + .A_SIGNED(A_SIGNED), + .B_SIGNED(B_SIGNED), + .A_WIDTH(num_mux16), + .B_WIDTH(clog2_num_mux16), + .Y_WIDTH(Y_WIDTH) ) _TECHMAP_REPLACE_ ( - .A(T), - .B(B[B_WIDTH-1-:clog2_num_mux16]), - .Y(Y)); + .A(T), + .B(B[4+:clog2_num_mux16]), + .Y(Y) + ); end endgenerate endmodule diff --git a/tests/arch/common/mux.v b/tests/arch/common/mux.v index 71c1ac7f2..fc9789fee 100644 --- a/tests/arch/common/mux.v +++ b/tests/arch/common/mux.v @@ -51,10 +51,32 @@ module mux8 ( S, D, Y ); end endmodule +module mux12 (D, S, Y); + input [11:0] D; + input [3:0] S; + output Y; + + wire[15:0] D16; + + assign D16 = {4'bx, D}; + assign Y = D16[S]; +endmodule + module mux16 (D, S, Y); - input [15:0] D; - input [3:0] S; - output Y; + input [15:0] D; + input [3:0] S; + output Y; assign Y = D[S]; endmodule + +module mux20 (D, S, Y); + input [19:0] D; + input [4:0] S; + output Y; + + wire[31:0] D32; + + assign D32 = {12'bx, D}; + assign Y = D32[S]; +endmodule diff --git a/tests/arch/xilinx/mux.ys b/tests/arch/xilinx/mux.ys index c2a23de6d..c40cf3a43 100644 --- a/tests/arch/xilinx/mux.ys +++ b/tests/arch/xilinx/mux.ys @@ -1,6 +1,10 @@ + read_verilog ../common/mux.v design -save read + +# mux2 + hierarchy -top mux2 proc equiv_opt -assert -map +/xilinx/cells_sim.v synth_xilinx -noiopad # equivalency check @@ -8,9 +12,12 @@ design -load postopt # load the post-opt design (otherwise equiv_opt loads the p cd mux2 # Constrain all select calls below inside the top module select -assert-count 1 t:LUT3 +# Ensure there are 
no other cells select -assert-none t:LUT3 %% t:* %D +# mux4 + design -load read hierarchy -top mux4 proc @@ -19,9 +26,12 @@ design -load postopt # load the post-opt design (otherwise equiv_opt loads the p cd mux4 # Constrain all select calls below inside the top module select -assert-count 1 t:LUT6 +# Ensure there are no other cells select -assert-none t:LUT6 %% t:* %D +# mux8 without widemux + design -load read hierarchy -top mux8 proc @@ -31,9 +41,43 @@ cd mux8 # Constrain all select calls below inside the top module select -assert-count 1 t:LUT3 select -assert-count 2 t:LUT6 +# Ensure there are no other cells select -assert-none t:LUT3 t:LUT6 %% t:* %D +# mux8 with widemux 5 + +design -load read +hierarchy -top mux8 +proc +equiv_opt -assert -map +/xilinx/cells_sim.v synth_xilinx -noiopad -widemux 5 # equivalency check +design -load postopt # load the post-opt design (otherwise equiv_opt loads the pre-opt design) +cd mux8 # Constrain all select calls below inside the top module +select -assert-count 2 t:LUT6 +select -assert-count 1 t:MUXF7 + +# Ensure there are no other cells +select -assert-none t:LUT6 t:MUXF7 %% t:* %D + + +# mux12 with widemux 5 +# There is no equivalence check because selection values 12 to 15 are unspecified + +design -load read +hierarchy -top mux12 +proc +synth_xilinx -noiopad -widemux 5 +cd mux12 # Constrain all select calls below inside the top module +select -assert-count 3 t:LUT6 +select -assert-max 2 t:MUXF7 +select -assert-count 1 t:MUXF8 + +# Ensure there are no other cells +select -assert-none t:LUT6 t:MUXF7 t:MUXF8 %% t:* %D + + +# mux16 without widemux + design -load read hierarchy -top mux16 proc @@ -47,4 +91,45 @@ select -assert-max 7 t:LUT6 select -assert-max 2 t:MUXF7 dump +# Ensure there are no other cells select -assert-none t:LUT6 t:LUT4 t:LUT3 t:MUXF7 %% t:* %D + + +# mux16 with widemux 5 + +design -load read +hierarchy -top mux16 +proc +equiv_opt -assert -map +/xilinx/cells_sim.v synth_xilinx -noiopad -widemux 5 # 
equivalency check +design -load postopt # load the post-opt design (otherwise equiv_opt loads the pre-opt design) +cd mux16 # Constrain all select calls below inside the top module +select -assert-count 4 t:LUT6 +select -assert-count 2 t:MUXF7 +select -assert-count 1 t:MUXF8 +dump + +# Ensure there are no other cells +select -assert-none t:LUT6 t:MUXF7 t:MUXF8 %% t:* %D + + +# mux20 with widemux 5 +# Expect one mux16 (4 lut6 + 2 muxf7 + muxf8) + one mux4 (one lut6), then one mux2 (one lut3) +# These mapping results are achieved only with abc9 (without abc, we get undesired additional muxf7/muxf8) +# There is no equivalence check because selection values 20 to 31 are unspecified + +design -load read +hierarchy -top mux20 +proc +scratchpad -set abc9.D 5000 # Set a period high enough so we get area-optimized result +synth_xilinx -noiopad -widemux 5 -abc9 +cd mux20 # Constrain all select calls below inside the top module +select -assert-count 1 t:LUT3 +select -assert-count 5 t:LUT6 +select -assert-count 2 t:MUXF7 +select -assert-count 1 t:MUXF8 +dump + +# Ensure there are no other cells +select -assert-none t:LUT3 t:LUT6 t:MUXF7 t:MUXF8 %% t:* %D + +