diff --git a/techlibs/xilinx/cells_map.v b/techlibs/xilinx/cells_map.v
index c1b911b0b..253f13b7b 100644
--- a/techlibs/xilinx/cells_map.v
+++ b/techlibs/xilinx/cells_map.v
@@ -195,20 +195,18 @@ module \$__XILINX_SHIFTX (A, B, Y);
     else if (A_WIDTH < `MIN_MUX_INPUTS) begin
       wire _TECHMAP_FAIL_ = 1;
     end
-    else if (A_WIDTH == 2) begin
-      MUXF7 fpga_hard_mux (.I0(A[0]), .I1(A[1]), .S(B[0]), .O(Y));
+    else if (A_WIDTH == 1) begin
+      assign Y = A[0];
     end
+    // Use one LUT3 instead of a MUXF7 because a MUXF7 gets its inputs from 2 LUT6.O6 anyway
+    else if (A_WIDTH == 2) begin
+      assign Y = B[0] ? A[1] : A[0];
+    end
+    // Use one LUT6 instead of 2 LUT3 + MUXF7 because a MUXF7 gets its inputs from 2 LUT6.O6 anyway
     else if (A_WIDTH <= 4) begin
-      wire [4-1:0] Ax;
-      if (A_WIDTH == 4)
-        assign Ax = A;
-      else
-        // Rather than extend with 1'bx which gets flattened to 1'b0
-        // causing the "don't care" status to get lost, extend with
-        // the same driver of F7B.I0 so that we can optimise F7B away
-        // later
-        assign Ax = {A[1], A};
-      \$__XILINX_MUXF78 fpga_hard_mux (.I0(Ax[0]), .I1(Ax[2]), .I2(Ax[1]), .I3(Ax[3]), .S0(B[1]), .S1(B[0]), .O(Y));
+      wire [4-1:0] Ax = {{(4-A_WIDTH){1'bx}}, A};
+      assign Y = B[1] ? (B[0] ? Ax[3] : Ax[2])
+                      : (B[0] ? Ax[1] : Ax[0]);
     end
     // Note that the following decompositions are 'backwards' in that
     // the LSBs are placed on the hard resources, and the soft resources
@@ -232,15 +230,46 @@ module \$__XILINX_SHIFTX (A, B, Y);
     // but that the 'backwards' mapping (left) is more delay efficient
     // since smaller LUTs are faster than wider ones.
     else if (A_WIDTH <= 8) begin
-      wire [8-1:0] Ax = {{{8-A_WIDTH}{1'bx}}, A};
-      wire T0 = B[2] ? Ax[4] : Ax[0];
-      wire T1 = B[2] ? Ax[5] : Ax[1];
-      wire T2 = B[2] ? Ax[6] : Ax[2];
-      wire T3 = B[2] ? Ax[7] : Ax[3];
-      \$__XILINX_MUXF78 fpga_hard_mux (.I0(T0), .I1(T2), .I2(T1), .I3(T3), .S0(B[1]), .S1(B[0]), .O(Y));
+      wire [8-1:0] Ax = {{(8-A_WIDTH){1'bx}}, A};
+      // For 5-8 inputs, there are 2 possible implementations:
+      // - Using 4 LUT3 + 2 MUXF7 + MUXF8
+      //   The MUXF7 inputs come from LUT6.O6 outputs so at best this would mean I5=1, inputs I4-I3-I2 for the MUX, leaving only I1-I0 available for other logic
+      //   This means only 4 other LUT2 operations can be mapped to the slice (or larger ones assuming input sharing)
+      // - Using 2 LUT6 + MUXF7
+      //   This leaves 2 LUT6 + 2 inputs Cx and Dx available for other logic
+      // The solution used here is 2 LUT6 + MUXF7 for the following reasons :
+      // - Area report is much closer to what a user would expect
+      // - The rest of the slice is probably easier to use by place and route tools
+      // - Delay should be rather similar
+      wire T0 = B[1] ? (B[2] ? Ax[6] : Ax[2])
+                     : (B[2] ? Ax[4] : Ax[0]);
+      wire T1 = B[1] ? (B[2] ? Ax[7] : Ax[3])
+                     : (B[2] ? Ax[5] : Ax[1]);
+      MUXF7 fpga_hard_mux (.I0(T0), .I1(T1), .S(B[0]), .O(Y));
+    end
+    else if (A_WIDTH <= 12) begin
+      wire [12-1:0] Ax = {{(12-A_WIDTH){1'bx}}, A};
+      // For 9-12 inputs, only 3 LUT6 are needed
+      // Note that an explicit user objective of optimization for delay might make the mux16 below preferrable
+      // (not a binary decision though : the overall design would be larger, which is not good for wire delay)
+      wire T0 = B[1] ? (B[0] ? Ax[ 3] : Ax[ 2])
+                     : (B[0] ? Ax[ 1] : Ax[ 0]);
+      wire T1 = B[1] ? (B[0] ? Ax[ 7] : Ax[ 6])
+                     : (B[0] ? Ax[ 5] : Ax[ 4]);
+      wire T2 = B[1] ? (B[0] ? Ax[11] : Ax[10])
+                     : (B[0] ? Ax[ 9] : Ax[ 8]);
+      // Set parameters _TECHMAP_* to indicate that I2===I3 so that the upper MUXF7 is bypassed (assuming this is well handled by pnr tools)
+      \$__XILINX_MUXF78 #(
+        ._TECHMAP_BITS_CONNMAP_(2),
+        ._TECHMAP_CONNMAP_I0_(2'd0),
+        ._TECHMAP_CONNMAP_I1_(2'd1),
+        ._TECHMAP_CONNMAP_I2_(2'd2),
+        ._TECHMAP_CONNMAP_I3_(2'd2)
+      ) fpga_hard_mux (.I0(T0), .I1(T1), .I2(T2), .I3(T2), .S0(B[2]), .S1(B[3]), .O(Y));
     end
     else if (A_WIDTH <= 16) begin
-      wire [16-1:0] Ax = {{{16-A_WIDTH}{1'bx}}, A};
+      // For 13-16 inputs, use the full slice with 'backwards' decomposition described above
+      wire [16-1:0] Ax = {{(16-A_WIDTH){1'bx}}, A};
       wire T0 = B[2] ? B[3] ? Ax[12] : Ax[4]
                      : B[3] ? Ax[ 8] : Ax[0];
       wire T1 = B[2] ? B[3] ? Ax[13] : Ax[5]
@@ -252,32 +281,36 @@ module \$__XILINX_SHIFTX (A, B, Y);
       \$__XILINX_MUXF78 fpga_hard_mux (.I0(T0), .I1(T2), .I2(T1), .I3(T3), .S0(B[1]), .S1(B[0]), .O(Y));
     end
     else begin
+      // For more than 16 inputs, recursively split into sub-multiplexers of size at most 16
       localparam num_mux16 = (A_WIDTH+15) / 16;
       localparam clog2_num_mux16 = $clog2(num_mux16);
       wire [num_mux16-1:0] T;
-      wire [num_mux16*16-1:0] Ax = {{(num_mux16*16-A_WIDTH){1'bx}}, A};
-      for (i = 0; i < num_mux16; i++)
+      for (i = 0; i < num_mux16; i++) begin
+        localparam local_num_in = (A_WIDTH-i*16 < 16) ? A_WIDTH-i*16 : 16;
+        localparam clog2_num_in = $clog2(local_num_in);
         \$__XILINX_SHIFTX  #(
           .A_SIGNED(A_SIGNED),
           .B_SIGNED(B_SIGNED),
-          .A_WIDTH(16),
-          .B_WIDTH(4),
+          .A_WIDTH(local_num_in),
+          .B_WIDTH(clog2_num_in),
           .Y_WIDTH(Y_WIDTH)
         ) fpga_mux (
-          .A(Ax[i*16+:16]),
-          .B(B[3:0]),
+          .A(A[i*16+:local_num_in]),
+          .B(B[clog2_num_in-1:0]),
           .Y(T[i])
         );
+      end
       \$__XILINX_SHIFTX  #(
-          .A_SIGNED(A_SIGNED),
-          .B_SIGNED(B_SIGNED),
-          .A_WIDTH(num_mux16),
-          .B_WIDTH(clog2_num_mux16),
-          .Y_WIDTH(Y_WIDTH)
+        .A_SIGNED(A_SIGNED),
+        .B_SIGNED(B_SIGNED),
+        .A_WIDTH(num_mux16),
+        .B_WIDTH(clog2_num_mux16),
+        .Y_WIDTH(Y_WIDTH)
       ) _TECHMAP_REPLACE_ (
-          .A(T),
-          .B(B[B_WIDTH-1-:clog2_num_mux16]),
-          .Y(Y));
+        .A(T),
+        .B(B[4+:clog2_num_mux16]),
+        .Y(Y)
+      );
     end
   endgenerate
 endmodule
diff --git a/tests/arch/common/mux.v b/tests/arch/common/mux.v
index 71c1ac7f2..fc9789fee 100644
--- a/tests/arch/common/mux.v
+++ b/tests/arch/common/mux.v
@@ -51,10 +51,32 @@ module mux8 ( S, D, Y );
     end
 endmodule
 
+module mux12 (D, S, Y);
+    input  [11:0] D;
+    input  [3:0] S;
+    output Y;
+
+    wire[15:0] D16;
+
+    assign D16 = {4'bx, D};
+    assign Y = D16[S];
+endmodule
+
 module mux16 (D, S, Y);
- 	input  [15:0] D;
- 	input  [3:0] S;
- 	output Y;
+    input  [15:0] D;
+    input  [3:0] S;
+    output Y;
 
     assign Y = D[S];
 endmodule
+
+module mux20 (D, S, Y);
+    input  [19:0] D;
+    input  [4:0] S;
+    output Y;
+
+    wire[31:0] D32;
+
+    assign D32 = {12'bx, D};
+    assign Y = D32[S];
+endmodule
diff --git a/tests/arch/xilinx/mux.ys b/tests/arch/xilinx/mux.ys
index c2a23de6d..c40cf3a43 100644
--- a/tests/arch/xilinx/mux.ys
+++ b/tests/arch/xilinx/mux.ys
@@ -1,6 +1,10 @@
+
 read_verilog ../common/mux.v
 design -save read
 
+
+# mux2
+
 hierarchy -top mux2
 proc
 equiv_opt -assert -map +/xilinx/cells_sim.v synth_xilinx -noiopad # equivalency check
@@ -8,9 +12,12 @@ design -load postopt # load the post-opt design (otherwise equiv_opt loads the p
 cd mux2 # Constrain all select calls below inside the top module
 select -assert-count 1 t:LUT3
 
+# Ensure there are no other cells
 select -assert-none t:LUT3 %% t:* %D
 
 
+# mux4
+
 design -load read
 hierarchy -top mux4
 proc
@@ -19,9 +26,12 @@ design -load postopt # load the post-opt design (otherwise equiv_opt loads the p
 cd mux4 # Constrain all select calls below inside the top module
 select -assert-count 1 t:LUT6
 
+# Ensure there are no other cells
 select -assert-none t:LUT6 %% t:* %D
 
 
+# mux8 without widemux
+
 design -load read
 hierarchy -top mux8
 proc
@@ -31,9 +41,43 @@ cd mux8 # Constrain all select calls below inside the top module
 select -assert-count 1 t:LUT3
 select -assert-count 2 t:LUT6
 
+# Ensure there are no other cells
 select -assert-none t:LUT3 t:LUT6 %% t:* %D
 
 
+# mux8 with widemux 5
+
+design -load read
+hierarchy -top mux8
+proc
+equiv_opt -assert -map +/xilinx/cells_sim.v synth_xilinx -noiopad -widemux 5 # equivalency check
+design -load postopt # load the post-opt design (otherwise equiv_opt loads the pre-opt design)
+cd mux8 # Constrain all select calls below inside the top module
+select -assert-count 2 t:LUT6
+select -assert-count 1 t:MUXF7
+
+# Ensure there are no other cells
+select -assert-none t:LUT6 t:MUXF7 %% t:* %D
+
+
+# mux12 with widemux 5
+# There is no equivalence check because selection values 12 to 15 are unspecified
+
+design -load read
+hierarchy -top mux12
+proc
+synth_xilinx -noiopad -widemux 5
+cd mux12 # Constrain all select calls below inside the top module
+select -assert-count 3 t:LUT6
+select -assert-max   2 t:MUXF7
+select -assert-count 1 t:MUXF8
+
+# Ensure there are no other cells
+select -assert-none t:LUT6 t:MUXF7 t:MUXF8 %% t:* %D
+
+
+# mux16 without widemux
+
 design -load read
 hierarchy -top mux16
 proc
@@ -47,4 +91,45 @@ select -assert-max 7 t:LUT6
 select -assert-max 2 t:MUXF7
 dump
 
+# Ensure there are no other cells
 select -assert-none t:LUT6 t:LUT4 t:LUT3 t:MUXF7 %% t:* %D
+
+
+# mux16 with widemux 5
+
+design -load read
+hierarchy -top mux16
+proc
+equiv_opt -assert -map +/xilinx/cells_sim.v synth_xilinx -noiopad -widemux 5 # equivalency check
+design -load postopt # load the post-opt design (otherwise equiv_opt loads the pre-opt design)
+cd mux16 # Constrain all select calls below inside the top module
+select -assert-count 4 t:LUT6
+select -assert-count 2 t:MUXF7
+select -assert-count 1 t:MUXF8
+dump
+
+# Ensure there are no other cells
+select -assert-none t:LUT6 t:MUXF7 t:MUXF8 %% t:* %D
+
+
+# mux20 with widemux 5
+# Expect one mux16 (4 lut6 + 2 muxf7 + muxf8) + one mux4 (one lut6), then one mux2 (one lut3)
+# These mapping results are achieved only with abc9 (without abc, we get undesired additional muxf7/muxf8)
+# There is no equivalence check because selection values 20 to 31 are unspecified
+
+design -load read
+hierarchy -top mux20
+proc
+scratchpad -set abc9.D 5000 # Set a period high enough so we get area-optimized result
+synth_xilinx -noiopad -widemux 5 -abc9
+cd mux20 # Constrain all select calls below inside the top module
+select -assert-count 1 t:LUT3
+select -assert-count 5 t:LUT6
+select -assert-count 2 t:MUXF7
+select -assert-count 1 t:MUXF8
+dump
+
+# Ensure there are no other cells
+select -assert-none t:LUT3 t:LUT6 t:MUXF7 t:MUXF8 %% t:* %D
+
+