From 6d04b87768a7582fb8a96ffb77cb1d669982fecc Mon Sep 17 00:00:00 2001
From: Jacob Lifshay <programmerjake@gmail.com>
Date: Wed, 5 Mar 2025 23:50:38 -0800
Subject: [PATCH] WIP: splitting reg_alloc

---
 crates/cpu/Cargo.toml                         |   3 +
 crates/cpu/src/config.rs                      |  40 ++-
 crates/cpu/src/instruction.rs                 |  46 +--
 crates/cpu/src/instruction_rename.rs          | 266 ++++++++++++++++++
 crates/cpu/src/lib.rs                         |   2 +
 crates/cpu/src/reg_alloc.rs                   | 158 +----------
 .../src/reg_alloc/unit_free_regs_tracker.rs   |   9 +-
 crates/cpu/src/rename_table.rs                | 187 ++++++++++++
 crates/cpu/src/unit.rs                        | 102 ++++++-
 crates/cpu/src/unit/alu_branch.rs             |  34 +--
 crates/cpu/src/unit/unit_base.rs              | 152 ++++------
 crates/cpu/src/util.rs                        |  14 +-
 crates/cpu/src/util/array_vec.rs              |  62 +++-
 crates/cpu/src/util/tree_reduce.rs            | 152 ----------
 14 files changed, 750 insertions(+), 477 deletions(-)
 create mode 100644 crates/cpu/src/instruction_rename.rs
 create mode 100644 crates/cpu/src/rename_table.rs
 delete mode 100644 crates/cpu/src/util/tree_reduce.rs

diff --git a/crates/cpu/Cargo.toml b/crates/cpu/Cargo.toml
index 16ec0b9..783981d 100644
--- a/crates/cpu/Cargo.toml
+++ b/crates/cpu/Cargo.toml
@@ -16,3 +16,6 @@ version.workspace = true
 
 [dependencies]
 fayalite.workspace = true
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(todo)'] }
diff --git a/crates/cpu/src/config.rs b/crates/cpu/src/config.rs
index 5be163c..d252ca1 100644
--- a/crates/cpu/src/config.rs
+++ b/crates/cpu/src/config.rs
@@ -3,8 +3,9 @@
 use crate::{
     instruction::{MOpTrait, PRegNum, RenamedMOp, UnitNum, UnitOutRegNum, CONST_ZERO_UNIT_NUM},
     unit::{
-        unit_base::{UnitForwardingInfo, UnitToRegAlloc},
-        UnitCancelInput, UnitKind, UnitOutputWrite,
+        unit_base::{ExecuteEnd, ExecuteStart},
+        RenamedInsnData, RetireQueueIndex, UnitForwardingInfo, UnitKind, UnitOutputWrite,
+        UnitToRegAlloc,
     },
 };
 use fayalite::prelude::*;
@@ -35,7 +36,6 @@ pub struct CpuConfig {
     pub fetch_width: NonZeroUsize,
     /// default value for [`UnitConfig::max_in_flight`]
     pub default_unit_max_in_flight: NonZeroUsize,
-    pub rob_size: NonZeroUsize,
 }
 
 impl CpuConfig {
@@ -52,13 +52,12 @@ impl CpuConfig {
         };
         v
     };
-    pub fn new(units: Vec<UnitConfig>, rob_size: NonZeroUsize) -> Self {
+    pub fn new(units: Vec<UnitConfig>) -> Self {
         Self {
             units,
             out_reg_num_width: Self::DEFAULT_OUT_REG_NUM_WIDTH,
             fetch_width: Self::DEFAULT_FETCH_WIDTH,
             default_unit_max_in_flight: Self::DEFAULT_UNIT_MAX_IN_FLIGHT,
-            rob_size,
         }
     }
     pub fn non_const_unit_nums(&self) -> std::ops::Range<usize> {
@@ -82,15 +81,15 @@ impl CpuConfig {
     pub fn renamed_mop_in_unit(&self) -> RenamedMOp<UnitOutRegNum<DynSize>, DynSize> {
         RenamedMOp[self.unit_out_reg_num()][self.p_reg_num_width()]
     }
+    pub fn renamed_mop(&self) -> RenamedMOp<PRegNum<DynSize, DynSize>, DynSize> {
+        RenamedMOp[self.p_reg_num()][self.p_reg_num_width()]
+    }
     pub fn unit_output_write(&self) -> UnitOutputWrite<DynSize> {
         UnitOutputWrite[self.out_reg_num_width]
     }
     pub fn unit_output_writes(&self) -> Array<HdlOption<UnitOutputWrite<DynSize>>> {
         Array[HdlOption[self.unit_output_write()]][self.non_const_unit_nums().len()]
     }
-    pub fn unit_cancel_input(&self) -> UnitCancelInput<DynSize> {
-        UnitCancelInput[self.out_reg_num_width]
-    }
     pub fn unit_forwarding_info(&self) -> UnitForwardingInfo<DynSize, DynSize, DynSize> {
         UnitForwardingInfo[self.unit_num_width()][self.out_reg_num_width]
             [self.non_const_unit_nums().len()]
@@ -107,13 +106,34 @@ impl CpuConfig {
         &self,
         mop_ty: MOp,
         extra_out_ty: ExtraOut,
-    ) -> UnitToRegAlloc<MOp, ExtraOut, DynSize, DynSize, DynSize> {
+    ) -> UnitToRegAlloc<MOp, ExtraOut, DynSize, DynSize, DynSize, DynSize> {
         assert_eq!(
             mop_ty.dest_reg_ty(),
             self.unit_out_reg_num(),
             "inconsistent types",
         );
         UnitToRegAlloc[mop_ty][extra_out_ty][self.unit_num_width()][self.out_reg_num_width]
-            [self.non_const_unit_nums().len()]
+            [self.non_const_unit_nums().len()][self.retire_queue_index_width()]
+    }
+    pub fn retire_queue_index_width(&self) -> usize {
+        let max_in_flight: usize = (0..self.units.len())
+            .map(|unit_index| self.unit_max_in_flight(unit_index).get())
+            .sum(); // total in-flight capacity summed over every unit
+        2 + max_in_flight.next_power_of_two().ilog2() as usize // ceil(log2(total)) + 2 bits
+    }
+    pub fn retire_queue_index(&self) -> RetireQueueIndex<DynSize> {
+        RetireQueueIndex[self.retire_queue_index_width()]
+    }
+    pub fn renamed_insn_data<MOp: Type>(&self, mop: MOp) -> RenamedInsnData<MOp, DynSize> {
+        RenamedInsnData[mop][self.retire_queue_index_width()]
+    }
+    pub fn execute_start<MOp: Type>(&self, mop: MOp) -> ExecuteStart<MOp, DynSize> {
+        ExecuteStart[mop][self.retire_queue_index_width()]
+    }
+    pub fn execute_end<ExtraOut: Type>(
+        &self,
+        extra_out_ty: ExtraOut,
+    ) -> ExecuteEnd<DynSize, DynSize, ExtraOut> {
+        ExecuteEnd[self.out_reg_num_width][self.retire_queue_index_width()][extra_out_ty]
     }
 }
diff --git a/crates/cpu/src/instruction.rs b/crates/cpu/src/instruction.rs
index 80dd9d5..cdd7c9c 100644
--- a/crates/cpu/src/instruction.rs
+++ b/crates/cpu/src/instruction.rs
@@ -910,11 +910,9 @@ impl MOpRegNum {
     //
     // TODO: maybe add more registers later.
     pub const FLAG_REG_NUMS: Range<u32> = 0xFE..0x100;
-    /// registers handled by a special small rename table (for flags and stuff, since it has more read/write ports)
-    pub const SPECIAL_REG_NUMS: Range<u32> = Self::FLAG_REG_NUMS;
-    /// registers handled by the large rename table for normal registers (has less read/write ports)
-    pub const NORMAL_REG_NUMS: Range<u32> =
-        Self::CONST_ZERO_REG_NUM + 1..Self::SPECIAL_REG_NUMS.start;
+    /// registers that aren't constants
+    pub const NON_CONST_REG_NUMS: Range<u32> =
+        Self::CONST_ZERO_REG_NUM + 1..Self::FLAG_REG_NUMS.end;
 }
 
 #[hdl(cmp_eq)]
@@ -929,29 +927,6 @@ pub struct MOpDestReg {
     pub flag_regs: Array<HdlOption<()>, { range_u32_len(&MOpRegNum::FLAG_REG_NUMS) }>,
 }
 
-#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
-pub enum RenameTableName {
-    /// the large rename table for normal registers (has less read/write ports)
-    Normal,
-    /// a special small rename table (for flags and stuff, since it has more read/write ports)
-    Special,
-}
-
-impl RenameTableName {
-    pub const fn reg_range(self) -> std::ops::Range<u32> {
-        match self {
-            Self::Normal => MOpRegNum::NORMAL_REG_NUMS,
-            Self::Special => MOpRegNum::SPECIAL_REG_NUMS,
-        }
-    }
-    pub const fn as_str(self) -> &'static str {
-        match self {
-            Self::Normal => "rename_table_normal",
-            Self::Special => "rename_table_special",
-        }
-    }
-}
-
 #[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
 pub enum MOpDestRegKind {
     NormalReg {
@@ -989,16 +964,13 @@ impl fmt::Display for MOpDestRegName {
 }
 
 impl MOpDestRegKind {
-    pub const fn reg_range(self) -> std::ops::Range<u32> {
+    pub const fn reg_num_range(self) -> std::ops::Range<u32> {
         match self {
-            Self::NormalReg { .. } => MOpRegNum::NORMAL_REG_NUMS,
-            Self::FlagReg { .. } => MOpRegNum::FLAG_REG_NUMS,
-        }
-    }
-    pub const fn rename_table_names(self) -> &'static [RenameTableName] {
-        match self {
-            Self::NormalReg { .. } => &[RenameTableName::Normal, RenameTableName::Special],
-            Self::FlagReg { .. } => &[RenameTableName::Special],
+            Self::NormalReg { dest_reg_index: _ } => MOpRegNum::NON_CONST_REG_NUMS,
+            Self::FlagReg {
+                reg_num,
+                flag_reg_index: _,
+            } => reg_num..reg_num + 1,
         }
     }
     pub fn fixed_reg_num(self) -> Option<u32> {
diff --git a/crates/cpu/src/instruction_rename.rs b/crates/cpu/src/instruction_rename.rs
new file mode 100644
index 0000000..a96c0f6
--- /dev/null
+++ b/crates/cpu/src/instruction_rename.rs
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// See Notices.txt for copyright information
+
+use crate::{
+    config::CpuConfig,
+    instruction::{MOp, MOpDestReg, MOpRegNum, MOpTrait, MoveRegMOp, PRegNum, RenamedMOp},
+    rename_table::{rename_table, RenameTablePortConfig},
+    unit::{RenamedInsnData, RetireQueueIndex, UnitMOp},
+    util::array_vec::{ArrayVec, Length, ReadyValidArray},
+};
+use fayalite::{
+    prelude::*,
+    util::{prefix_sum::PrefixSumAlgorithm, ready_valid::ReadyValid},
+};
+
+#[hdl]
+pub struct InstructionRenameInputInsn<UnitNumWidth: Size, OutRegNumWidth: Size> {
+    pub mop: MOp, // op whose source/dest registers `instruction_rename` maps
+    pub pc: UInt<64>, // copied unchanged into the output `RenamedInsnData::pc`
+    pub renamed_dest: PRegNum<UnitNumWidth, OutRegNumWidth>, // written as the dest's new mapping
+}
+
+impl CpuConfig {
+    pub fn instruction_rename_input_insn(&self) -> InstructionRenameInputInsn<DynSize, DynSize> {
+        InstructionRenameInputInsn[self.unit_num_width()][self.out_reg_num_width]
+    }
+}
+
+#[hdl]
+struct InsnsInPrefixSummary<FetchWidth: Size> {
+    all_ready: Bool, // every slot summarized here is ready (moves always count as ready)
+    ready_count: Length<FetchWidth>, // ready slots counted until the first non-ready one
+    retire_queue_used: Length<FetchWidth>, // retire queue entries used by the counted slots
+}
+
+#[hdl_module]
+pub fn instruction_rename(config: &CpuConfig) {
+    #[hdl]
+    let cd: ClockDomain = m.input();
+    #[hdl]
+    let insns_in: ReadyValidArray<InstructionRenameInputInsn<DynSize, DynSize>, DynSize> =
+        m.input(ReadyValidArray[config.instruction_rename_input_insn()][config.fetch_width.get()]);
+    #[hdl]
+    let start_retire_queue_index: RetireQueueIndex<DynSize> = m.input(config.retire_queue_index());
+    #[hdl]
+    let end_retire_queue_index: RetireQueueIndex<DynSize> = m.output(config.retire_queue_index());
+    #[hdl]
+    let insns_out: Array<
+        ReadyValid<RenamedInsnData<RenamedMOp<PRegNum<DynSize, DynSize>, DynSize>, DynSize>>,
+    > = m.output(
+        Array[ReadyValid[config.renamed_insn_data(config.renamed_mop())]][config.fetch_width.get()],
+    );
+    // renames each accepted op via `rename_table` and tags it with a retire queue index.
+    // TODO: handle resetting table after cancelling instructions
+
+    #[hdl]
+    let insns_ready_or_move = wire(Array[Bool][config.fetch_width.get()]); // one per slot
+
+    for (insn_ready_or_move, insn_out) in insns_ready_or_move.into_iter().zip(insns_out) {
+        connect(insn_ready_or_move, insn_out.ready); // default: follows the output's ready
+    }
+
+    ArrayVec::for_each(insns_in.data, |fetch_index, input_insn| {
+        #[hdl]
+        match input_insn.mop {
+            UnitMOp::<_, _, _>::TransformedMove(_) => {
+                connect(insns_ready_or_move[fetch_index], true); // moves always count as ready
+            }
+            UnitMOp::<_, _, _>::AluBranch(_) | UnitMOp::<_, _, _>::LoadStore(_) => {}
+        }
+    });
+
+    let insns_in_prefix_summary_ty = InsnsInPrefixSummary[config.fetch_width.get()];
+    #[hdl]
+    let insns_in_prefix_summaries =
+        wire(Array[insns_in_prefix_summary_ty][config.fetch_width.get()]);
+    let insns_in_prefix_summaries_vec = PrefixSumAlgorithm::WorkEfficient.run(
+        (0..config.fetch_width.get()).map(|fetch_index| {
+            #[hdl]
+            let insns_in_prefix_summary_in = wire(insns_in_prefix_summary_ty);
+            #[hdl]
+            let InsnsInPrefixSummary::<_> {
+                all_ready,
+                ready_count,
+                retire_queue_used,
+            } = insns_in_prefix_summary_in;
+            connect(all_ready, insns_out[fetch_index].ready); // leaf summary: just this slot
+            connect(
+                ready_count,
+                Expr::ty(ready_count).cast_from_uint_unchecked(all_ready.cast_to(UInt[1])),
+            );
+            connect(retire_queue_used, Expr::ty(retire_queue_used).zero()); // no insn here
+            #[hdl]
+            if let HdlSome(input_insn) = ArrayVec::get(insns_in.data, fetch_index) {
+                connect(retire_queue_used, ready_count); // one entry if the slot is ready
+                #[hdl]
+                match input_insn.mop {
+                    UnitMOp::<_, _, _>::TransformedMove(_) => {
+                        connect(all_ready, true); // moves don't wait for the output
+                        connect(retire_queue_used, Expr::ty(retire_queue_used).zero()); // no entry
+                    }
+                    UnitMOp::<_, _, _>::AluBranch(_) | UnitMOp::<_, _, _>::LoadStore(_) => {}
+                }
+            }
+            insns_in_prefix_summary_in
+        }),
+        |l, r| {
+            #[hdl]
+            let insns_in_prefix_summary_merge = wire(insns_in_prefix_summary_ty);
+            #[hdl]
+            let InsnsInPrefixSummary::<_> {
+                all_ready,
+                ready_count,
+                retire_queue_used,
+            } = insns_in_prefix_summary_merge;
+            connect(all_ready, l.all_ready & r.all_ready); // ready only if both halves are
+            #[hdl]
+            if l.all_ready {
+                connect(
+                    ready_count,
+                    Expr::ty(ready_count).cast_from_uint_unchecked(
+                        Length::as_uint(l.ready_count) + Length::as_uint(r.ready_count),
+                    ),
+                );
+                connect(
+                    retire_queue_used,
+                    Expr::ty(retire_queue_used).cast_from_uint_unchecked(
+                        Length::as_uint(l.retire_queue_used) + Length::as_uint(r.retire_queue_used),
+                    ),
+                );
+            } else {
+                connect(ready_count, l.ready_count); // blocked inside `l`, so `r` adds nothing
+                connect(retire_queue_used, l.retire_queue_used);
+            }
+            insns_in_prefix_summary_merge
+        },
+    );
+    for (l, r) in insns_in_prefix_summaries
+        .into_iter()
+        .zip(insns_in_prefix_summaries_vec)
+    {
+        connect(l, r);
+    }
+    connect(
+        insns_in.ready, // driven by the last summary's ready count
+        insns_in_prefix_summaries[config.fetch_width.get() - 1].ready_count,
+    );
+
+    #[hdl]
+    let retire_queue_indexes =
+        wire(Array[config.retire_queue_index()][config.fetch_width.get() + 1]);
+    connect(retire_queue_indexes[0], start_retire_queue_index); // index for slot 0
+    connect(
+        end_retire_queue_index,
+        retire_queue_indexes[config.fetch_width.get()], // index after the last slot
+    );
+    for (retire_queue_index, insns_in_prefix_summary) in retire_queue_indexes
+        .into_iter()
+        .skip(1)
+        .zip(insns_in_prefix_summaries)
+    {
+        connect_any(
+            retire_queue_index.index, // start + entries used so far
+            start_retire_queue_index.index
+                + Length::as_uint(insns_in_prefix_summary.retire_queue_used),
+        );
+    }
+
+    let mut port_configs = Vec::new();
+    let mut src_reg_count = 0; // raised below to 1 + the largest src_index seen
+    MOpTrait::for_each_src_reg(MOp.uninit(), &mut |_, src_index| {
+        src_reg_count = src_reg_count.max(src_index + 1);
+    });
+    for _ in 0..config.fetch_width.get() {
+        for _ in 0..src_reg_count {
+            port_configs.push(RenameTablePortConfig::Read {
+                addr_range: MOpRegNum::NON_CONST_REG_NUMS,
+            });
+        }
+        for dest_reg_kind in MOpDestReg::REG_KINDS {
+            port_configs.push(RenameTablePortConfig::Write {
+                addr_range: dest_reg_kind.reg_num_range(),
+            });
+        }
+    }
+
+    #[hdl]
+    let rename_table = instance(rename_table(config, &port_configs));
+
+    connect(rename_table.cd, cd);
+
+    for read_port in rename_table.read_ports {
+        connect_any(read_port.addr, 0_hdl_u0); // default when no firing insn uses the port
+    }
+    for write_port in rename_table.write_ports {
+        connect_any(write_port.addr, 0_hdl_u0); // default when no firing insn uses the port
+        connect_any(write_port.data, config.p_reg_num().const_zero());
+    }
+
+    ArrayVec::for_each(
+        ReadyValidArray::firing_data(insns_in),
+        |fetch_index, input_insn| {
+            let read_port_index = fetch_index * src_reg_count; // first read port of this slot
+            let write_port_index = fetch_index * MOpDestReg::REG_COUNT; // first write port
+            #[hdl]
+            let InstructionRenameInputInsn::<_, _> {
+                mop,
+                pc,
+                renamed_dest,
+            } = input_insn;
+            let insn_out = MOpTrait::map_regs(
+                mop,
+                renamed_dest,
+                config.p_reg_num_width(),
+                &mut |src_reg, src_index| {
+                    connect(
+                        rename_table.read_ports[read_port_index + src_index].addr,
+                        src_reg.cast_bits_to(MOpRegNum),
+                    );
+                    rename_table.read_ports[read_port_index + src_index]
+                        .data
+                        .cast_to_bits()
+                },
+            );
+            for (i, dest_reg) in MOpDestReg::regs(MOpTrait::dest_reg(mop))
+                .into_iter()
+                .enumerate()
+            {
+                connect(
+                    rename_table.write_ports[write_port_index + i].addr,
+                    dest_reg,
+                );
+                connect(
+                    rename_table.write_ports[write_port_index + i].data,
+                    renamed_dest, // new mapping for this dest reg
+                );
+            }
+            let insn_out = UnitMOp::try_with_transformed_move_op(
+                insn_out,
+                config.renamed_mop().TransformedMove,
+                |insn_out: Expr<HdlOption<_>>, move_reg: Expr<MoveRegMOp<_, _>>| {
+                    for i in 0..MOpDestReg::REG_COUNT {
+                        // execute move by using same PRegNum as src[0] for dest
+                        connect(
+                            rename_table.write_ports[write_port_index + i].data,
+                            move_reg.common.src[0].cast_bits_to(config.p_reg_num()),
+                        );
+                    }
+                    // move already executed, so remove it
+                    connect(insn_out, Expr::ty(insn_out).HdlNone());
+                },
+            );
+            connect(
+                insns_out[fetch_index].data,
+                HdlOption::map(insn_out, |insn_out| {
+                    #[hdl]
+                    RenamedInsnData::<_, _> {
+                        retire_queue_index: retire_queue_indexes[fetch_index],
+                        pc,
+                        mop: insn_out,
+                    }
+                }),
+            );
+        },
+    );
+}
diff --git a/crates/cpu/src/lib.rs b/crates/cpu/src/lib.rs
index bae3720..766811d 100644
--- a/crates/cpu/src/lib.rs
+++ b/crates/cpu/src/lib.rs
@@ -2,7 +2,9 @@
 // See Notices.txt for copyright information
 pub mod config;
 pub mod instruction;
+pub mod instruction_rename;
 pub mod reg_alloc;
 pub mod register;
+pub mod rename_table;
 pub mod unit;
 pub mod util;
diff --git a/crates/cpu/src/reg_alloc.rs b/crates/cpu/src/reg_alloc.rs
index 6e7bc5d..acf796d 100644
--- a/crates/cpu/src/reg_alloc.rs
+++ b/crates/cpu/src/reg_alloc.rs
@@ -3,17 +3,17 @@
 use crate::{
     config::CpuConfig,
     instruction::{
-        MOp, MOpDestReg, MOpRegNum, MOpTrait, MoveRegMOp, PRegNum, RenameTableName, UnitOutRegNum,
+        MOp, MOpDestReg, MOpRegNum, MOpTrait, MoveRegMOp, PRegNum, UnitOutRegNum,
         COMMON_MOP_SRC_LEN,
     },
     unit::{
-        unit_base::{UnitForwardingInfo, UnitInput},
         GlobalState, TrapData, UnitMOp, UnitOutput, UnitOutputWrite, UnitResult,
         UnitResultCompleted, UnitTrait,
     },
-    util::tree_reduce::tree_reduce_with_state,
+    util::array_vec::ReadyValidArray,
 };
 use fayalite::{
+    int::BoolOrIntType,
     memory::{splat_mask, WriteStruct},
     module::{instance_with_loc, memory_with_loc, wire_with_loc},
     prelude::*,
@@ -44,150 +44,12 @@ pub enum FetchDecodeSpecialOp {
 
 #[hdl]
 pub struct FetchDecodeInterface<FetchWidth: Size> {
-    pub decoded_insns: ArrayType<ReadyValid<FetchedDecodedMOp>, FetchWidth>,
+    pub decoded_insns: ReadyValidArray<FetchedDecodedMOp, FetchWidth>,
     #[hdl(flip)]
     pub fetch_decode_special_op: ReadyValid<FetchDecodeSpecialOp>,
 }
 
-#[hdl]
-struct ROBRenamedInsn<UnitNumWidth: Size, OutRegNumWidth: Size> {
-    mop_dest: MOpDestReg,
-    p_dest: PRegNum<UnitNumWidth, OutRegNumWidth>,
-}
-
-#[hdl]
-struct ROBEntry<UnitNumWidth: Size, OutRegNumWidth: Size> {
-    renamed_insn: ROBRenamedInsn<UnitNumWidth, OutRegNumWidth>,
-    dest_written: Bool,
-}
-
-#[hdl_module]
-fn rob(config: &CpuConfig) {
-    #[hdl]
-    let cd: ClockDomain = m.input();
-    #[hdl]
-    let renamed_insns_in: Array<ReadyValid<ROBRenamedInsn<DynSize, DynSize>>> = m.input(
-        Array[ReadyValid[ROBRenamedInsn[config.unit_num_width()][config.out_reg_num_width]]]
-            [config.fetch_width.get()],
-    );
-    #[hdl]
-    let unit_forwarding_info: UnitForwardingInfo<DynSize, DynSize, DynSize> =
-        m.input(config.unit_forwarding_info());
-
-    let rob_entry_ty = ROBEntry[config.unit_num_width()][config.out_reg_num_width];
-    #[hdl]
-    let rob = reg_builder()
-        .clock_domain(cd)
-        .no_reset(Array[rob_entry_ty][config.rob_size.get()]);
-    #[hdl]
-    let rob_valid_start = reg_builder()
-        .clock_domain(cd)
-        .reset(UInt::range(0..config.rob_size.get()).zero());
-    #[hdl]
-    let rob_valid_end = reg_builder()
-        .clock_domain(cd)
-        .reset(UInt::range(0..config.rob_size.get()).zero());
-    #[hdl]
-    let free_space = wire(UInt::range_inclusive(0..=config.rob_size.get()));
-    #[hdl]
-    if rob_valid_end.cmp_lt(rob_valid_start) {
-        // rob_valid_end wrapped around but start didn't
-        connect_any(
-            free_space,
-            rob_valid_end + config.rob_size.get() - rob_valid_start,
-        );
-    } else {
-        connect_any(free_space, rob_valid_end - rob_valid_start);
-    }
-
-    struct IndexAndRange {
-        index: Expr<UInt>,
-        range: std::ops::Range<usize>,
-    }
-
-    let mut next_write_index = IndexAndRange {
-        index: rob_valid_end,
-        range: 0..config.rob_size.get(),
-    };
-    for fetch_index in 0..config.fetch_width.get() {
-        let write_index = next_write_index;
-        let next_write_index_range = write_index.range.start..write_index.range.end + 1;
-        next_write_index = IndexAndRange {
-            index: wire_with_loc(
-                &format!("next_write_index_{fetch_index}"),
-                SourceLocation::caller(),
-                UInt::range(next_write_index_range.clone()),
-            ),
-            range: next_write_index_range,
-        };
-        connect(
-            renamed_insns_in[fetch_index].ready,
-            fetch_index.cmp_lt(free_space),
-        );
-        #[hdl]
-        if let HdlSome(renamed_insn) = ReadyValid::firing_data(renamed_insns_in[fetch_index]) {
-            for i in write_index.range.clone() {
-                #[hdl]
-                if write_index.index.cmp_eq(i) {
-                    connect(
-                        rob[i % config.rob_size.get()],
-                        #[hdl]
-                        ROBEntry {
-                            renamed_insn,
-                            dest_written: false,
-                        },
-                    );
-                }
-            }
-        }
-        // TODO: optimize write_index chain better
-        connect_any(
-            next_write_index.index,
-            write_index.index
-                + ReadyValid::firing(renamed_insns_in[fetch_index]).cast_to_static::<UInt<1>>(),
-        );
-    }
-    assert!(
-        config.rob_size >= config.fetch_width,
-        "rob_size ({}) is too small for fetch_width = {} -- next_write_index would overflow",
-        config.rob_size,
-        config.fetch_width,
-    );
-    #[hdl]
-    if next_write_index.index.cmp_lt(config.rob_size.get()) {
-        connect_any(rob_valid_end, next_write_index.index);
-    } else {
-        connect_any(
-            rob_valid_end,
-            next_write_index.index - config.rob_size.get(),
-        );
-    }
-
-    // TODO: optimize better, O(rob_size * unit_count) is too big here
-    for rob_index in 0..config.rob_size.get() {
-        for unit_index in 0..config.non_const_unit_nums().len() {
-            #[hdl]
-            if let HdlSome(unit_output_write) = unit_forwarding_info.unit_output_writes[unit_index]
-            {
-                #[hdl]
-                let UnitOutputWrite::<_> {
-                    which: unit_out_reg,
-                    value: _,
-                } = unit_output_write;
-                let p_reg_num = #[hdl]
-                PRegNum::<_, _> {
-                    unit_num: config.unit_num().from_index(unit_index),
-                    unit_out_reg,
-                };
-                #[hdl]
-                if rob[rob_index].renamed_insn.p_dest.cmp_eq(p_reg_num) {
-                    connect(rob[rob_index].dest_written, true);
-                }
-            }
-        }
-    }
-}
-
+#[cfg(todo)]
 #[hdl_module]
 /// combination register allocator, register renaming, unit selection, and retire handling
 pub fn reg_alloc(config: &CpuConfig) {
@@ -205,10 +67,6 @@ pub fn reg_alloc(config: &CpuConfig) {
     );
     // TODO: finish
 
-    #[hdl]
-    let rob = instance(rob(config));
-    connect(rob.cd, cd);
-
     let mut rename_table_mems = BTreeMap::<RenameTableName, MemBuilder<_>>::new();
 
     for reg_kind in MOpDestReg::REG_KINDS {
@@ -238,11 +96,6 @@ pub fn reg_alloc(config: &CpuConfig) {
     #[hdl]
     let renamed_mops_out_reg = wire(Array[HdlOption[config.p_reg_num()]][config.fetch_width.get()]);
     for fetch_index in 0..config.fetch_width.get() {
-        // TODO: finish
-        connect(
-            rob.renamed_insns_in[fetch_index].data,
-            Expr::ty(rob).renamed_insns_in.element().data.HdlNone(),
-        );
         // TODO: finish
         connect(
             fetch_decode_interface.decoded_insns[fetch_index].ready,
@@ -483,7 +336,6 @@ pub fn reg_alloc(config: &CpuConfig) {
     );
     #[hdl]
     let unit_forwarding_info = wire(config.unit_forwarding_info());
-    connect(rob.unit_forwarding_info, unit_forwarding_info);
     for (unit_index, unit_config) in config.units.iter().enumerate() {
         let dyn_unit = unit_config.kind.unit(config, unit_index);
         let unit = instance_with_loc(
diff --git a/crates/cpu/src/reg_alloc/unit_free_regs_tracker.rs b/crates/cpu/src/reg_alloc/unit_free_regs_tracker.rs
index d19cf2a..133049a 100644
--- a/crates/cpu/src/reg_alloc/unit_free_regs_tracker.rs
+++ b/crates/cpu/src/reg_alloc/unit_free_regs_tracker.rs
@@ -1,7 +1,10 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 // See Notices.txt for copyright information
-use crate::util::tree_reduce::tree_reduce;
-use fayalite::{module::wire_with_loc, prelude::*, util::ready_valid::ReadyValid};
+use fayalite::{
+    module::wire_with_loc,
+    prelude::*,
+    util::{prefix_sum::reduce, ready_valid::ReadyValid},
+};
 use std::{num::NonZeroUsize, ops::Range};
 
 #[hdl_module]
@@ -44,7 +47,7 @@ pub fn unit_free_regs_tracker(
         count,
         count_overflowed,
         alloc_nums,
-    }) = tree_reduce(
+    }) = reduce(
         (0..reg_count).map(|index| Summary {
             range: index..index + 1,
             count: (!allocated_reg[index])
diff --git a/crates/cpu/src/rename_table.rs b/crates/cpu/src/rename_table.rs
new file mode 100644
index 0000000..b88f696
--- /dev/null
+++ b/crates/cpu/src/rename_table.rs
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// See Notices.txt for copyright information
+
+use crate::{
+    config::CpuConfig,
+    instruction::{MOpRegNum, PRegNum},
+    util::range_intersection,
+};
+use fayalite::{
+    memory::{splat_mask, ReadStruct, WriteStruct},
+    module::memory_with_loc,
+    prelude::*,
+};
+use std::{mem, ops::Range};
+
+#[hdl]
+pub struct RenameTableReadPort<UnitNumWidth: Size, OutRegNumWidth: Size> {
+    pub addr: MOpRegNum, // register number to look up
+    #[hdl(flip)]
+    pub data: PRegNum<UnitNumWidth, OutRegNumWidth>, // lookup result, driven by `rename_table`
+}
+
+#[hdl]
+pub struct RenameTableWritePort<UnitNumWidth: Size, OutRegNumWidth: Size> {
+    pub addr: MOpRegNum, // register number whose mapping is replaced
+    pub data: PRegNum<UnitNumWidth, OutRegNumWidth>, // new mapping stored at `addr`
+}
+
+#[derive(Clone, Debug)]
+pub enum RenameTablePortConfig {
+    Read { addr_range: Range<u32> }, // read port; only memories overlapping the range get it
+    Write { addr_range: Range<u32> }, // write port; only memories overlapping the range get it
+}
+
+/// register rename table.
+/// all read/write operations are done in the order of `port_configs`.
+/// So if `port_configs[0]` is a write and `port_configs[1]` is a read,
+/// then the read port will combinatorially return data written by the
+/// write port in the *same* clock cycle. However, if `port_configs[0]`
+/// is a read and `port_configs[1]` is a write, then the read port will
+/// not see the data written by the write port until the *next* clock cycle.
+#[hdl_module]
+pub fn rename_table(config: &CpuConfig, port_configs: &[RenameTablePortConfig]) {
+    let read_count = port_configs
+        .iter()
+        .filter(|v| matches!(v, RenameTablePortConfig::Read { .. }))
+        .count(); // length of `read_ports`
+    let write_count = port_configs
+        .iter()
+        .filter(|v| matches!(v, RenameTablePortConfig::Write { .. }))
+        .count(); // length of `write_ports`
+
+    #[hdl]
+    let cd: ClockDomain = m.input();
+    #[hdl]
+    let read_ports: Array<RenameTableReadPort<DynSize, DynSize>> = m.input(
+        Array[RenameTableReadPort[config.unit_num_width()][config.out_reg_num_width]][read_count],
+    );
+    #[hdl]
+    let write_ports: Array<RenameTableWritePort<DynSize, DynSize>> = m.input(
+        Array[RenameTableWritePort[config.unit_num_width()][config.out_reg_num_width]][write_count],
+    );
+
+    for read_port in read_ports {
+        connect(read_port.data, config.p_reg_num().const_zero()); // default if nothing matches
+    }
+
+    let port_configs_and_indexes = port_configs.iter().scan(
+        (0usize, 0), // next (read, write) port index
+        |(read_port_index, write_port_index), port_config| {
+            Some((
+                port_config,
+                match port_config {
+                    RenameTablePortConfig::Read { .. } => {
+                        mem::replace(read_port_index, *read_port_index + 1)
+                    }
+                    RenameTablePortConfig::Write { .. } => {
+                        mem::replace(write_port_index, *write_port_index + 1)
+                    }
+                },
+            ))
+        },
+    );
+
+    let mut range_transitions = Vec::with_capacity(port_configs.len() * 2);
+    for port_config in port_configs {
+        let (RenameTablePortConfig::Read { addr_range }
+        | RenameTablePortConfig::Write { addr_range }) = port_config;
+        range_transitions.push(addr_range.start);
+        range_transitions.push(addr_range.end);
+    }
+    range_transitions.sort_unstable(); // sorted, deduped endpoints delimit the sub-ranges
+    range_transitions.dedup();
+    let mut last_range_transition = None;
+    for range_transition in range_transitions {
+        let Some(last_range_transition) = last_range_transition.replace(range_transition) else {
+            continue; // first endpoint: nothing to pair it with yet
+        };
+        let cur_addr_range = last_range_transition..range_transition; // gets its own memory
+        let mut mem = memory_with_loc(
+            &if cur_addr_range.len() == 1 {
+                format!("mem_{:#x}", cur_addr_range.start)
+            } else {
+                format!("mem_{:#x}_{:#x}", cur_addr_range.start, cur_addr_range.end)
+            },
+            config.p_reg_num(),
+            SourceLocation::caller(),
+        );
+        mem.depth(cur_addr_range.len()); // one entry per register number in the sub-range
+        let addr_in_range = |addr: Expr<MOpRegNum>| {
+            if cur_addr_range.len() == 1 {
+                addr.value.cmp_eq(cur_addr_range.start)
+            } else {
+                addr.value.cmp_ge(cur_addr_range.start) & addr.value.cmp_lt(cur_addr_range.end)
+            }
+        };
+        for (port_config, port_index) in port_configs_and_indexes.clone() {
+            match port_config {
+                RenameTablePortConfig::Read { addr_range } => {
+                    if range_intersection(&addr_range, &cur_addr_range).is_none() {
+                        continue; // this port never addresses the sub-range
+                    }
+                    let port = read_ports[port_index];
+                    #[hdl]
+                    let ReadStruct::<_, _> {
+                        addr,
+                        en,
+                        clk,
+                        data,
+                    } = mem.new_read_port();
+                    connect_any(addr, port.addr.value - cur_addr_range.start); // rebase to 0
+                    connect(en, addr_in_range(port.addr)); // hit only inside this sub-range
+                    connect(clk, cd.clk);
+                    #[hdl]
+                    if en {
+                        connect(port.data, data); // hit: use the stored mapping
+                    }
+                }
+                RenameTablePortConfig::Write { addr_range } => {
+                    if range_intersection(&addr_range, &cur_addr_range).is_none() {
+                        continue; // this port never addresses the sub-range
+                    }
+                    let port = write_ports[port_index];
+                    #[hdl]
+                    let WriteStruct::<_, _> {
+                        addr,
+                        en,
+                        clk,
+                        data,
+                        mask,
+                    } = mem.new_write_port();
+                    connect_any(addr, port.addr.value - cur_addr_range.start); // rebase to 0
+                    connect(en, addr_in_range(port.addr)); // write only inside this sub-range
+                    connect(clk, cd.clk);
+                    connect(data, port.data);
+                    connect(mask, splat_mask(Expr::ty(port).data, true.to_expr())); // mask all true
+                }
+            }
+        }
+    }
+    for (port_config_index, (port_config, port_index)) in
+        port_configs_and_indexes.clone().enumerate()
+    {
+        let RenameTablePortConfig::Read { addr_range } = port_config else {
+            continue; // forwarding is only set up for read ports
+        };
+        let port = read_ports[port_index];
+        for (prev_port_config, prev_port_index) in
+            port_configs_and_indexes.clone().take(port_config_index)
+        {
+            let RenameTablePortConfig::Write {
+                addr_range: prev_addr_range,
+            } = prev_port_config
+            else {
+                continue; // only earlier write ports can be forwarded
+            };
+            if range_intersection(addr_range, prev_addr_range).is_none() {
+                continue; // disjoint ranges: no forwarding logic is generated
+            }
+            let prev_port = write_ports[prev_port_index];
+            #[hdl]
+            if prev_port.addr.cmp_eq(port.addr) {
+                connect(port.data, prev_port.data); // same-cycle bypass of the earlier write
+            }
+        }
+    }
+}
diff --git a/crates/cpu/src/unit.rs b/crates/cpu/src/unit.rs
index cc11c55..35af5c5 100644
--- a/crates/cpu/src/unit.rs
+++ b/crates/cpu/src/unit.rs
@@ -8,13 +8,15 @@ use crate::{
         RenamedMOp, UnitOutRegNum,
     },
     register::{FlagsMode, PRegValue},
-    unit::unit_base::UnitToRegAlloc,
 };
 use fayalite::{
     bundle::{Bundle, BundleType},
+    int::BoolOrIntType,
     intern::{Intern, Interned},
     prelude::*,
+    util::ready_valid::ReadyValid,
 };
+use std::marker::PhantomData;
 
 pub mod alu_branch;
 pub mod unit_base;
@@ -253,6 +255,92 @@ pub struct GlobalState {
     pub flags_mode: FlagsMode,
 }
 
+/// index into the retire queue (the virtual queue of instructions that haven't yet retired)
+#[hdl(cmp_eq)]
+pub struct RetireQueueIndex<Width: Size> {
+    /// increases by one for each instruction added to the retire queue.
+    ///
+    /// this wraps around, so you must not compare it using `cmp_lt`/`cmp_gt`
+    /// but instead must use [`Self::insns_until`] and compare the output with zero.
+    pub index: UIntType<Width>,
+}
+
+impl<Width: Size> RetireQueueIndex<Width> {
+    /// returns `this.index - target.index` converted to a signed integer that has
+    /// the same width as `index`.
+    ///
+    /// panics if `this` and `target` don't have the same type.
+    pub fn insns_until(
+        this: impl ToExpr<Type = Self>,
+        target: impl ToExpr<Type = Self>,
+    ) -> Expr<SIntType<Width>> {
+        let this = this.to_expr();
+        let target = target.to_expr();
+        assert_eq!(Expr::ty(this), Expr::ty(target));
+        (this.index - target.index).cast_to(Expr::ty(this).index.as_same_width_sint())
+    }
+}
+
+/// a µop along with its `pc` and its index in the retire queue.
+#[hdl]
+pub struct RenamedInsnData<MOp, RetireQueueIndexWidth: Size> {
+    pub retire_queue_index: RetireQueueIndex<RetireQueueIndexWidth>,
+    pub pc: UInt<64>,
+    pub mop: MOp,
+}
+
+/// arrays with `UnitCount` elements of optional output writes and optional register frees.
+#[hdl]
+pub struct UnitForwardingInfo<UnitNumWidth: Size, OutRegNumWidth: Size, UnitCount: Size> {
+    pub unit_output_writes: ArrayType<HdlOption<UnitOutputWrite<OutRegNumWidth>>, UnitCount>,
+    pub unit_reg_frees: ArrayType<HdlOption<UnitOutRegNum<OutRegNumWidth>>, UnitCount>,
+    /// `UnitNumWidth` isn't used by any other field.
+    pub _phantom: PhantomData<UnitNumWidth>,
+}
+
+/// the bundle of signals returned by [`UnitTrait::unit_to_reg_alloc`].
+///
+/// fields marked `#[hdl(flip)]` go in the opposite direction from `output`.
+#[hdl]
+pub struct UnitToRegAlloc<
+    MOp: Type,
+    ExtraOut: Type,
+    UnitNumWidth: Size,
+    OutRegNumWidth: Size,
+    UnitCount: Size,
+    RetireQueueIndexWidth: Size,
+> {
+    #[hdl(flip)]
+    pub unit_forwarding_info: UnitForwardingInfo<UnitNumWidth, OutRegNumWidth, UnitCount>,
+    #[hdl(flip)]
+    pub input: ReadyValid<RenamedInsnData<MOp, RetireQueueIndexWidth>>,
+    /// [`UnitCancelInput`] is generic over the retire queue index width, so it must be
+    /// given `RetireQueueIndexWidth` rather than `OutRegNumWidth`.
+    #[hdl(flip)]
+    pub cancel_input: HdlOption<UnitCancelInput<RetireQueueIndexWidth>>,
+    pub output: HdlOption<UnitOutput<OutRegNumWidth, RetireQueueIndexWidth, ExtraOut>>,
+}
+
+impl<
+        MOp: Type,
+        ExtraOut: Type,
+        UnitNumWidth: Size,
+        OutRegNumWidth: Size,
+        UnitCount: Size,
+        RetireQueueIndexWidth: Size,
+    >
+    UnitToRegAlloc<MOp, ExtraOut, UnitNumWidth, OutRegNumWidth, UnitCount, RetireQueueIndexWidth>
+{
+    /// the type of the `mop` field of the data carried by `input`.
+    pub fn mop_ty(self) -> MOp {
+        self.input.data.HdlSome.mop
+    }
+    /// the `ExtraOut` type used by `output`.
+    pub fn extra_out_ty(self) -> ExtraOut {
+        self.output.HdlSome.extra_out_ty()
+    }
+}
+
 #[hdl(cmp_eq)]
 pub struct UnitResultCompleted<ExtraOut> {
     pub value: PRegValue,
@@ -283,20 +357,36 @@ impl<ExtraOut: Type> UnitResult<ExtraOut> {
 }
 
 #[hdl]
-pub struct UnitOutput<OutRegNumWidth: Size, ExtraOut> {
+pub struct UnitOutput<OutRegNumWidth: Size, RetireQueueIndexWidth: Size, ExtraOut> {
     pub which: UnitOutRegNum<OutRegNumWidth>,
+    /// units such as `alu_branch` copy this from the `retire_queue_index` of the instruction.
+    pub retire_queue_index: RetireQueueIndex<RetireQueueIndexWidth>,
     pub result: UnitResult<ExtraOut>,
 }
 
-impl<OutRegNumWidth: Size, ExtraOut: Type> UnitOutput<OutRegNumWidth, ExtraOut> {
+impl<OutRegNumWidth: Size, RetireQueueIndexWidth: Size, ExtraOut: Type>
+    UnitOutput<OutRegNumWidth, RetireQueueIndexWidth, ExtraOut>
+{
     pub fn extra_out_ty(self) -> ExtraOut {
         self.result.extra_out_ty()
     }
 }
 
 #[hdl(cmp_eq)]
-pub struct UnitCancelInput<OutRegNumWidth: Size> {
-    pub which: UnitOutRegNum<OutRegNumWidth>,
+pub struct UnitCancelInput<RetireQueueIndexWidth: Size> {
+    /// instructions are checked against this index by [`Self::is_canceled`].
+    pub target: RetireQueueIndex<RetireQueueIndexWidth>,
+}
+
+impl<RetireQueueIndexWidth: Size> UnitCancelInput<RetireQueueIndexWidth> {
+    /// returns true if the signed result of
+    /// `RetireQueueIndex::insns_until(insn_retire_queue_index, this.target)` is at least zero.
+    pub fn is_canceled(
+        this: impl ToExpr<Type = Self>,
+        insn_retire_queue_index: impl ToExpr<Type = RetireQueueIndex<RetireQueueIndexWidth>>,
+    ) -> Expr<Bool> {
+        RetireQueueIndex::insns_until(insn_retire_queue_index, this.to_expr().target).cmp_ge(0i8)
+    }
 }
 
 pub trait UnitTrait:
@@ -322,7 +408,7 @@ pub trait UnitTrait:
     fn unit_to_reg_alloc(
         &self,
         this: Expr<Self::Type>,
-    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize>>;
+    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize, DynSize>>;
 
     fn cd(&self, this: Expr<Self::Type>) -> Expr<ClockDomain>;
 
@@ -384,7 +470,7 @@ impl UnitTrait for DynUnit {
     fn unit_to_reg_alloc(
         &self,
         this: Expr<Self::Type>,
-    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize>> {
+    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize, DynSize>> {
         self.unit.unit_to_reg_alloc(this)
     }
 
@@ -439,7 +525,7 @@ impl<T: UnitTrait + Clone + std::hash::Hash + Eq> UnitTrait for DynUnitWrapper<T
     fn unit_to_reg_alloc(
         &self,
         this: Expr<Self::Type>,
-    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize>> {
+    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize, DynSize>> {
         Expr::from_bundle(Expr::as_bundle(
             self.0.unit_to_reg_alloc(Expr::from_bundle(this)),
         ))
diff --git a/crates/cpu/src/unit/alu_branch.rs b/crates/cpu/src/unit/alu_branch.rs
index 082fd8d..1a1d723 100644
--- a/crates/cpu/src/unit/alu_branch.rs
+++ b/crates/cpu/src/unit/alu_branch.rs
@@ -9,9 +9,9 @@ use crate::{
     },
     register::{FlagsMode, PRegFlagsPowerISA, PRegFlagsX86, PRegValue},
     unit::{
-        unit_base::{unit_base, ExecuteEnd, ExecuteStart, UnitToRegAlloc},
+        unit_base::{unit_base, ExecuteEnd, ExecuteStart},
         DynUnit, DynUnitWrapper, GlobalState, UnitKind, UnitMOp, UnitOutput, UnitResult,
-        UnitResultCompleted, UnitTrait,
+        UnitResultCompleted, UnitToRegAlloc, UnitTrait,
     },
 };
 use fayalite::{
@@ -255,6 +255,7 @@ pub fn alu_branch(config: &CpuConfig, unit_index: usize) {
         DynSize,
         DynSize,
         DynSize,
+        DynSize,
     > = m.output(config.unit_to_reg_alloc(
         AluBranchMOp[config.unit_out_reg_num()][config.p_reg_num_width()],
         (),
@@ -279,24 +280,21 @@ pub fn alu_branch(config: &CpuConfig, unit_index: usize) {
     #[hdl]
     if let HdlSome(execute_start) = ReadyValid::firing_data(unit_base.execute_start) {
         #[hdl]
-        let ExecuteStart::<_> {
-            mop,
-            pc,
-            src_values,
-        } = execute_start;
+        let ExecuteStart::<_, _> { insn, src_values } = execute_start;
         #[hdl]
-        match mop {
+        match insn.mop {
             AluBranchMOp::<_, _>::AddSub(mop) => connect(
                 unit_base.execute_end,
                 HdlSome(
                     #[hdl]
-                    ExecuteEnd::<_, _> {
+                    ExecuteEnd::<_, _, _> {
                         unit_output: #[hdl]
-                        UnitOutput::<_, _> {
+                        UnitOutput::<_, _, _> {
                             which: MOpTrait::dest_reg(mop),
+                            retire_queue_index: insn.retire_queue_index,
                             result: UnitResult[()].Completed(add_sub(
                                 mop,
-                                pc,
+                                insn.pc,
                                 global_state.flags_mode,
                                 src_values,
                             )),
@@ -308,13 +306,14 @@ pub fn alu_branch(config: &CpuConfig, unit_index: usize) {
                 unit_base.execute_end,
                 HdlSome(
                     #[hdl]
-                    ExecuteEnd::<_, _> {
+                    ExecuteEnd::<_, _, _> {
                         unit_output: #[hdl]
-                        UnitOutput::<_, _> {
+                        UnitOutput::<_, _, _> {
                             which: MOpTrait::dest_reg(mop),
+                            retire_queue_index: insn.retire_queue_index,
                             result: UnitResult[()].Completed(add_sub(
                                 mop,
-                                pc,
+                                insn.pc,
                                 global_state.flags_mode,
                                 src_values,
                             )),
@@ -326,10 +325,11 @@ pub fn alu_branch(config: &CpuConfig, unit_index: usize) {
                 unit_base.execute_end,
                 HdlSome(
                     #[hdl]
-                    ExecuteEnd::<_, _> {
+                    ExecuteEnd::<_, _, _> {
                         unit_output: #[hdl]
-                        UnitOutput::<_, _> {
+                        UnitOutput::<_, _, _> {
                             which: MOpTrait::dest_reg(mop),
+                            retire_queue_index: insn.retire_queue_index,
                             result: UnitResult[()].Completed(logical(
                                 mop,
                                 global_state.flags_mode,
@@ -393,7 +393,7 @@ impl UnitTrait for AluBranch {
     fn unit_to_reg_alloc(
         &self,
         this: Expr<Self::Type>,
-    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize>> {
+    ) -> Expr<UnitToRegAlloc<Self::MOp, Self::ExtraOut, DynSize, DynSize, DynSize, DynSize>> {
         this.unit_to_reg_alloc
     }
 
diff --git a/crates/cpu/src/unit/unit_base.rs b/crates/cpu/src/unit/unit_base.rs
index 9a3d0d8..3856e18 100644
--- a/crates/cpu/src/unit/unit_base.rs
+++ b/crates/cpu/src/unit/unit_base.rs
@@ -5,69 +5,31 @@ use crate::{
     config::CpuConfig,
     instruction::{MOpTrait, PRegNum, UnitNum, UnitOutRegNum, COMMON_MOP_SRC_LEN},
     register::PRegValue,
-    unit::{UnitCancelInput, UnitOutput, UnitOutputWrite},
-    util::tree_reduce::tree_reduce,
+    unit::{
+        RenamedInsnData, UnitCancelInput, UnitForwardingInfo, UnitOutput, UnitOutputWrite,
+        UnitToRegAlloc,
+    },
 };
 use fayalite::{
     memory::splat_mask,
     module::{memory_with_loc, wire_with_loc},
     prelude::*,
     ty::StaticType,
-    util::ready_valid::ReadyValid,
+    util::{prefix_sum::reduce, ready_valid::ReadyValid},
 };
-use std::marker::PhantomData;
 
+/// the data sent through `execute_start` by [`unit_base`]: an instruction along with
+/// the values read for its sources (`src_values`).
 #[hdl]
-pub struct UnitForwardingInfo<UnitNumWidth: Size, OutRegNumWidth: Size, UnitCount: Size> {
-    pub unit_output_writes: ArrayType<HdlOption<UnitOutputWrite<OutRegNumWidth>>, UnitCount>,
-    pub unit_reg_frees: ArrayType<HdlOption<UnitOutRegNum<OutRegNumWidth>>, UnitCount>,
-    pub _phantom: PhantomData<UnitNumWidth>,
-}
-
-#[hdl]
-pub struct UnitInput<MOp: Type> {
-    pub mop: MOp,
-    pub pc: UInt<64>,
-}
-
-#[hdl]
-pub struct UnitToRegAlloc<
-    MOp: Type,
-    ExtraOut: Type,
-    UnitNumWidth: Size,
-    OutRegNumWidth: Size,
-    UnitCount: Size,
-> {
-    #[hdl(flip)]
-    pub unit_forwarding_info: UnitForwardingInfo<UnitNumWidth, OutRegNumWidth, UnitCount>,
-    #[hdl(flip)]
-    pub input: ReadyValid<UnitInput<MOp>>,
-    #[hdl(flip)]
-    pub cancel_input: HdlOption<UnitCancelInput<OutRegNumWidth>>,
-    pub output: HdlOption<UnitOutput<OutRegNumWidth, ExtraOut>>,
-}
-
-impl<MOp: Type, ExtraOut: Type, UnitNumWidth: Size, OutRegNumWidth: Size, UnitCount: Size>
-    UnitToRegAlloc<MOp, ExtraOut, UnitNumWidth, OutRegNumWidth, UnitCount>
-{
-    pub fn mop_ty(self) -> MOp {
-        self.input.data.HdlSome.mop
-    }
-    pub fn extra_out_ty(self) -> ExtraOut {
-        self.output.HdlSome.extra_out_ty()
-    }
-}
-
-#[hdl]
-pub struct ExecuteStart<MOp: Type + MOpTrait<DestReg = UnitOutRegNum<DynSize>>> {
-    pub mop: MOp,
-    pub pc: UInt<64>,
+pub struct ExecuteStart<MOp: Type, RetireQueueIndexWidth: Size> {
+    pub insn: RenamedInsnData<MOp, RetireQueueIndexWidth>,
     pub src_values: Array<PRegValue, { COMMON_MOP_SRC_LEN }>,
 }
 
+/// the data received through `execute_end` by [`unit_base`]: the output of the unit.
 #[hdl]
-pub struct ExecuteEnd<OutRegNumWidth: Size, ExtraOut> {
-    pub unit_output: UnitOutput<OutRegNumWidth, ExtraOut>,
+pub struct ExecuteEnd<OutRegNumWidth: Size, RetireQueueIndexWidth: Size, ExtraOut> {
+    pub unit_output: UnitOutput<OutRegNumWidth, RetireQueueIndexWidth, ExtraOut>,
 }
 
 #[hdl]
@@ -148,10 +107,9 @@ impl InFlightOpState {
 }
 
 #[hdl]
-struct InFlightOp<MOp: Type> {
+struct InFlightOp<MOp: Type, RetireQueueIndexWidth: Size> {
     state: InFlightOpState,
-    mop: MOp,
-    pc: UInt<64>,
+    insn: RenamedInsnData<MOp, RetireQueueIndexWidth>,
     src_ready_flags: Array<Bool, { COMMON_MOP_SRC_LEN }>,
 }
 
@@ -166,7 +124,7 @@ impl<OpIndexWidth: Size> InFlightOpsSummary<OpIndexWidth> {
     fn new<MOp: Type>(
         op_index: usize,
         op_index_ty: UIntType<OpIndexWidth>,
-        in_flight_op: impl ToExpr<Type = HdlOption<InFlightOp<MOp>>>,
+        in_flight_op: impl ToExpr<Type = HdlOption<InFlightOp<MOp, DynSize>>>,
     ) -> Expr<Self> {
         let empty_op_index = wire_with_loc(
             &format!("empty_op_index_{op_index}"),
@@ -183,10 +141,9 @@ impl<OpIndexWidth: Size> InFlightOpsSummary<OpIndexWidth> {
         #[hdl]
         if let HdlSome(in_flight_op) = in_flight_op {
             #[hdl]
-            let InFlightOp::<_> {
+            let InFlightOp::<_, _> {
                 state,
-                mop: _,
-                pc: _,
+                insn: _,
                 src_ready_flags,
             } = in_flight_op;
             connect(ready_op_index, HdlOption[op_index_ty].HdlNone());
@@ -224,13 +181,13 @@ impl<OpIndexWidth: Size> InFlightOpsSummary<OpIndexWidth> {
 
 impl InFlightOpsSummary<DynSize> {
     fn summarize<MOp: Type, MaxInFlight: Size>(
-        in_flight_ops: impl ToExpr<Type = ArrayType<HdlOption<InFlightOp<MOp>>, MaxInFlight>>,
+        in_flight_ops: impl ToExpr<Type = ArrayType<HdlOption<InFlightOp<MOp, DynSize>>, MaxInFlight>>,
     ) -> Expr<Self> {
         let in_flight_ops = in_flight_ops.to_expr();
         let max_in_flight = Expr::ty(in_flight_ops).len();
         let index_range = 0..max_in_flight;
         let index_ty = UInt::range(index_range.clone());
-        tree_reduce(
+        reduce(
             index_range.map(|i| Self::new(i, index_ty, in_flight_ops[i])),
             Self::combine,
         )
@@ -251,18 +208,19 @@ pub fn unit_base<
     #[hdl]
     let cd: ClockDomain = m.input();
     #[hdl]
-    let unit_to_reg_alloc: UnitToRegAlloc<MOp, ExtraOut, DynSize, DynSize, DynSize> =
+    let unit_to_reg_alloc: UnitToRegAlloc<MOp, ExtraOut, DynSize, DynSize, DynSize, DynSize> =
         m.output(config.unit_to_reg_alloc(mop_ty, extra_out_ty));
     #[hdl]
-    let execute_start: ReadyValid<ExecuteStart<MOp>> = m.output(ReadyValid[ExecuteStart[mop_ty]]);
+    let execute_start: ReadyValid<ExecuteStart<MOp, DynSize>> =
+        m.output(ReadyValid[config.execute_start(mop_ty)]);
     #[hdl]
-    let execute_end: HdlOption<ExecuteEnd<DynSize, ExtraOut>> =
-        m.input(HdlOption[ExecuteEnd[config.out_reg_num_width][extra_out_ty]]);
+    let execute_end: HdlOption<ExecuteEnd<DynSize, DynSize, ExtraOut>> =
+        m.input(HdlOption[config.execute_end(extra_out_ty)]);
 
     connect(execute_start.data, Expr::ty(execute_start).data.HdlNone());
 
     let max_in_flight = config.unit_max_in_flight(unit_index).get();
-    let in_flight_op_ty = InFlightOp[mop_ty];
+    let in_flight_op_ty = InFlightOp[mop_ty][config.retire_queue_index_width()];
     #[hdl]
     let in_flight_ops = reg_builder()
         .clock_domain(cd)
@@ -399,9 +357,8 @@ pub fn unit_base<
                 execute_start.data,
                 HdlSome(
                     #[hdl]
-                    ExecuteStart::<_> {
-                        mop: in_flight_op.mop,
-                        pc: in_flight_op.pc,
+                    ExecuteStart::<_, _> {
+                        insn: in_flight_op.insn,
                         src_values: read_src_values,
                     },
                 ),
@@ -420,7 +377,11 @@ pub fn unit_base<
     #[hdl]
     if let HdlSome(input) = ReadyValid::firing_data(unit_to_reg_alloc.input) {
         #[hdl]
-        let UnitInput::<_> { mop, pc } = input;
+        let RenamedInsnData::<_, _> {
+            retire_queue_index,
+            pc,
+            mop,
+        } = input;
         #[hdl]
         let input_mop_src_regs = wire(mop_ty.src_regs_ty());
         connect(
@@ -436,20 +397,24 @@ pub fn unit_base<
         connect(src_ready_flags, input_src_regs_valid);
         connect(input_src_regs, input_mop_src_regs);
         #[hdl]
-        if unit_to_reg_alloc.cancel_input.cmp_ne(HdlSome(
-            #[hdl]
-            UnitCancelInput::<_> {
-                which: MOp::dest_reg(mop),
-            },
-        )) {
+        let input_is_canceled = wire();
+        connect(input_is_canceled, false);
+        #[hdl]
+        if let HdlSome(cancel_input) = unit_to_reg_alloc.cancel_input {
+            connect(
+                input_is_canceled,
+                UnitCancelInput::is_canceled(cancel_input, retire_queue_index),
+            );
+        }
+        #[hdl]
+        if !input_is_canceled {
             connect(
                 input_in_flight_op,
                 HdlSome(
                     #[hdl]
-                    InFlightOp::<_> {
+                    InFlightOp::<_, _> {
                         state: InFlightOpState.Ready(),
-                        mop,
-                        pc,
+                        insn: input,
                         src_ready_flags,
                     },
                 ),
@@ -483,13 +448,12 @@ pub fn unit_base<
         #[hdl]
         if let HdlSome(in_flight_op) = in_flight_ops[in_flight_op_index] {
             #[hdl]
-            let InFlightOp::<_> {
+            let InFlightOp::<_, _> {
                 state,
-                mop,
-                pc,
+                insn,
                 src_ready_flags,
             } = in_flight_op;
-            let which = MOp::dest_reg(mop);
+            let which = MOp::dest_reg(insn.mop);
             let src_regs = wire_with_loc(
                 &format!("in_flight_op_src_regs_{in_flight_op_index}"),
                 SourceLocation::caller(),
@@ -499,7 +463,7 @@ pub fn unit_base<
                 src_regs,
                 repeat(config.p_reg_num().const_zero().cast_to_bits(), ConstUsize),
             );
-            MOp::connect_src_regs(mop, src_regs);
+            MOp::connect_src_regs(insn.mop, src_regs);
 
             #[hdl]
             if in_flight_ops_summary.ready_op_index.cmp_eq(HdlSome(
@@ -537,18 +501,19 @@ pub fn unit_base<
                 }
             }
 
-            connect(
-                in_flight_op_canceling[in_flight_op_index],
-                unit_to_reg_alloc.cancel_input.cmp_eq(HdlSome(
-                    #[hdl]
-                    UnitCancelInput::<_> { which },
-                )),
-            );
+            connect(in_flight_op_canceling[in_flight_op_index], false);
+            #[hdl]
+            if let HdlSome(cancel_input) = unit_to_reg_alloc.cancel_input {
+                connect(
+                    in_flight_op_canceling[in_flight_op_index],
+                    UnitCancelInput::is_canceled(cancel_input, insn.retire_queue_index),
+                );
+            }
 
             #[hdl]
             if let HdlSome(execute_end) = execute_end {
                 #[hdl]
-                let ExecuteEnd::<_, _> { unit_output } = execute_end;
+                let ExecuteEnd::<_, _, _> { unit_output } = execute_end;
                 #[hdl]
                 if which.cmp_eq(unit_output.which) {
                     connect(in_flight_op_execute_ending[in_flight_op_index], true);
@@ -567,7 +532,7 @@ pub fn unit_base<
             #[hdl]
             if let HdlSome(execute_start) = ReadyValid::firing_data(execute_start) {
                 #[hdl]
-                if which.cmp_eq(MOp::dest_reg(execute_start.mop)) {
+                if which.cmp_eq(MOp::dest_reg(execute_start.insn.mop)) {
                     connect(in_flight_op_execute_starting[in_flight_op_index], true);
                 }
             }
@@ -594,10 +559,9 @@ pub fn unit_base<
                     in_flight_ops[in_flight_op_index],
                     HdlSome(
                         #[hdl]
-                        InFlightOp::<_> {
+                        InFlightOp::<_, _> {
                             state,
-                            mop,
-                            pc,
+                            insn,
                             src_ready_flags: in_flight_op_next_src_ready_flags[in_flight_op_index],
                         },
                     ),
diff --git a/crates/cpu/src/util.rs b/crates/cpu/src/util.rs
index 0b53274..f57003a 100644
--- a/crates/cpu/src/util.rs
+++ b/crates/cpu/src/util.rs
@@ -2,7 +2,6 @@
 // See Notices.txt for copyright information
 
 pub mod array_vec;
-pub mod tree_reduce;
 
 pub(crate) const fn range_u32_len(range: &std::ops::Range<u32>) -> usize {
     let retval = range.end.saturating_sub(range.start);
@@ -25,3 +24,18 @@ pub(crate) const fn range_u32_nth_or_panic(range: &std::ops::Range<u32>, index:
         panic!("index out of range")
     }
 }
+
+/// returns the range of values contained in both `a` and `b`,
+/// or `None` if there are no such values.
+pub(crate) const fn range_intersection(
+    a: &std::ops::Range<u32>,
+    b: &std::ops::Range<u32>,
+) -> Option<std::ops::Range<u32>> {
+    // the overlap starts at the larger start and stops at the smaller end
+    let lo = if b.start < a.start { a.start } else { b.start };
+    let hi = if b.end > a.end { a.end } else { b.end };
+    if lo >= hi {
+        return None;
+    }
+    Some(lo..hi)
+}
diff --git a/crates/cpu/src/util/array_vec.rs b/crates/cpu/src/util/array_vec.rs
index 761f53f..be256b4 100644
--- a/crates/cpu/src/util/array_vec.rs
+++ b/crates/cpu/src/util/array_vec.rs
@@ -2,8 +2,11 @@
 // See Notices.txt for copyright information
 
 use fayalite::{
-    expr::ops::{ExprCastTo, ExprIndex, ExprPartialEq, ExprPartialOrd},
-    int::SizeType,
+    expr::{
+        ops::{ExprCastTo, ExprIndex, ExprPartialEq, ExprPartialOrd},
+        ToLiteralBits,
+    },
+    int::{IntType, SizeType},
     intern::{Intern, Interned},
     prelude::*,
     ty::{MatchVariantWithoutScope, StaticType, TypeProperties},
@@ -249,6 +252,34 @@ impl<T: Type, N: Size> ArrayVec<T, N> {
         });
         array_vec_as_array_of_options
     }
+    /// returns `HdlSome` of the element at `index` if `index` is less than the length
+    /// of `this`, otherwise returns `HdlNone`.
+    ///
+    /// an `index` that is known while building the circuit to be `>=` the capacity
+    /// gives `HdlNone` without indexing `elements`.
+    #[hdl]
+    pub fn get<Idx: IntType<Dyn = UInt>>(
+        this: impl ToExpr<Type = Self>,
+        index: impl ToExpr<Type = Idx>,
+    ) -> Expr<HdlOption<T>> {
+        let this = this.to_expr();
+        let index = Expr::as_dyn_int(index.to_expr());
+        let never_in_bounds = index.cmp_ge(Expr::ty(this).capacity());
+        if let Ok(never_in_bounds) = never_in_bounds.to_literal_bits() {
+            if never_in_bounds[0] {
+                // avoid error from out-of-bounds constant index
+                return HdlOption[Expr::ty(this).element()].HdlNone();
+            }
+        }
+        #[hdl]
+        let array_vec_get = wire(HdlOption[Expr::ty(this).element()]);
+        connect(array_vec_get, Expr::ty(array_vec_get).HdlNone());
+        #[hdl]
+        if index.cmp_lt(Length::as_uint(Self::len(this))) {
+            connect(array_vec_get, HdlSome(this.elements[index]));
+        }
+        array_vec_get
+    }
 }
 
 impl<T: Type, N: Size, Idx, IdxWidth: Size> ExprIndex<Idx> for ArrayVec<T, N>
@@ -263,3 +289,42 @@ where
         <ArrayType<T, N> as ExprIndex<Idx>>::expr_index(&this.elements, index)
     }
 }
+
+/// a bundle of an [`ArrayVec`] of `data` and a flipped `ready` count.
+///
+/// see [`Self::firing_len`] for how `data.len` and `ready` are combined.
+#[hdl]
+pub struct ReadyValidArray<T: Type, N: Size> {
+    pub data: ArrayVec<T, N>,
+    #[hdl(flip)]
+    pub ready: Length<N>,
+}
+
+impl<T: Type, N: Size> ReadyValidArray<T, N> {
+    /// returns the smaller of `this.data.len` and `this.ready`.
+    ///
+    /// panics if the type of `ready` isn't the length type of `data`.
+    #[hdl]
+    pub fn firing_len(this: impl ToExpr<Type = Self>) -> Expr<Length<N>> {
+        let this = this.to_expr();
+        assert_eq!(Expr::ty(this).data.len_ty(), Expr::ty(this).ready);
+        #[hdl]
+        let firing_len = wire(Expr::ty(this).data.len);
+        connect(firing_len, this.data.len);
+        #[hdl]
+        if this.data.len.cmp_gt(this.ready) {
+            connect(firing_len, this.ready);
+        }
+        firing_len
+    }
+    /// returns `this.data` with its `len` replaced by [`Self::firing_len`].
+    #[hdl]
+    pub fn firing_data(this: impl ToExpr<Type = Self>) -> Expr<ArrayVec<T, N>> {
+        let this = this.to_expr();
+        #[hdl]
+        let firing_data = wire(Expr::ty(this).data);
+        connect(firing_data, this.data);
+        connect(firing_data.len, Self::firing_len(this));
+        firing_data
+    }
+}
diff --git a/crates/cpu/src/util/tree_reduce.rs b/crates/cpu/src/util/tree_reduce.rs
deleted file mode 100644
index c8d12f7..0000000
--- a/crates/cpu/src/util/tree_reduce.rs
+++ /dev/null
@@ -1,152 +0,0 @@
-// SPDX-License-Identifier: LGPL-3.0-or-later
-// See Notices.txt for copyright information
-#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
-pub enum TreeReduceOp {
-    Input,
-    Reduce,
-}
-
-#[derive(Copy, Clone, Debug)]
-struct Entry {
-    start: usize,
-    depth: u32,
-}
-
-#[derive(Clone, Debug)]
-pub struct TreeReduceOps {
-    len: usize,
-    stack: Vec<Entry>,
-}
-
-impl TreeReduceOps {
-    pub fn new(len: usize) -> Self {
-        TreeReduceOps {
-            len,
-            stack: Vec::new(),
-        }
-    }
-}
-
-impl Iterator for TreeReduceOps {
-    type Item = TreeReduceOp;
-    fn next(&mut self) -> Option<Self::Item> {
-        match *self.stack {
-            [] if self.len != 0 => {
-                self.stack.push(Entry { start: 0, depth: 0 });
-                Some(TreeReduceOp::Input)
-            }
-            [.., ref mut second_last, last] if second_last.depth == last.depth => {
-                second_last.depth += 1;
-                self.stack.pop();
-                Some(TreeReduceOp::Reduce)
-            }
-            [.., last] if self.len - last.start > 1 << last.depth => {
-                let start = last.start + (1 << last.depth);
-                self.stack.push(Entry { start, depth: 0 });
-                Some(TreeReduceOp::Input)
-            }
-            [.., ref mut second_last, _] => {
-                second_last.depth += 1;
-                self.stack.pop();
-                Some(TreeReduceOp::Reduce)
-            }
-            _ => None,
-        }
-    }
-}
-
-#[track_caller]
-pub fn tree_reduce_with_state<S, I, R>(
-    iter: impl IntoIterator<IntoIter: ExactSizeIterator, Item = I>,
-    state: &mut S,
-    mut input: impl FnMut(&mut S, I) -> R,
-    mut reduce: impl FnMut(&mut S, R, R) -> R,
-) -> Option<R> {
-    let mut stack = Vec::new();
-    let mut iter = iter.into_iter();
-    for op in TreeReduceOps::new(iter.len()) {
-        match op {
-            TreeReduceOp::Input => stack.push(input(
-                state,
-                iter.next().expect("inconsistent iterator len() and next()"),
-            )),
-            TreeReduceOp::Reduce => {
-                let Some(r) = stack.pop() else {
-                    unreachable!();
-                };
-                let Some(l) = stack.pop() else {
-                    unreachable!();
-                };
-                stack.push(reduce(state, l, r));
-            }
-        }
-    }
-    stack.pop()
-}
-
-pub fn tree_reduce<T>(
-    iter: impl IntoIterator<Item = T, IntoIter: ExactSizeIterator>,
-    mut reduce: impl FnMut(T, T) -> T,
-) -> Option<T> {
-    tree_reduce_with_state(iter, &mut (), |_, v| v, move |_, l, r| reduce(l, r))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::ops::Range;
-
-    fn recursive_tree_reduce(range: Range<usize>, ops: &mut Vec<TreeReduceOp>) {
-        if range.len() == 1 {
-            ops.push(TreeReduceOp::Input);
-            return;
-        }
-        if range.is_empty() {
-            return;
-        }
-        let pow2_len = range.len().next_power_of_two();
-        let split = range.start + pow2_len / 2;
-        recursive_tree_reduce(range.start..split, ops);
-        recursive_tree_reduce(split..range.end, ops);
-        ops.push(TreeReduceOp::Reduce);
-    }
-
-    #[test]
-    fn test_tree_reduce() {
-        const EXPECTED: &'static [&'static [TreeReduceOp]] = {
-            use TreeReduceOp::{Input as I, Reduce as R};
-            &[
-                &[],
-                &[I],
-                &[I, I, R],
-                &[I, I, R, I, R],
-                &[I, I, R, I, I, R, R],
-                &[I, I, R, I, I, R, R, I, R],
-                &[I, I, R, I, I, R, R, I, I, R, R],
-                &[I, I, R, I, I, R, R, I, I, R, I, R, R],
-                &[I, I, R, I, I, R, R, I, I, R, I, I, R, R, R],
-            ]
-        };
-        for len in 0..64 {
-            let mut expected = vec![];
-            recursive_tree_reduce(0..len, &mut expected);
-            if let Some(&expected2) = EXPECTED.get(len) {
-                assert_eq!(*expected, *expected2, "len={len}");
-            }
-            assert_eq!(
-                TreeReduceOps::new(len).collect::<Vec<_>>(),
-                expected,
-                "len={len}"
-            );
-            let seq: Vec<_> = (0..len).collect();
-            assert_eq!(
-                seq,
-                tree_reduce(seq.iter().map(|&v| vec![v]), |mut l, r| {
-                    l.extend_from_slice(&r);
-                    l
-                })
-                .unwrap_or_default()
-            );
-        }
-    }
-}