ADR-0033 D6: address-based PC selection at HBM CTRL

Replaces global round-robin with deterministic address-derived PC striping: pc_shift = log2(burst_bytes) pc_mask = num_pcs - 1 pc = (flit.address >> pc_shift) & pc_mask Each Transaction carries base_address (HBM byte offset of the first chunk); each Flit derives its own address as base + i*flit_bytes. HBM CTRL routes flits to PCs via this formula, replacing the arrival-order RR pointer. Also splits the is_last wait into an asynchronous _finalize_txn process so the worker isn't blocked on PC commit, exposing true PC parallelism for disjoint addresses. phyaddr.py documents the canonical bit layout (bits [10:8] for the default burst=256, num_pcs=8 case). ADR-0033 D6 records the derivation and the workload scenarios where address-striping matters (strided streams, offset-disjoint parallel transfers). Adds tests/test_hbm_address_based_pc.py: canonical bit mapping, strided 8-way load distribution, same-address PC-0 serialization, PC-aligned 2KB pair collision, dynamic pc_shift from burst_bytes, and power-of-2 attr validation. Integration tests inspect _pc_avail ledger directly: at default config UCIe's 8 ns per-txn overhead exactly matches chunk_time, masking PC contention at the makespan level even though the ledger correctly distinguishes the cases. Full suite: 631 passed, 1 skipped. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-15 00:18:46 -07:00
parent a44f832be5
commit aaa1cbfaf6
6 changed files with 292 additions and 27 deletions
@@ -45,7 +45,10 @@ class HbmCtrlComponent(ComponentBase):
        self._switch_penalty_ns: float = 0.0
        self._pc_avail: list[float] = []
        self._pc_last_dir: list[str | None] = []
-        self._next_pc: int = 0
+        # Address-based PC selection (ADR-0033 D6):
+        #   pc = (address >> _pc_shift) & _pc_mask
+        self._pc_shift: int = 0
+        self._pc_mask: int = 0
        # Per-txn flit accumulation state (ADR-0033 Phase 2c-3).
        self._txn_state: dict[int, dict[str, Any]] = {}

@@ -55,11 +58,19 @@ class HbmCtrlComponent(ComponentBase):
        self._pc_bw_gbs = float(attrs.get("pc_bw_gbs", 32.0))
        self._burst_bytes = int(attrs.get("burst_bytes", 256))
        self._switch_penalty_ns = float(attrs.get("switch_penalty_ns", 0.0))
+        if self._num_pcs <= 0 or (self._num_pcs & (self._num_pcs - 1)) != 0:
+            raise ValueError(f"num_pcs must be a positive power of 2, got {self._num_pcs}")
+        if self._burst_bytes <= 0 or (self._burst_bytes & (self._burst_bytes - 1)) != 0:
+            raise ValueError(f"burst_bytes must be a positive power of 2, got {self._burst_bytes}")
+        self._pc_shift = self._burst_bytes.bit_length() - 1
+        self._pc_mask = self._num_pcs - 1
        self._pc_avail = [0.0] * self._num_pcs
        self._pc_last_dir = [None] * self._num_pcs
-        self._next_pc = 0
        super().start(env)

+    def _pc_for_address(self, address: int) -> int:
+        return (int(address) >> self._pc_shift) & self._pc_mask
+
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
@@ -88,9 +99,10 @@ class HbmCtrlComponent(ComponentBase):
                env.process(self._handle_txn(env, msg))

    def _handle_flit(self, env: simpy.Environment, flit: Flit) -> Generator:
-        """Per-flit PC commit. On first flit of a txn, claim PC range and
-        apply overhead. On ``is_last``, wait for last PC commit to
-        finish, then send the response."""
+        """Per-flit PC commit. On first flit of a txn, apply overhead. PC is
+        derived from the flit's address (ADR-0033 D6 address-based striping).
+        On ``is_last``, wait for last PC commit to finish, then send the
+        response."""
        txn = flit.txn
        tid = id(txn)
        chunk_time = (
@@ -100,19 +112,12 @@ class HbmCtrlComponent(ComponentBase):

        if tid not in self._txn_state:
            yield from self.run(env, txn.nbytes)
-            work_bytes = txn.nbytes if txn.nbytes > 0 else int(
-                getattr(txn.request, "nbytes", 0) or 0
-            )
-            n_flits = max(1, ceil(work_bytes / self._burst_bytes)) if work_bytes > 0 else 1
-            pc_start = self._next_pc
-            self._next_pc = (self._next_pc + n_flits) % self._num_pcs
            self._txn_state[tid] = {
-                "pc_start": pc_start,
                "last_finish": env.now,
            }

        state = self._txn_state[tid]
-        pc = (state["pc_start"] + flit.flit_index) % self._num_pcs
+        pc = self._pc_for_address(flit.address)
        switch_cost = 0.0
        if self._pc_last_dir[pc] is not None and self._pc_last_dir[pc] != new_dir:
            switch_cost = self._switch_penalty_ns
@@ -124,11 +129,22 @@ class HbmCtrlComponent(ComponentBase):
            state["last_finish"] = finish

        if flit.is_last:
-            wait = state["last_finish"] - env.now
-            if wait > 0:
-                yield env.timeout(wait)
            del self._txn_state[tid]
-            yield from self._send_response(env, txn)
+            # Finalize asynchronously so the worker can pick up the next
+            # flit while this txn's last PC commit drains. Without this
+            # split, the worker's ``yield env.timeout(wait)`` would
+            # serialize concurrent single-flit txns at chunk_time even
+            # when they hit distinct PCs, hiding address-based PC
+            # parallelism (ADR-0033 D6).
+            env.process(self._finalize_txn(env, txn, state["last_finish"]))
+
+    def _finalize_txn(
+        self, env: simpy.Environment, txn: Any, last_finish: float,
+    ) -> Generator:
+        wait = last_finish - env.now
+        if wait > 0:
+            yield env.timeout(wait)
+        yield from self._send_response(env, txn)

    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        is_write = self._is_write(txn)
@@ -146,11 +162,12 @@ class HbmCtrlComponent(ComponentBase):

        yield from self.run(env, txn.nbytes)

+        base_addr = int(getattr(txn, "base_address", 0))
        last_finish = env.now
        for i in range(n_chunks):
            if chunk_interval > 0:
                yield env.timeout(chunk_interval)
-            pc = (self._next_pc + i) % self._num_pcs
+            pc = self._pc_for_address(base_addr + i * self._burst_bytes)
            switch_cost = 0.0
            if self._pc_last_dir[pc] is not None and self._pc_last_dir[pc] != new_dir:
                switch_cost = self._switch_penalty_ns
@@ -160,8 +177,6 @@ class HbmCtrlComponent(ComponentBase):
            self._pc_last_dir[pc] = new_dir
            if finish > last_finish:
                last_finish = finish
-        if n_chunks > 0:
-            self._next_pc = (self._next_pc + n_chunks) % self._num_pcs

        wait = last_finish - env.now
        if wait > 0: