Add reverse path response latency for PE DMA and PE_CPU→M_CPU

Model fabric response hop latency for PE-internal operations:

- HBM_CTRL sends PeDmaMsg response on reverse path instead of direct done signal
- PE_CPU sends ResponseMsg via NOC→M_CPU on kernel completion
- Add NOC→PE_DMA and PE_CPU→NOC edges in topology builder
- Make HBM BW test assertions dynamic based on topology efficiency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 15:40:56 -07:00
parent 8b5afef5eb
commit 62fb01ae18
8 changed files with 88 additions and 24 deletions
@@ -79,6 +79,14 @@ class HbmCtrlComponent(ComponentBase):
        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg

        if isinstance(txn.request, PeDmaMsg):
+            reverse_path = list(reversed(txn.path))
+            if len(reverse_path) >= 2:
+                resp_txn = Transaction(
+                    request=txn.request, path=reverse_path, step=0,
+                    nbytes=0, done=txn.done, is_response=True,
+                )
+                yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                return
            txn.done.succeed()
            return

@@ -158,7 +158,7 @@ class MCpuComponent(ComponentBase):
        """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).

        Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
-        Waits for sub_txn.done directly — no ResponseMsg needed for PE direction.
+        PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
        Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
        """
        request = txn.request
@@ -170,9 +170,9 @@ class MCpuComponent(ComponentBase):
            txn.done.succeed()
            return

-        # Fan out to each PE_CPU and collect done events
-        sub_dones: list[simpy.Event] = []
+        # Fan out to each PE_CPU, using response-based aggregation
        sub_txns: list[Transaction] = []
+        n_dispatched = 0
        for pe_id in pe_ids:
            pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
            try:
@@ -181,22 +181,26 @@ class MCpuComponent(ComponentBase):
                continue
            if len(path) < 2:
                continue
-            sub_done = env.event()
            sub_txn = Transaction(
                request=request, path=path, step=0,
-                nbytes=0, done=sub_done,
+                nbytes=0, done=env.event(),
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())
-            sub_dones.append(sub_done)
            sub_txns.append(sub_txn)
+            n_dispatched += 1

-        if not sub_dones:
+        if n_dispatched == 0:
            txn.done.succeed()
            return

-        # Wait for all PE_CPUs to complete
-        for sd in sub_dones:
-            yield sd
+        # Setup response aggregation (PE_CPU ResponseMsg arrives via _collect_response)
+        all_done = env.event()
+        self._pending[request.request_id] = (n_dispatched, 0, all_done)
+        self._parent_txns[request.request_id] = txn
+
+        # Wait for all PE_CPU responses via NOC
+        yield all_done
+        del self._parent_txns[request.request_id]

        # Aggregate PE-internal metrics (max across PEs)
        pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any
 import simpy

 from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction

 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
@@ -150,5 +151,21 @@ class PeCpuComponent(ComponentBase):
        txn.result_data["dma_ns"] = total_dma_ns
        txn.result_data["compute_ns"] = total_compute_ns

-        # Signal original Transaction done
-        txn.done.succeed()
+        # Send ResponseMsg on reverse path (PE_CPU → NOC → M_CPU)
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) >= 2:
+            from kernbench.runtime_api.kernel import ResponseMsg
+
+            resp_msg = ResponseMsg(
+                correlation_id=request.correlation_id,
+                request_id=request.request_id,
+                src_cube=self._cube_idx, src_pe=self._pe_idx,
+                success=True,
+            )
+            resp_txn = Transaction(
+                request=resp_msg, path=reverse_path, step=0,
+                nbytes=0, done=env.event(), is_response=True,
+            )
+            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+        else:
+            txn.done.succeed()
@@ -94,6 +94,15 @@ class PeDmaComponent(PeEngineBase):

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
+        # Response transactions bypass DMA channel (no outbound resource needed)
+        if getattr(txn, "is_response", False):
+            next_hop = txn.next_hop
+            if next_hop:
+                yield self.out_ports[next_hop].put(txn.advance())
+            else:
+                txn.done.succeed()
+            return
+
        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req