Add reverse path response latency for PE DMA and PE_CPU→M_CPU

Model fabric response hop latency for PE-internal operations:
- HBM_CTRL sends the PeDmaMsg response along the reverse path instead of signalling done directly
- PE_CPU sends ResponseMsg via NOC→M_CPU on kernel completion
- Add NOC→PE_DMA and PE_CPU→NOC edges in topology builder
- Make HBM BW test assertions dynamic based on topology efficiency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-20 15:40:56 -07:00
parent 8b5afef5eb
commit 62fb01ae18
8 changed files with 88 additions and 24 deletions
@@ -79,6 +79,14 @@ class HbmCtrlComponent(ComponentBase):
from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
if isinstance(txn.request, PeDmaMsg):
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2:
resp_txn = Transaction(
request=txn.request, path=reverse_path, step=0,
nbytes=0, done=txn.done, is_response=True,
)
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
return
txn.done.succeed()
return
+14 -10
View File
@@ -158,7 +158,7 @@ class MCpuComponent(ComponentBase):
"""Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).
Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
Waits for sub_txn.done directly — no ResponseMsg needed for PE direction.
PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
"""
request = txn.request
@@ -170,9 +170,9 @@ class MCpuComponent(ComponentBase):
txn.done.succeed()
return
# Fan out to each PE_CPU and collect done events
sub_dones: list[simpy.Event] = []
# Fan out to each PE_CPU, using response-based aggregation
sub_txns: list[Transaction] = []
n_dispatched = 0
for pe_id in pe_ids:
pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
try:
@@ -181,22 +181,26 @@ class MCpuComponent(ComponentBase):
continue
if len(path) < 2:
continue
sub_done = env.event()
sub_txn = Transaction(
request=request, path=path, step=0,
nbytes=0, done=sub_done,
nbytes=0, done=env.event(),
)
yield self.out_ports[path[1]].put(sub_txn.advance())
sub_dones.append(sub_done)
sub_txns.append(sub_txn)
n_dispatched += 1
if not sub_dones:
if n_dispatched == 0:
txn.done.succeed()
return
# Wait for all PE_CPUs to complete
for sd in sub_dones:
yield sd
# Setup response aggregation (PE_CPU ResponseMsg arrives via _collect_response)
all_done = env.event()
self._pending[request.request_id] = (n_dispatched, 0, all_done)
self._parent_txns[request.request_id] = txn
# Wait for all PE_CPU responses via NOC
yield all_done
del self._parent_txns[request.request_id]
# Aggregate PE-internal metrics (max across PEs)
pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
+19 -2
View File
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
@@ -150,5 +151,21 @@ class PeCpuComponent(ComponentBase):
txn.result_data["dma_ns"] = total_dma_ns
txn.result_data["compute_ns"] = total_compute_ns
# Signal original Transaction done
txn.done.succeed()
# Send ResponseMsg on reverse path (PE_CPU → NOC → M_CPU)
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2:
from kernbench.runtime_api.kernel import ResponseMsg
resp_msg = ResponseMsg(
correlation_id=request.correlation_id,
request_id=request.request_id,
src_cube=self._cube_idx, src_pe=self._pe_idx,
success=True,
)
resp_txn = Transaction(
request=resp_msg, path=reverse_path, step=0,
nbytes=0, done=env.event(), is_response=True,
)
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
else:
txn.done.succeed()
+9
View File
@@ -94,6 +94,15 @@ class PeDmaComponent(PeEngineBase):
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
"""Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
# Response transactions bypass DMA channel (no outbound resource needed)
if getattr(txn, "is_response", False):
next_hop = txn.next_hop
if next_hop:
yield self.out_ports[next_hop].put(txn.advance())
else:
txn.done.succeed()
return
dma_res = self._select_channel(txn)
with dma_res.request() as req:
yield req