Add reverse path response latency for PE DMA and PE_CPU→M_CPU
Model fabric response hop latency for PE-internal operations:
- HBM_CTRL sends the PeDmaMsg response on the reverse path instead of a direct done signal
- PE_CPU sends a ResponseMsg via NOC→M_CPU on kernel completion
- Add NOC→PE_DMA and PE_CPU→NOC edges in the topology builder
- Make HBM BW test assertions dynamic, based on topology efficiency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -79,6 +79,14 @@ class HbmCtrlComponent(ComponentBase):
|
||||
from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
|
||||
|
||||
if isinstance(txn.request, PeDmaMsg):
|
||||
reverse_path = list(reversed(txn.path))
|
||||
if len(reverse_path) >= 2:
|
||||
resp_txn = Transaction(
|
||||
request=txn.request, path=reverse_path, step=0,
|
||||
nbytes=0, done=txn.done, is_response=True,
|
||||
)
|
||||
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
|
||||
return
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
|
||||
@@ -158,7 +158,7 @@ class MCpuComponent(ComponentBase):
|
||||
"""Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).
|
||||
|
||||
Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
|
||||
Waits for sub_txn.done directly — no ResponseMsg needed for PE direction.
|
||||
PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
|
||||
Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
|
||||
"""
|
||||
request = txn.request
|
||||
@@ -170,9 +170,9 @@ class MCpuComponent(ComponentBase):
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
# Fan out to each PE_CPU and collect done events
|
||||
sub_dones: list[simpy.Event] = []
|
||||
# Fan out to each PE_CPU, using response-based aggregation
|
||||
sub_txns: list[Transaction] = []
|
||||
n_dispatched = 0
|
||||
for pe_id in pe_ids:
|
||||
pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
|
||||
try:
|
||||
@@ -181,22 +181,26 @@ class MCpuComponent(ComponentBase):
|
||||
continue
|
||||
if len(path) < 2:
|
||||
continue
|
||||
sub_done = env.event()
|
||||
sub_txn = Transaction(
|
||||
request=request, path=path, step=0,
|
||||
nbytes=0, done=sub_done,
|
||||
nbytes=0, done=env.event(),
|
||||
)
|
||||
yield self.out_ports[path[1]].put(sub_txn.advance())
|
||||
sub_dones.append(sub_done)
|
||||
sub_txns.append(sub_txn)
|
||||
n_dispatched += 1
|
||||
|
||||
if not sub_dones:
|
||||
if n_dispatched == 0:
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
# Wait for all PE_CPUs to complete
|
||||
for sd in sub_dones:
|
||||
yield sd
|
||||
# Setup response aggregation (PE_CPU ResponseMsg arrives via _collect_response)
|
||||
all_done = env.event()
|
||||
self._pending[request.request_id] = (n_dispatched, 0, all_done)
|
||||
self._parent_txns[request.request_id] = txn
|
||||
|
||||
# Wait for all PE_CPU responses via NOC
|
||||
yield all_done
|
||||
del self._parent_txns[request.request_id]
|
||||
|
||||
# Aggregate PE-internal metrics (max across PEs)
|
||||
pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
|
||||
|
||||
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any
|
||||
import simpy
|
||||
|
||||
from kernbench.components.base import ComponentBase
|
||||
from kernbench.sim_engine.transaction import Transaction
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from kernbench.components.context import ComponentContext
|
||||
@@ -150,5 +151,21 @@ class PeCpuComponent(ComponentBase):
|
||||
txn.result_data["dma_ns"] = total_dma_ns
|
||||
txn.result_data["compute_ns"] = total_compute_ns
|
||||
|
||||
# Signal original Transaction done
|
||||
txn.done.succeed()
|
||||
# Send ResponseMsg on reverse path (PE_CPU → NOC → M_CPU)
|
||||
reverse_path = list(reversed(txn.path))
|
||||
if len(reverse_path) >= 2:
|
||||
from kernbench.runtime_api.kernel import ResponseMsg
|
||||
|
||||
resp_msg = ResponseMsg(
|
||||
correlation_id=request.correlation_id,
|
||||
request_id=request.request_id,
|
||||
src_cube=self._cube_idx, src_pe=self._pe_idx,
|
||||
success=True,
|
||||
)
|
||||
resp_txn = Transaction(
|
||||
request=resp_msg, path=reverse_path, step=0,
|
||||
nbytes=0, done=env.event(), is_response=True,
|
||||
)
|
||||
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
|
||||
else:
|
||||
txn.done.succeed()
|
||||
|
||||
@@ -94,6 +94,15 @@ class PeDmaComponent(PeEngineBase):
|
||||
|
||||
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||
"""Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
|
||||
# Response transactions bypass DMA channel (no outbound resource needed)
|
||||
if getattr(txn, "is_response", False):
|
||||
next_hop = txn.next_hop
|
||||
if next_hop:
|
||||
yield self.out_ports[next_hop].put(txn.advance())
|
||||
else:
|
||||
txn.done.succeed()
|
||||
return
|
||||
|
||||
dma_res = self._select_channel(txn)
|
||||
with dma_res.request() as req:
|
||||
yield req
|
||||
|
||||
Reference in New Issue
Block a user