Files
kernbench2/src/kernbench/components/impls/hbm_ctrl.py
T
ywkang 62fb01ae18 Add reverse path response latency for PE DMA and PE_CPU→M_CPU
Model fabric response hop latency for PE-internal operations:
- HBM_CTRL sends PeDmaMsg response on reverse path instead of direct done signal
- PE_CPU sends ResponseMsg via NOC→M_CPU on kernel completion
- Add NOC→PE_DMA and PE_CPU→NOC edges in topology builder
- Make HBM BW test assertions dynamic based on topology efficiency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 15:40:56 -07:00

130 lines
5.2 KiB
Python

from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class HbmCtrlComponent(ComponentBase):
"""HBM controller: terminal component that models HBM access latency.
Dual-channel model: separate read and write resources (each capacity=1)
allowing concurrent read/write like PE_DMA. Multiple reads or multiple
writes still serialize within their respective channel.
On completion, creates a ResponseMsg and sends it back on the reverse path
so that response latency is modeled through the fabric.
"""
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
super().__init__(node, ctx)
self._read: simpy.Resource | None = None
self._write: simpy.Resource | None = None
def start(self, env: simpy.Environment) -> None:
capacity = int(self.node.attrs.get("capacity", 1))
self._read = simpy.Resource(env, capacity=capacity)
self._write = simpy.Resource(env, capacity=capacity)
super().start(env)
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
yield env.timeout(overhead_ns)
def _select_channel(self, txn: Any) -> simpy.Resource:
"""Select channel based on request type: write requests → write, else → read."""
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
assert self._read is not None and self._write is not None
req = txn.request
if isinstance(req, MemoryWriteMsg):
return self._write
if isinstance(req, PeDmaMsg) and req.is_write:
return self._write
return self._read
def _worker(self, env: simpy.Environment) -> Generator:
"""Dispatch each incoming txn to a concurrent process for channel-level parallelism."""
while True:
txn: Any = yield self._inbox.get()
env.process(self._handle_txn(env, txn))
def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
"""Acquire channel, run, apply drain, send response."""
channel = self._select_channel(txn)
with channel.request() as req:
yield req
yield from self.run(env, txn.nbytes)
drain = getattr(txn, "drain_ns", 0.0)
if drain > 0:
yield env.timeout(drain)
yield from self._send_response(env, txn)
def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
"""Route completion based on path type.
- PeDmaMsg: succeed done directly (probe).
- Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
data back on reverse path with original done event.
- M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
"""
from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
if isinstance(txn.request, PeDmaMsg):
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2:
resp_txn = Transaction(
request=txn.request, path=reverse_path, step=0,
nbytes=0, done=txn.done, is_response=True,
)
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
return
txn.done.succeed()
return
# Bypass path: no m_cpu in the transaction path
is_bypass = not any("m_cpu" in n for n in txn.path)
if is_bypass:
if isinstance(txn.request, MemoryReadMsg):
# D2H: send data back on reverse path to pcie_ep
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2:
resp_txn = Transaction(
request=txn.request, path=reverse_path, step=0,
nbytes=txn.request.nbytes, done=txn.done,
)
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
return
# MemoryWrite bypass or short path: done
txn.done.succeed()
return
# M_CPU DMA path: send ResponseMsg for aggregation
reverse_path = list(reversed(txn.path))
if len(reverse_path) >= 2 and self.ctx:
from kernbench.runtime_api.kernel import ResponseMsg
parts = self.node.id.split(".")
cube_id = int(parts[1].replace("cube", ""))
pe_id = int(parts[3].replace("slice", ""))
resp_msg = ResponseMsg(
correlation_id=txn.request.correlation_id,
request_id=txn.request.request_id,
src_cube=cube_id, src_pe=pe_id, success=True,
)
resp_txn = Transaction(
request=resp_msg, path=reverse_path, step=0,
nbytes=0, done=env.event(), is_response=True,
)
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
else:
txn.done.succeed()