Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep

- Probe CLI: restructured output (tables first, routes below), per-hop
  timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix
  cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing
  effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 01:16:18 -07:00
parent 6f43807900
commit d75da439c6
24 changed files with 3456 additions and 501 deletions
+75 -30
View File
@@ -18,11 +18,10 @@ from kernbench.topology.types import Edge, TopologyGraph
class GraphEngine:
"""simpy-based discrete-event simulation engine.
Phase B: engine injects a Transaction into the PCIE_EP host queue for
each request. Components handle their own routing:
Path 1: PCIE_EP → IO_CPU (engine-computed path, pre-loaded in Transaction)
Path 2: IO_CPU → M_CPU (IO_CPU dispatches, fire-and-forget callback)
Path 3: M_CPU.DMA → HBM (M_CPU dispatches, fire-and-forget callback)
Request routing:
MemoryWrite/Read: pcie_ep → io_noc → cube → xbar → hbm_ctrl (m_cpu bypass)
KernelLaunch: pcie_ep → io_noc → io_cpu → io_noc → cube → m_cpu → PE
PeDmaMsg: pe_dma → xbar → hbm_ctrl (direct probe)
Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
"""
@@ -68,18 +67,20 @@ class GraphEngine:
src_comp.out_ports[e.dst] = store
dst_comp.in_ports[e.src] = store
# Wire processes: propagation delay per edge (ADR-0015 D2)
# Cut-through (wormhole) model: wires apply propagation only.
# Serialization (drain) is computed per-path and applied once at the terminal.
# Wire processes: propagation delay + BW occupancy per edge (ADR-0015 D2)
# Cut-through (wormhole) model: wires apply propagation delay per hop.
# BW occupancy (available_at) tracks when each directed link becomes free
# for the next transaction, modeling back-to-back serialization contention.
for e in graph.edges:
src_comp = self._components.get(e.src)
dst_comp = self._components.get(e.dst)
if src_comp is None or dst_comp is None:
continue
prop_ns = e.distance_mm * self._ns_per_mm
bw_gbs = e.bw_gbs or 0.0
self._env.process(
self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
prop_ns)
prop_ns, bw_gbs)
)
# Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
@@ -125,14 +126,33 @@ class GraphEngine:
out_port: simpy.Store,
in_port: simpy.Store,
prop_ns: float,
bw_gbs: float = 0.0,
):
"""SimPy process: relay messages with propagation delay only.
"""SimPy process: relay messages with propagation delay and BW occupancy.
Cut-through (wormhole) model: serialization (drain) is computed per-path
and applied once at the terminal component, not at every wire hop.
Each directed edge maintains an ``available_at`` timestamp tracking when
the link becomes free for the next transaction. When a transaction of
``nbytes`` uses a link with ``bw_gbs``, the link is occupied for
``nbytes / bw_gbs`` ns. The *next* transaction on the same directed
link must wait until ``available_at`` passes (back-to-back serialization).
The *current* transaction is NOT delayed by its own occupancy — only by
a prior transaction's occupancy that has not yet cleared. This avoids
double-drain: terminal drain_ns handles single-transaction serialization,
while available_at handles inter-transaction BW contention.
"""
available_at = 0.0
while True:
msg = yield out_port.get()
# BW occupancy: wait for link to become free, then mark busy
if bw_gbs > 0:
nbytes = getattr(msg, "nbytes", 0)
if nbytes > 0:
wait = available_at - self._env.now
if wait > 0:
yield self._env.timeout(wait)
available_at = self._env.now + (nbytes / bw_gbs)
# Propagation delay
if prop_ns > 0:
yield self._env.timeout(prop_ns)
yield in_port.put(msg)
@@ -142,6 +162,10 @@ class GraphEngine:
yield from self._process_pe_dma(key, request, done)
return
if isinstance(request, (MemoryWriteMsg, MemoryReadMsg)):
yield from self._process_memory_direct(key, request, done)
return
entries = self._entry_points(request)
if not entries:
self._results[key] = (
@@ -200,6 +224,44 @@ class GraphEngine:
)
done.succeed()
def _process_memory_direct(self, key: str, request: Any, done: simpy.Event):
    """Run one host memory request over the direct PCIE_EP-to-HBM route.

    The route is looked up from the PCIE_EP of the addressed SIP to the HBM
    node that the request's physical address resolves to.

    A ``MemoryWriteMsg`` is addressed by ``dst_sip``/``dst_pa`` and its
    Transaction carries ``request.nbytes``. Any other request reaching this
    method is addressed by ``src_sip``/``src_pa`` and is injected with
    ``nbytes=0`` (command only); how the read data travels back is left to
    the components on the path and is not handled here.

    On completion the result stored under ``key`` is a successful
    ``Completion`` plus a dict with the elapsed simulated time
    (``total_ns``) and the requested size (``nbytes``), after which
    ``done`` is triggered.
    """
    writing = isinstance(request, MemoryWriteMsg)
    if writing:
        target_sip, raw_pa = request.dst_sip, request.dst_pa
    else:
        target_sip, raw_pa = request.src_sip, request.src_pa

    # Resolve both endpoints, then ask the router for the hop list.
    ep_id = self._resolver.find_pcie_ep(target_sip)
    decoded_pa = PhysAddr.decode(raw_pa)
    hbm_target = self._resolver.resolve(decoded_pa)
    route = self._router.find_memory_path(ep_id, hbm_target)

    # Drain time is always computed from the full request size, even when
    # the forward transaction itself carries no payload.
    route_drain_ns = self._path_drain_ns(route, request.nbytes)

    issued_at = self._env.now
    completed = self._env.event()

    if writing:
        forward_nbytes = request.nbytes
    else:
        forward_nbytes = 0

    txn = Transaction(
        request=request,
        path=route,
        step=0,
        nbytes=forward_nbytes,
        done=completed,
        drain_ns=route_drain_ns,
    )

    # Inject at the PCIE_EP host queue and wait for the transaction to finish.
    yield self._host_queues[ep_id].put(txn)
    yield completed

    elapsed_ns = self._env.now - issued_at
    stats = {"total_ns": elapsed_ns, "nbytes": request.nbytes}
    self._results[key] = (Completion(ok=True), stats)
    done.succeed()
def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
"""Inject a Transaction directly at PE_DMA for PE→HBM latency measurement."""
pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
@@ -260,25 +322,8 @@ class GraphEngine:
def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
"""Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.
For Memory{Write,Read}: single SIP entry.
For KernelLaunchMsg: one entry per distinct SIP in tensor shards.
Only handles KernelLaunchMsg. MemoryWrite/Read use _process_memory_direct.
"""
if isinstance(request, MemoryWriteMsg):
sip = request.dst_sip
return [(
self._resolver.find_pcie_ep(sip),
self._resolver.find_io_cpu(sip),
request.nbytes,
)]
if isinstance(request, MemoryReadMsg):
sip = request.src_sip
return [(
self._resolver.find_pcie_ep(sip),
self._resolver.find_io_cpu(sip),
request.nbytes,
)]
if isinstance(request, KernelLaunchMsg):
seen: set[int] = set()
entries: list[tuple[str, str, int]] = []