Add probe CLI improvements, D2H read, UCIe/HBM tuning, BW sweep
- Probe CLI: restructured output (tables first, routes below), per-hop timestamps, split cross-cube into best/worst cases, D2H read section
- UCIe overhead: 1ns -> 8ns per port (16ns per crossing) to fix cross-cube-best < cross-half latency inversion
- HBM efficiency: added efficiency=0.8 factor to hbm_ctrl, reducing effective BW from 256 to 204.8 GB/s
- Multi-size BW sweep: saturation tables (4KB-1MB) for all probe cases
- Probe default data size: 4KB -> 32KB for more realistic measurements
- IOChiplet NOC + D2H topology and tests
- NOC mesh, xbar, BW occupancy components and tests
- Cube mesh visualization diagram

278 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -18,11 +18,10 @@ from kernbench.topology.types import Edge, TopologyGraph
|
||||
class GraphEngine:
|
||||
"""simpy-based discrete-event simulation engine.
|
||||
|
||||
Phase B: engine injects a Transaction into the PCIE_EP host queue for
|
||||
each request. Components handle their own routing:
|
||||
Path 1: PCIE_EP → IO_CPU (engine-computed path, pre-loaded in Transaction)
|
||||
Path 2: IO_CPU → M_CPU (IO_CPU dispatches, fire-and-forget callback)
|
||||
Path 3: M_CPU.DMA → HBM (M_CPU dispatches, fire-and-forget callback)
|
||||
Request routing:
|
||||
MemoryWrite/Read: pcie_ep → io_noc → cube → xbar → hbm_ctrl (m_cpu bypass)
|
||||
KernelLaunch: pcie_ep → io_noc → io_cpu → io_noc → cube → m_cpu → PE
|
||||
PeDmaMsg: pe_dma → xbar → hbm_ctrl (direct probe)
|
||||
|
||||
Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
|
||||
"""
|
||||
@@ -68,18 +67,20 @@ class GraphEngine:
|
||||
src_comp.out_ports[e.dst] = store
|
||||
dst_comp.in_ports[e.src] = store
|
||||
|
||||
# Wire processes: propagation delay per edge (ADR-0015 D2)
|
||||
# Cut-through (wormhole) model: wires apply propagation only.
|
||||
# Serialization (drain) is computed per-path and applied once at the terminal.
|
||||
# Wire processes: propagation delay + BW occupancy per edge (ADR-0015 D2)
|
||||
# Cut-through (wormhole) model: wires apply propagation delay per hop.
|
||||
# BW occupancy (available_at) tracks when each directed link becomes free
|
||||
# for the next transaction, modeling back-to-back serialization contention.
|
||||
for e in graph.edges:
|
||||
src_comp = self._components.get(e.src)
|
||||
dst_comp = self._components.get(e.dst)
|
||||
if src_comp is None or dst_comp is None:
|
||||
continue
|
||||
prop_ns = e.distance_mm * self._ns_per_mm
|
||||
bw_gbs = e.bw_gbs or 0.0
|
||||
self._env.process(
|
||||
self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
|
||||
prop_ns)
|
||||
prop_ns, bw_gbs)
|
||||
)
|
||||
|
||||
# Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
|
||||
@@ -125,14 +126,33 @@ class GraphEngine:
|
||||
out_port: simpy.Store,
|
||||
in_port: simpy.Store,
|
||||
prop_ns: float,
|
||||
bw_gbs: float = 0.0,
|
||||
):
|
||||
"""SimPy process: relay messages with propagation delay only.
|
||||
"""SimPy process: relay messages with propagation delay and BW occupancy.
|
||||
|
||||
Cut-through (wormhole) model: serialization (drain) is computed per-path
|
||||
and applied once at the terminal component, not at every wire hop.
|
||||
Each directed edge maintains an ``available_at`` timestamp tracking when
|
||||
the link becomes free for the next transaction. When a transaction of
|
||||
``nbytes`` uses a link with ``bw_gbs``, the link is occupied for
|
||||
``nbytes / bw_gbs`` ns. The *next* transaction on the same directed
|
||||
link must wait until ``available_at`` passes (back-to-back serialization).
|
||||
|
||||
The *current* transaction is NOT delayed by its own occupancy — only by
|
||||
a prior transaction's occupancy that has not yet cleared. This avoids
|
||||
double-drain: terminal drain_ns handles single-transaction serialization,
|
||||
while available_at handles inter-transaction BW contention.
|
||||
"""
|
||||
available_at = 0.0
|
||||
while True:
|
||||
msg = yield out_port.get()
|
||||
# BW occupancy: wait for link to become free, then mark busy
|
||||
if bw_gbs > 0:
|
||||
nbytes = getattr(msg, "nbytes", 0)
|
||||
if nbytes > 0:
|
||||
wait = available_at - self._env.now
|
||||
if wait > 0:
|
||||
yield self._env.timeout(wait)
|
||||
available_at = self._env.now + (nbytes / bw_gbs)
|
||||
# Propagation delay
|
||||
if prop_ns > 0:
|
||||
yield self._env.timeout(prop_ns)
|
||||
yield in_port.put(msg)
|
||||
@@ -142,6 +162,10 @@ class GraphEngine:
|
||||
yield from self._process_pe_dma(key, request, done)
|
||||
return
|
||||
|
||||
if isinstance(request, (MemoryWriteMsg, MemoryReadMsg)):
|
||||
yield from self._process_memory_direct(key, request, done)
|
||||
return
|
||||
|
||||
entries = self._entry_points(request)
|
||||
if not entries:
|
||||
self._results[key] = (
|
||||
@@ -200,6 +224,44 @@ class GraphEngine:
|
||||
)
|
||||
done.succeed()
|
||||
|
||||
def _process_memory_direct(self, key: str, request: Any, done: simpy.Event):
    """Run one host memory request over the direct route and record its latency.

    Route: pcie_ep → io_noc → cube → xbar → hbm_ctrl.

    MemoryWrite: the payload travels forward (the transaction carries
    ``request.nbytes`` on the wires) and is drained at the hbm_ctrl terminal.
    MemoryRead: only a command travels forward (the transaction carries
    ``nbytes=0``); hbm_ctrl sends the data back on the reverse path with
    ``nbytes=request.nbytes``.

    Args:
        key: Slot in ``self._results`` where the outcome is stored.
        request: The memory request. A ``MemoryWriteMsg`` is addressed via
            ``dst_sip``/``dst_pa``; anything else via ``src_sip``/``src_pa``.
        done: Event succeeded once the result has been recorded.
    """
    # Decide the direction once; it selects both the address fields and
    # whether payload bytes ride on the forward path.
    is_write = isinstance(request, MemoryWriteMsg)
    if is_write:
        sip = request.dst_sip
        pa_val = request.dst_pa
    else:
        sip = request.src_sip
        pa_val = request.src_pa

    # Resolve both endpoints and the route between them.
    pcie_ep_id = self._resolver.find_pcie_ep(sip)
    hbm_node = self._resolver.resolve(PhysAddr.decode(pa_val))
    path = self._router.find_memory_path(pcie_ep_id, hbm_node)
    # Drain is computed from the full request size for reads as well as writes.
    drain_ns = self._path_drain_ns(path, request.nbytes)

    start_ns = self._env.now
    txn_done = self._env.event()
    forward_nbytes = request.nbytes if is_write else 0
    txn = Transaction(
        request=request,
        path=path,
        step=0,
        nbytes=forward_nbytes,
        done=txn_done,
        drain_ns=drain_ns,
    )

    # Inject at the PCIE_EP host queue, then block until the transaction's
    # completion event fires.
    yield self._host_queues[pcie_ep_id].put(txn)
    yield txn_done

    elapsed_ns = self._env.now - start_ns
    completion = Completion(ok=True)
    self._results[key] = (
        completion,
        {"total_ns": elapsed_ns, "nbytes": request.nbytes},
    )
    done.succeed()
|
||||
|
||||
def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
|
||||
"""Inject a Transaction directly at PE_DMA for PE→HBM latency measurement."""
|
||||
pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
|
||||
@@ -260,25 +322,8 @@ class GraphEngine:
|
||||
def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
|
||||
"""Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.
|
||||
|
||||
For Memory{Write,Read}: single SIP entry.
|
||||
For KernelLaunchMsg: one entry per distinct SIP in tensor shards.
|
||||
Only handles KernelLaunchMsg. MemoryWrite/Read use _process_memory_direct.
|
||||
"""
|
||||
if isinstance(request, MemoryWriteMsg):
|
||||
sip = request.dst_sip
|
||||
return [(
|
||||
self._resolver.find_pcie_ep(sip),
|
||||
self._resolver.find_io_cpu(sip),
|
||||
request.nbytes,
|
||||
)]
|
||||
|
||||
if isinstance(request, MemoryReadMsg):
|
||||
sip = request.src_sip
|
||||
return [(
|
||||
self._resolver.find_pcie_ep(sip),
|
||||
self._resolver.find_io_cpu(sip),
|
||||
request.nbytes,
|
||||
)]
|
||||
|
||||
if isinstance(request, KernelLaunchMsg):
|
||||
seen: set[int] = set()
|
||||
entries: list[tuple[str, str, int]] = []
|
||||
|
||||
Reference in New Issue
Block a user