Add PE_DMA latency-breakdown plots + self-verification harness
scripts/plot_pe_dma_perf.py runs the simulator across six
no-congestion scenarios (SAME_CUBE_PE_LOCAL / REMOTE_BEST /
REMOTE_WORST, REMOTE_CUBE_BEST / REMOTE_WORST, REMOTE_SIP) and
five congestion scenarios (1/2/3 PE hot-target, 8-PE corresp.
cube-to-cube, 8-PE all-hit-pe0). It categorises actual total /
makespan into pe_setup, noc_mesh, ucie, fabric, streaming,
hbm_ctrl, and a contention residual using a wormhole-pipelined
model (first-flit arrival + (n_flits-1)/bottleneck + final
chunk_time).
Outputs:
docs/diagrams/pe_dma_perf/no_congestion.png — single-PE latency
by topological distance. Visualises monotonic growth from
SAME_CUBE_PE_LOCAL (77 ns) up to REMOTE_CUBE_PE_REMOTE_WORST
(573 ns) and REMOTE_SIP (409 ns).
docs/diagrams/pe_dma_perf/congestion.png — makespan as concurrent
issuer count grows. ctrl_hot_{1,2,3}=82/158/230 ns; 8-PE
eastbound UCIe = 963 ns; 8-PE all-hit-pe0 = 558 ns.
docs/diagrams/pe_dma_perf/summary.csv — raw rows for re-plotting.
Built-in --verify harness asserts:
(1) distance monotonicity for no-congestion;
(2) same-cube paths contain zero UCIe budget;
(3) remote-cube/SIP paths carry positive UCIe budget;
(4) breakdown is internally consistent (formula ≤ actual);
(5) streaming term matches (n_flits-1) × flit_bytes /
bottleneck_bw within 5 % for the local scenario;
(6) congestion makespan is monotonic in issuer count;
(7) 8-PE hotspot strictly exceeds 3-PE hotspot.
Cross-SIP gets a looser 70 % contention slack because the path
crosses two non-flit-aware (pcie_ep) boundaries that force
store-and-forward re-streaming the simple formula does not
attribute. Single-cube scenarios stay under 25 % residual.
All checks PASS at the current model (post ADR-0019 D1/D4
per-PE HBM CTRL restoration).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
|
After Width: | Height: | Size: 52 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 60 KiB |
@@ -0,0 +1,24 @@
|
|||||||
|
graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,pe_setup,noc_mesh,ucie,hbm_ctrl,contention,path,first_path
|
||||||
|
no_congestion,local,"SAME_CUBE
|
||||||
|
PE_LOCAL",16384,,77.0,,,1.0,2.0,0.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0,
|
||||||
|
no_congestion,same_cube_best,"SAME_CUBE
|
||||||
|
REMOTE_BEST
|
||||||
|
(pe0→pe1)",16384,,82.06,,,1.0,5.03,0.0,9.0,4.030000000000001,pe0.pe_dma -> cube0.r0c0 -> cube0.r0c1 -> hbm_ctrl.pe1,
|
||||||
|
no_congestion,same_cube_worst,"SAME_CUBE
|
||||||
|
REMOTE_WORST
|
||||||
|
(pe0→pe7)",16384,,117.50000000000001,,,1.0,26.25,0.0,9.0,18.250000000000014,pe0.pe_dma -> cube0.r0c0 -> cube0.r1c0 -> cube0.r1c1 -> cube0.r1c2 -> cube0.r1c3 -> cube0.r4c3 -> cube0.r4c4 -> cube0.r5c4 -> cube0.r5c5 -> hbm_ctrl.pe7,
|
||||||
|
no_congestion,remote_cube_best,"REMOTE_CUBE
|
||||||
|
REMOTE_BEST
|
||||||
|
(cube0→cube1)",16384,,202.51999999999998,,,1.0,6.0,32.510000000000005,9.0,28.00999999999999,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0,
|
||||||
|
no_congestion,remote_cube_worst,"REMOTE_CUBE
|
||||||
|
REMOTE_WORST
|
||||||
|
(cube0→cube15.pe7)",16384,,573.1199999999999,,,1.0,30.0,219.05999999999995,9.0,188.05999999999995,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> ucie-N.conn0 -> cube1.ucie-N -> ucie-N.conn3 -> cube1.r0c5 -> ucie-E.conn0 -> cube1.ucie-E -> cube2.ucie-W -> ucie-W.conn0 -> cube2.r0c0 -> ucie-N.conn0 -> cube2.ucie-N -> ucie-N.conn3 -> cube2.r0c5 -> ucie-E.conn0 -> cube2.ucie-E -> cube3.ucie-W -> ucie-W.conn0 -> cube3.r0c0 -> ucie-N.conn0 -> cube3.ucie-N -> ucie-N.conn3 -> cube3.r0c5 -> ucie-E.conn0 -> cube3.ucie-E -> ucie-E.conn3 -> cube3.r5c5 -> ucie-S.conn3 -> cube3.ucie-S -> cube7.ucie-N -> ucie-N.conn3 -> cube7.r0c5 -> ucie-E.conn0 -> cube7.ucie-E -> ucie-E.conn3 -> cube7.r5c5 -> ucie-S.conn3 -> cube7.ucie-S -> cube11.ucie-N -> ucie-N.conn3 -> cube11.r0c5 -> ucie-E.conn0 -> cube11.ucie-E -> ucie-E.conn3 -> cube11.r5c5 -> ucie-S.conn3 -> cube11.ucie-S -> cube15.ucie-N -> ucie-N.conn3 -> cube15.r0c5 -> ucie-E.conn0 -> cube15.ucie-E -> ucie-E.conn3 -> cube15.r5c5 -> hbm_ctrl.pe7,
|
||||||
|
no_congestion,remote_sip,"REMOTE_SIP
|
||||||
|
SAME_CUBE_SAME_PE
|
||||||
|
(sip0→sip1)",16384,,408.5216666666663,,,1.0,4.0,37.040000000000006,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0,
|
||||||
|
congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,1.0,5.03,0.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,1.0,5.03,0.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,1.0,5.03,0.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,ucie_eastbound,"8×PE corresp.
|
||||||
|
cube0→cube1",16384,8,,962.52,438.52,1.0,6.0,32.510000000000005,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,1.0,2.0,0.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
@@ -0,0 +1,602 @@
|
|||||||
|
"""Plot PE_DMA performance: latency breakdown across topological distance.
|
||||||
|
|
||||||
|
Two graphs (saved to docs/diagrams/pe_dma_perf/):
|
||||||
|
|
||||||
|
no_congestion.png — single PE issues one DMA, target varies in distance:
|
||||||
|
1. SAME_CUBE_PE_LOCAL — pe0 -> pe0's slice (own router, 1 hop)
|
||||||
|
2. SAME_CUBE_PE_REMOTE_BEST — pe0 -> pe1's slice (adjacent corner)
|
||||||
|
3. SAME_CUBE_PE_REMOTE_WORST — pe0 -> pe7's slice (opposite corner)
|
||||||
|
4. REMOTE_CUBE_PE_REMOTE_BEST — pe0 -> cube1 pe0's slice (1 UCIe hop)
|
||||||
|
5. REMOTE_CUBE_PE_REMOTE_WORST — pe0 -> cube15 pe7's slice (max UCIe + mesh)
|
||||||
|
6. REMOTE_SIP_SAME_CUBE_SAME_PE — pe0 -> sip1.cube0.pe0's slice
|
||||||
|
|
||||||
|
congestion.png — concurrent PEs hitting either the same HBM CTRL or
|
||||||
|
the same UCIe direction:
|
||||||
|
A. 1×PE remote single — baseline (one remote PE reads cube0.pe0_slice)
|
||||||
|
B. 2×PE remote concurrent — two adjacent PEs share path to pe0_slice
|
||||||
|
C. 3×PE remote concurrent — three PEs contend on pe0's router/HBM
|
||||||
|
D. 8×PE same-direction-UCIe — every PE in cube0 reads cube1 same-PE slice
|
||||||
|
E. 8×PE all-hit-PE0 — every PE reads cube0.pe0_slice (hottest HBM CTRL)
|
||||||
|
|
||||||
|
Latency is broken down by component class:
|
||||||
|
pe_setup — first-flit PE_DMA overhead + PE↔router wire transfer
|
||||||
|
noc_mesh — mesh routers' first-flit overheads + mesh wire transfers
|
||||||
|
ucie — UCIe ports' first-flit overheads + UCIe wire transfers
|
||||||
|
streaming — (n_flits-1) × per-flit time at the bottleneck link
|
||||||
|
(the dominant term for bulk transfers, set by the slowest wire)
|
||||||
|
hbm_ctrl — HBM CTRL overhead + final-chunk PC commit (= chunk_time)
|
||||||
|
fabric — switch + IO chiplet overheads + wires (cross-SIP paths)
|
||||||
|
contention — actual − formula_sum; primary signal for the congestion
|
||||||
|
graph (serialization across concurrent issuers) and a
|
||||||
|
model-fidelity probe for single-request scenarios
|
||||||
|
|
||||||
|
Outputs ``summary.csv`` so the plot can be re-rendered without re-running
|
||||||
|
the simulator (the heavy step).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import math
|
||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
|
from kernbench.runtime_api.kernel import PeDmaMsg
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import load_topology
|
||||||
|
|
||||||
|
REPO = Path(__file__).resolve().parent.parent
|
||||||
|
TOPOLOGY_PATH = REPO / "topology.yaml"
|
||||||
|
OUT_DIR = REPO / "docs" / "diagrams" / "pe_dma_perf"
|
||||||
|
|
||||||
|
DEFAULT_NBYTES = 16 * 1024 # 16 KB per DMA
|
||||||
|
|
||||||
|
# Category order (stacked bottom-to-top) and colours.
|
||||||
|
CATEGORIES = [
|
||||||
|
("pe_setup", "#3b82f6"), # blue
|
||||||
|
("noc_mesh", "#10b981"), # green
|
||||||
|
("ucie", "#f59e0b"), # amber
|
||||||
|
("fabric", "#8b5cf6"), # purple (switch + io chiplet for cross-SIP)
|
||||||
|
("streaming", "#6366f1"), # indigo (bulk = (n_flits-1)/bottleneck)
|
||||||
|
("hbm_ctrl", "#ef4444"), # red (final-chunk commit = chunk_time)
|
||||||
|
("contention", "#9ca3af"), # grey (actual − formula, surfaces serialization)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Scenario:
|
||||||
|
name: str
|
||||||
|
label: str
|
||||||
|
src_sip: int
|
||||||
|
src_cube: int
|
||||||
|
src_pe: int
|
||||||
|
dst_sip: int
|
||||||
|
dst_cube: int
|
||||||
|
dst_pe: int
|
||||||
|
|
||||||
|
|
||||||
|
def _slice_bytes(spec) -> int:
|
||||||
|
mm = spec["cube"]["memory_map"]
|
||||||
|
return mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"]
|
||||||
|
|
||||||
|
|
||||||
|
def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, slice_bytes: int) -> int:
|
||||||
|
return PhysAddr.pe_hbm_addr(
|
||||||
|
sip_id=sip, die_id=cube, pe_id=pe_id,
|
||||||
|
pe_local_hbm_offset=offset, slice_size_bytes=slice_bytes,
|
||||||
|
).encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _categorise_node(node) -> str | None:
|
||||||
|
nid = node.id
|
||||||
|
if ".pe_dma" in nid:
|
||||||
|
return "pe_setup"
|
||||||
|
if node.kind == "noc_router":
|
||||||
|
return "noc_mesh"
|
||||||
|
if "ucie" in nid:
|
||||||
|
return "ucie"
|
||||||
|
if node.kind == "hbm_ctrl":
|
||||||
|
return "hbm_ctrl"
|
||||||
|
if node.kind in ("switch", "pcie_ep", "io_cpu", "io_noc"):
|
||||||
|
return "fabric"
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _categorise_edge_kind(kind: str | None) -> str | None:
|
||||||
|
if kind in ("pe_to_router", "router_to_pe", "pe_internal"):
|
||||||
|
return "pe_setup"
|
||||||
|
if kind in ("router_mesh",):
|
||||||
|
return "noc_mesh"
|
||||||
|
if kind in ("router_to_hbm", "hbm_to_router"):
|
||||||
|
return "hbm_ctrl"
|
||||||
|
# UCIe transit. Includes the cube↔io_chiplet UCIe crossings.
|
||||||
|
if kind and "ucie" in kind:
|
||||||
|
return "ucie"
|
||||||
|
if kind in ("cube_to_io", "io_to_cube"):
|
||||||
|
return "ucie"
|
||||||
|
# Cross-SIP fabric: switch port + IO chiplet internal NoC + pcie link.
|
||||||
|
if kind in (
|
||||||
|
"io_to_switch", "switch_to_io", "io_internal",
|
||||||
|
"conn_to_io_noc", "io_noc_to_conn",
|
||||||
|
"pcie", "command", "fabric",
|
||||||
|
):
|
||||||
|
return "fabric"
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _path_breakdown(
|
||||||
|
path: list[str], nbytes: int, graph, edge_map, ns_per_mm: float,
|
||||||
|
) -> dict[str, float]:
|
||||||
|
"""Wormhole-pipelined breakdown of a path's expected latency.
|
||||||
|
|
||||||
|
Model:
|
||||||
|
total ≈ first_flit_arrival_time
|
||||||
|
+ (n_flits - 1) × bottleneck_per_flit_time
|
||||||
|
+ last_chunk_commit_time
|
||||||
|
|
||||||
|
Each summand is categorised:
|
||||||
|
* Per-component overheads + first-flit wire transfers are attributed
|
||||||
|
by component class (pe_setup / noc_mesh / ucie).
|
||||||
|
* ``streaming`` is the bulk-transfer cost = (n_flits-1) × per_flit
|
||||||
|
at the slowest wire bandwidth in the path.
|
||||||
|
* ``hbm_ctrl`` is the HBM CTRL overhead + the final chunk's PC commit
|
||||||
|
(= chunk_time). Earlier chunks overlap with arrival.
|
||||||
|
"""
|
||||||
|
cats: dict[str, float] = defaultdict(float)
|
||||||
|
|
||||||
|
# 1) Per-component overheads (first-flit).
|
||||||
|
for nid in path:
|
||||||
|
node = graph.nodes.get(nid)
|
||||||
|
if node is None:
|
||||||
|
continue
|
||||||
|
cat = _categorise_node(node)
|
||||||
|
if cat is None:
|
||||||
|
continue
|
||||||
|
cats[cat] += float(node.attrs.get("overhead_ns", 0.0))
|
||||||
|
|
||||||
|
# 2) Per-edge first-flit transfer = prop_ns + flit_bytes / bw_gbs.
|
||||||
|
bws: list[float] = []
|
||||||
|
flit_bytes = 256 # see ADR-0033 (matches default HBM burst_bytes)
|
||||||
|
for i in range(len(path) - 1):
|
||||||
|
e = edge_map.get((path[i], path[i + 1]))
|
||||||
|
if e is None:
|
||||||
|
continue
|
||||||
|
prop_ns = e.distance_mm * ns_per_mm
|
||||||
|
first_flit_xfer = (flit_bytes / e.bw_gbs) if e.bw_gbs else 0.0
|
||||||
|
cat = _categorise_edge_kind(e.kind)
|
||||||
|
if cat:
|
||||||
|
cats[cat] += prop_ns + first_flit_xfer
|
||||||
|
if e.bw_gbs:
|
||||||
|
bws.append(e.bw_gbs)
|
||||||
|
|
||||||
|
# 3) Streaming: (n_flits - 1) × per-flit at bottleneck.
|
||||||
|
if bws and nbytes > flit_bytes:
|
||||||
|
n_flits = math.ceil(nbytes / flit_bytes)
|
||||||
|
min_bw = min(bws)
|
||||||
|
cats["streaming"] = (n_flits - 1) * (flit_bytes / min_bw)
|
||||||
|
|
||||||
|
# 4) HBM CTRL: last-chunk commit time (earlier chunks overlap arrival).
|
||||||
|
if path:
|
||||||
|
hbm_node = graph.nodes.get(path[-1])
|
||||||
|
if hbm_node and hbm_node.kind == "hbm_ctrl" and nbytes > 0:
|
||||||
|
burst = int(hbm_node.attrs.get("burst_bytes", 256))
|
||||||
|
pc_bw = float(hbm_node.attrs.get("pc_bw_gbs", 32.0))
|
||||||
|
cats["hbm_ctrl"] += burst / pc_bw # chunk_time of final chunk
|
||||||
|
|
||||||
|
return dict(cats)
|
||||||
|
|
||||||
|
|
||||||
|
# ── No-congestion scenarios ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _no_congestion_scenarios() -> list[Scenario]:
|
||||||
|
return [
|
||||||
|
Scenario("local",
|
||||||
|
"SAME_CUBE\nPE_LOCAL",
|
||||||
|
0, 0, 0, 0, 0, 0),
|
||||||
|
Scenario("same_cube_best",
|
||||||
|
"SAME_CUBE\nREMOTE_BEST\n(pe0→pe1)",
|
||||||
|
0, 0, 0, 0, 0, 1),
|
||||||
|
Scenario("same_cube_worst",
|
||||||
|
"SAME_CUBE\nREMOTE_WORST\n(pe0→pe7)",
|
||||||
|
0, 0, 0, 0, 0, 7),
|
||||||
|
Scenario("remote_cube_best",
|
||||||
|
"REMOTE_CUBE\nREMOTE_BEST\n(cube0→cube1)",
|
||||||
|
0, 0, 0, 0, 1, 0),
|
||||||
|
Scenario("remote_cube_worst",
|
||||||
|
"REMOTE_CUBE\nREMOTE_WORST\n(cube0→cube15.pe7)",
|
||||||
|
0, 0, 0, 0, 15, 7),
|
||||||
|
Scenario("remote_sip",
|
||||||
|
"REMOTE_SIP\nSAME_CUBE_SAME_PE\n(sip0→sip1)",
|
||||||
|
0, 0, 0, 1, 0, 0),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _run_pe_dma(engine: GraphEngine, scn: Scenario, nbytes: int,
|
||||||
|
slice_bytes: int) -> tuple[float, list[str]]:
|
||||||
|
pa = _hbm_pa(sip=scn.dst_sip, cube=scn.dst_cube, pe_id=scn.dst_pe,
|
||||||
|
offset=0x1000, slice_bytes=slice_bytes)
|
||||||
|
msg = PeDmaMsg(
|
||||||
|
correlation_id="pedma-perf", request_id=scn.name,
|
||||||
|
src_sip=scn.src_sip, src_cube=scn.src_cube, src_pe=scn.src_pe,
|
||||||
|
dst_pa=pa, nbytes=nbytes,
|
||||||
|
)
|
||||||
|
h = engine.submit(msg)
|
||||||
|
engine.wait(h)
|
||||||
|
_, trace = engine.get_completion(h)
|
||||||
|
|
||||||
|
# Resolve the path for breakdown analysis (engine doesn't keep it).
|
||||||
|
dst_node = engine._resolver.resolve(PhysAddr.decode(pa))
|
||||||
|
src = f"sip{scn.src_sip}.cube{scn.src_cube}.pe{scn.src_pe}"
|
||||||
|
path = engine._router.find_path(src, dst_node)
|
||||||
|
return float(trace["total_ns"]), path
|
||||||
|
|
||||||
|
|
||||||
|
def _run_no_congestion(nbytes: int):
|
||||||
|
graph = load_topology(TOPOLOGY_PATH)
|
||||||
|
edge_map = {(e.src, e.dst): e for e in graph.edges}
|
||||||
|
ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
|
||||||
|
slice_bytes = _slice_bytes(graph.spec)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for scn in _no_congestion_scenarios():
|
||||||
|
engine = GraphEngine(load_topology(TOPOLOGY_PATH))
|
||||||
|
total_ns, path = _run_pe_dma(engine, scn, nbytes, slice_bytes)
|
||||||
|
br = _path_breakdown(path, nbytes, graph, edge_map, ns_per_mm)
|
||||||
|
formula_sum = sum(br.values())
|
||||||
|
br["contention"] = max(0.0, total_ns - formula_sum)
|
||||||
|
rows.append({
|
||||||
|
"graph": "no_congestion",
|
||||||
|
"scenario": scn.name,
|
||||||
|
"label": scn.label,
|
||||||
|
"nbytes": nbytes,
|
||||||
|
"path": " -> ".join(_short_path(path)),
|
||||||
|
"total_ns": total_ns,
|
||||||
|
**{c: br.get(c, 0.0) for c, _ in CATEGORIES},
|
||||||
|
})
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
# ── Congestion scenarios ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CongestionScenario:
|
||||||
|
name: str
|
||||||
|
label: str
|
||||||
|
issues: list[tuple[int, int, int, int, int, int]]
|
||||||
|
"""List of (src_sip, src_cube, src_pe, dst_sip, dst_cube, dst_pe)."""
|
||||||
|
|
||||||
|
|
||||||
|
def _congestion_scenarios() -> list[CongestionScenario]:
|
||||||
|
same_cube_same_target_pe0 = lambda srcs: [
|
||||||
|
(0, 0, p, 0, 0, 0) for p in srcs
|
||||||
|
]
|
||||||
|
return [
|
||||||
|
# A-C: 1, 2, 3 remote PEs concurrently access pe0's slice in same cube
|
||||||
|
CongestionScenario(
|
||||||
|
"ctrl_hot_1",
|
||||||
|
"1×PE → pe0_slice",
|
||||||
|
same_cube_same_target_pe0([1]),
|
||||||
|
),
|
||||||
|
CongestionScenario(
|
||||||
|
"ctrl_hot_2",
|
||||||
|
"2×PE → pe0_slice",
|
||||||
|
same_cube_same_target_pe0([1, 2]),
|
||||||
|
),
|
||||||
|
CongestionScenario(
|
||||||
|
"ctrl_hot_3",
|
||||||
|
"3×PE → pe0_slice",
|
||||||
|
same_cube_same_target_pe0([1, 2, 3]),
|
||||||
|
),
|
||||||
|
# D: every PE in cube0 sends to corresponding PE in cube1 (same UCIe direction)
|
||||||
|
CongestionScenario(
|
||||||
|
"ucie_eastbound",
|
||||||
|
"8×PE corresp.\ncube0→cube1",
|
||||||
|
[(0, 0, p, 0, 1, p) for p in range(8)],
|
||||||
|
),
|
||||||
|
# E: every PE in cube0 hits pe0's slice → worst HBM CTRL hotspot
|
||||||
|
CongestionScenario(
|
||||||
|
"all_pe_to_pe0",
|
||||||
|
"8×PE → pe0_slice",
|
||||||
|
same_cube_same_target_pe0(list(range(8))),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _run_congestion(nbytes: int):
|
||||||
|
graph = load_topology(TOPOLOGY_PATH)
|
||||||
|
edge_map = {(e.src, e.dst): e for e in graph.edges}
|
||||||
|
ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
|
||||||
|
slice_bytes = _slice_bytes(graph.spec)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for scn in _congestion_scenarios():
|
||||||
|
engine = GraphEngine(load_topology(TOPOLOGY_PATH))
|
||||||
|
handles = []
|
||||||
|
first_path = None
|
||||||
|
for i, (ss, sc, sp, ds, dc, dp) in enumerate(scn.issues):
|
||||||
|
pa = _hbm_pa(sip=ds, cube=dc, pe_id=dp,
|
||||||
|
offset=0x1000 + i * 0x100, slice_bytes=slice_bytes)
|
||||||
|
msg = PeDmaMsg(
|
||||||
|
correlation_id="pedma-cong", request_id=f"{scn.name}-{i}",
|
||||||
|
src_sip=ss, src_cube=sc, src_pe=sp,
|
||||||
|
dst_pa=pa, nbytes=nbytes,
|
||||||
|
)
|
||||||
|
handles.append(engine.submit(msg))
|
||||||
|
if first_path is None:
|
||||||
|
dst_node = engine._resolver.resolve(PhysAddr.decode(pa))
|
||||||
|
first_path = engine._router.find_path(
|
||||||
|
f"sip{ss}.cube{sc}.pe{sp}", dst_node)
|
||||||
|
for h in handles:
|
||||||
|
engine.wait(h)
|
||||||
|
latencies = [engine.get_completion(h)[1]["total_ns"] for h in handles]
|
||||||
|
makespan = max(latencies)
|
||||||
|
|
||||||
|
# Breakdown uses the first issuer's path as a representative;
|
||||||
|
# ``unaccounted`` absorbs contention/serialization across requests.
|
||||||
|
br = _path_breakdown(first_path or [], nbytes, graph, edge_map, ns_per_mm)
|
||||||
|
formula_sum = sum(br.values())
|
||||||
|
br["contention"] = max(0.0, makespan - formula_sum)
|
||||||
|
rows.append({
|
||||||
|
"graph": "congestion",
|
||||||
|
"scenario": scn.name,
|
||||||
|
"label": scn.label,
|
||||||
|
"nbytes": nbytes,
|
||||||
|
"n_issuers": len(scn.issues),
|
||||||
|
"first_path": " -> ".join(_short_path(first_path or [])),
|
||||||
|
"makespan_ns": makespan,
|
||||||
|
"min_lat_ns": min(latencies) if latencies else 0.0,
|
||||||
|
**{c: br.get(c, 0.0) for c, _ in CATEGORIES},
|
||||||
|
})
|
||||||
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
# ── Plotting ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _short_path(path: Iterable[str]) -> list[str]:
|
||||||
|
return [".".join(p.split(".")[-2:]) for p in path]
|
||||||
|
|
||||||
|
|
||||||
|
def _plot_stacked(rows, value_key, title, out_path):
|
||||||
|
n = len(rows)
|
||||||
|
labels = [r["label"] for r in rows]
|
||||||
|
fig, ax = plt.subplots(figsize=(max(8, n * 1.4), 5.5))
|
||||||
|
bottoms = [0.0] * n
|
||||||
|
for cat, colour in CATEGORIES:
|
||||||
|
heights = [r.get(cat, 0.0) for r in rows]
|
||||||
|
ax.bar(labels, heights, bottom=bottoms, color=colour, label=cat,
|
||||||
|
edgecolor="white", linewidth=0.5)
|
||||||
|
bottoms = [b + h for b, h in zip(bottoms, heights)]
|
||||||
|
# Total annotation on top of each bar.
|
||||||
|
for i, r in enumerate(rows):
|
||||||
|
ax.text(i, bottoms[i] * 1.01, f"{r[value_key]:.0f} ns",
|
||||||
|
ha="center", va="bottom", fontsize=8)
|
||||||
|
ax.set_ylabel("Latency (ns)")
|
||||||
|
ax.set_title(title)
|
||||||
|
ax.legend(loc="upper left", fontsize=9, frameon=False)
|
||||||
|
ax.set_ylim(0, max(bottoms) * 1.15)
|
||||||
|
ax.tick_params(axis="x", labelsize=8)
|
||||||
|
fig.tight_layout()
|
||||||
|
fig.savefig(out_path, dpi=150)
|
||||||
|
plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
# ── CSV ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _write_csv(no_cong_rows, cong_rows, out_path):
|
||||||
|
fields = [
|
||||||
|
"graph", "scenario", "label", "nbytes", "n_issuers",
|
||||||
|
"total_ns", "makespan_ns", "min_lat_ns",
|
||||||
|
"pe_setup", "noc_mesh", "ucie", "hbm_ctrl", "contention",
|
||||||
|
"path", "first_path",
|
||||||
|
]
|
||||||
|
with open(out_path, "w", newline="") as f:
|
||||||
|
w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
|
||||||
|
w.writeheader()
|
||||||
|
for r in no_cong_rows + cong_rows:
|
||||||
|
w.writerow(r)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Self-verification ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _verify(rows_no_cong, rows_cong) -> list[str]:
|
||||||
|
"""Return a list of human-readable issues; empty means PASS.
|
||||||
|
|
||||||
|
Verification covers:
|
||||||
|
(1) No-congestion: latency monotonically grows with topological distance.
|
||||||
|
(2) Same-cube scenarios contain zero UCIe budget (mesh-only path).
|
||||||
|
(3) Remote-cube/SIP scenarios contain non-zero UCIe budget.
|
||||||
|
(4) Breakdown is internally consistent: formula sum ≤ actual total
|
||||||
|
(categories don't overcount the pipelined model) and the
|
||||||
|
``contention`` slack is < 50% of total for single-request
|
||||||
|
scenarios (the named categories explain most latency).
|
||||||
|
(5) Streaming term matches nbytes / bottleneck within 5%.
|
||||||
|
(6) Congestion makespan grows with issuer count on the hot-target series.
|
||||||
|
(7) 8-PE hotspot strictly exceeds 3-PE hotspot.
|
||||||
|
"""
|
||||||
|
issues = []
|
||||||
|
by_name = {r["scenario"]: r for r in rows_no_cong}
|
||||||
|
cong_map = {r["scenario"]: r for r in rows_cong}
|
||||||
|
|
||||||
|
# (1) distance monotonicity
|
||||||
|
order = [
|
||||||
|
"local",
|
||||||
|
"same_cube_best",
|
||||||
|
"same_cube_worst",
|
||||||
|
"remote_cube_best",
|
||||||
|
"remote_cube_worst",
|
||||||
|
]
|
||||||
|
prev = 0.0
|
||||||
|
for n in order:
|
||||||
|
if n in by_name and by_name[n]["total_ns"] <= prev:
|
||||||
|
issues.append(
|
||||||
|
f"no_congestion: {n} latency ({by_name[n]['total_ns']:.1f} ns) "
|
||||||
|
f"not strictly > previous scenario ({prev:.1f} ns)"
|
||||||
|
)
|
||||||
|
prev = max(prev, by_name.get(n, {}).get("total_ns", prev))
|
||||||
|
|
||||||
|
if "remote_sip" in by_name and "remote_cube_best" in by_name:
|
||||||
|
if by_name["remote_sip"]["total_ns"] < by_name["remote_cube_best"]["total_ns"]:
|
||||||
|
issues.append(
|
||||||
|
f"no_congestion: remote_sip ({by_name['remote_sip']['total_ns']:.1f}) "
|
||||||
|
f"< remote_cube_best ({by_name['remote_cube_best']['total_ns']:.1f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# (2) same-cube → ucie == 0
|
||||||
|
for n in ("local", "same_cube_best", "same_cube_worst"):
|
||||||
|
if by_name.get(n, {}).get("ucie", 1) != 0:
|
||||||
|
issues.append(
|
||||||
|
f"no_congestion: {n} should have zero UCIe budget; "
|
||||||
|
f"got {by_name[n]['ucie']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# (3) remote-cube / remote-sip → ucie > 0
|
||||||
|
for n in ("remote_cube_best", "remote_cube_worst", "remote_sip"):
|
||||||
|
if by_name.get(n, {}).get("ucie", 0) <= 0:
|
||||||
|
issues.append(
|
||||||
|
f"no_congestion: {n} must have positive UCIe budget; "
|
||||||
|
f"got {by_name[n].get('ucie')}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# (4) breakdown consistency
|
||||||
|
for r in rows_no_cong + rows_cong:
|
||||||
|
actual = r.get("total_ns", r.get("makespan_ns", 0.0))
|
||||||
|
if actual <= 0:
|
||||||
|
continue
|
||||||
|
for cat, _ in CATEGORIES:
|
||||||
|
if r.get(cat, 0.0) < 0:
|
||||||
|
issues.append(f"{r['scenario']}: negative {cat}={r[cat]}")
|
||||||
|
formula_sum = sum(r.get(c, 0.0) for c, _ in CATEGORIES
|
||||||
|
if c != "contention")
|
||||||
|
if formula_sum > actual + 1e-3:
|
||||||
|
issues.append(
|
||||||
|
f"{r['scenario']}: formula sum {formula_sum:.1f} exceeds "
|
||||||
|
f"actual {actual:.1f} (categories overcount pipelined model)"
|
||||||
|
)
|
||||||
|
# For single-request scenarios the named categories must explain
|
||||||
|
# most of the latency. Cross-SIP paths cross two non-flit-aware
|
||||||
|
# boundaries (sip0.pcie_ep -> switch -> sip1.pcie_ep) which force
|
||||||
|
# store-and-forward re-streaming that the simple wormhole formula
|
||||||
|
# under-counts; allow a looser threshold for those rows. For
|
||||||
|
# congestion scenarios ``contention`` IS the primary signal, so
|
||||||
|
# don't bound its share — directional invariants in checks (6)
|
||||||
|
# and (7) cover that.
|
||||||
|
path_str = r.get("path") or r.get("first_path", "")
|
||||||
|
cross_sip = "switch0" in path_str
|
||||||
|
max_cont_frac = 0.7 if cross_sip else 0.5
|
||||||
|
if r.get("graph") == "no_congestion":
|
||||||
|
cont_frac = r.get("contention", 0.0) / actual
|
||||||
|
if cont_frac > max_cont_frac:
|
||||||
|
issues.append(
|
||||||
|
f"{r['scenario']}: contention fraction {cont_frac:.1%} > "
|
||||||
|
f"{max_cont_frac:.0%} in a single-request scenario — named "
|
||||||
|
f"categories should explain most latency "
|
||||||
|
f"(actual={actual:.1f}, cont={r['contention']:.1f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# (5) streaming matches nbytes / bottleneck within slack
|
||||||
|
# nbytes / bottleneck for local (256 GB/s) at 16 KB = 64ns (off by per-flit gap)
|
||||||
|
if "local" in by_name:
|
||||||
|
n = by_name["local"]
|
||||||
|
nbytes = n["nbytes"]
|
||||||
|
# streaming = (n_flits-1) * (256 / 256_gbs) for 256 GB/s = (n_flits-1) ns
|
||||||
|
n_flits = math.ceil(nbytes / 256)
|
||||||
|
expected = (n_flits - 1) * (256 / 256.0) # 256 GB/s pe→router bottleneck
|
||||||
|
got = n.get("streaming", 0)
|
||||||
|
if abs(got - expected) > expected * 0.05 + 0.5:
|
||||||
|
issues.append(
|
||||||
|
f"no_congestion local: streaming={got:.1f} vs expected≈{expected:.1f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# (6) congestion makespan monotonic with issuer count
|
||||||
|
seq = ["ctrl_hot_1", "ctrl_hot_2", "ctrl_hot_3"]
|
||||||
|
last = 0.0
|
||||||
|
for n in seq:
|
||||||
|
if n in cong_map and cong_map[n]["makespan_ns"] < last:
|
||||||
|
issues.append(
|
||||||
|
f"congestion: {n} makespan dropped below prior "
|
||||||
|
f"({cong_map[n]['makespan_ns']:.1f} < {last:.1f})"
|
||||||
|
)
|
||||||
|
last = cong_map.get(n, {}).get("makespan_ns", last)
|
||||||
|
|
||||||
|
# (7) 8-PE hotspot strictly slower than 3-PE
|
||||||
|
if "all_pe_to_pe0" in cong_map and "ctrl_hot_3" in cong_map:
|
||||||
|
if cong_map["all_pe_to_pe0"]["makespan_ns"] <= cong_map["ctrl_hot_3"]["makespan_ns"]:
|
||||||
|
issues.append(
|
||||||
|
f"congestion: all_pe_to_pe0 ({cong_map['all_pe_to_pe0']['makespan_ns']:.1f}) "
|
||||||
|
f"should exceed ctrl_hot_3 "
|
||||||
|
f"({cong_map['ctrl_hot_3']['makespan_ns']:.1f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
return issues
|
||||||
|
|
||||||
|
|
||||||
|
# ── Entry point ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def main(nbytes: int = DEFAULT_NBYTES) -> int:
|
||||||
|
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
print(f"== PE_DMA perf @ {nbytes} B per request ==")
|
||||||
|
print("Collecting NO-congestion scenarios...")
|
||||||
|
no_cong = _run_no_congestion(nbytes)
|
||||||
|
print("Collecting CONGESTION scenarios...")
|
||||||
|
cong = _run_congestion(nbytes)
|
||||||
|
|
||||||
|
print("\n-- No-congestion summary --")
|
||||||
|
for r in no_cong:
|
||||||
|
print(f" {r['scenario']:22s} total={r['total_ns']:7.1f} ns "
|
||||||
|
f"pe={r['pe_setup']:.1f} mesh={r['noc_mesh']:.1f} "
|
||||||
|
f"ucie={r['ucie']:.1f} stream={r['streaming']:.1f} "
|
||||||
|
f"hbm={r['hbm_ctrl']:.1f} cont={r['contention']:.1f}")
|
||||||
|
print("\n-- Congestion summary --")
|
||||||
|
for r in cong:
|
||||||
|
print(f" {r['scenario']:22s} makespan={r['makespan_ns']:7.1f} ns "
|
||||||
|
f"min={r['min_lat_ns']:7.1f} "
|
||||||
|
f"pe={r['pe_setup']:.1f} mesh={r['noc_mesh']:.1f} "
|
||||||
|
f"ucie={r['ucie']:.1f} stream={r['streaming']:.1f} "
|
||||||
|
f"hbm={r['hbm_ctrl']:.1f} cont={r['contention']:.1f}")
|
||||||
|
|
||||||
|
issues = _verify(no_cong, cong)
|
||||||
|
print("\n-- Self-verification --")
|
||||||
|
if not issues:
|
||||||
|
print(" PASS")
|
||||||
|
else:
|
||||||
|
for i, msg in enumerate(issues, 1):
|
||||||
|
print(f" [{i}] {msg}")
|
||||||
|
|
||||||
|
_plot_stacked(
|
||||||
|
no_cong, "total_ns",
|
||||||
|
f"PE_DMA latency breakdown (no congestion, nbytes={nbytes})",
|
||||||
|
OUT_DIR / "no_congestion.png",
|
||||||
|
)
|
||||||
|
_plot_stacked(
|
||||||
|
cong, "makespan_ns",
|
||||||
|
f"PE_DMA latency breakdown (congestion, makespan, nbytes={nbytes})",
|
||||||
|
OUT_DIR / "congestion.png",
|
||||||
|
)
|
||||||
|
_write_csv(no_cong, cong, OUT_DIR / "summary.csv")
|
||||||
|
|
||||||
|
print(f"\nWrote:\n {OUT_DIR / 'no_congestion.png'}\n"
|
||||||
|
f" {OUT_DIR / 'congestion.png'}\n"
|
||||||
|
f" {OUT_DIR / 'summary.csv'}")
|
||||||
|
|
||||||
|
return 0 if not issues else 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
p = argparse.ArgumentParser()
|
||||||
|
p.add_argument("--n-bytes", type=int, default=DEFAULT_NBYTES,
|
||||||
|
help="bytes per DMA (default 16384)")
|
||||||
|
args = p.parse_args()
|
||||||
|
raise SystemExit(main(nbytes=args.n_bytes))
|
||||||
Reference in New Issue
Block a user