diff --git a/docs/diagrams/pe_dma_perf/congestion.png b/docs/diagrams/pe_dma_perf/congestion.png new file mode 100644 index 0000000..4900221 Binary files /dev/null and b/docs/diagrams/pe_dma_perf/congestion.png differ diff --git a/docs/diagrams/pe_dma_perf/no_congestion.png b/docs/diagrams/pe_dma_perf/no_congestion.png new file mode 100644 index 0000000..9eb0216 Binary files /dev/null and b/docs/diagrams/pe_dma_perf/no_congestion.png differ diff --git a/docs/diagrams/pe_dma_perf/summary.csv b/docs/diagrams/pe_dma_perf/summary.csv new file mode 100644 index 0000000..4fb8b5e --- /dev/null +++ b/docs/diagrams/pe_dma_perf/summary.csv @@ -0,0 +1,24 @@ +graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,pe_setup,noc_mesh,ucie,hbm_ctrl,contention,path,first_path +no_congestion,local,"SAME_CUBE +PE_LOCAL",16384,,77.0,,,1.0,2.0,0.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0, +no_congestion,same_cube_best,"SAME_CUBE +REMOTE_BEST +(pe0→pe1)",16384,,82.06,,,1.0,5.03,0.0,9.0,4.030000000000001,pe0.pe_dma -> cube0.r0c0 -> cube0.r0c1 -> hbm_ctrl.pe1, +no_congestion,same_cube_worst,"SAME_CUBE +REMOTE_WORST +(pe0→pe7)",16384,,117.50000000000001,,,1.0,26.25,0.0,9.0,18.250000000000014,pe0.pe_dma -> cube0.r0c0 -> cube0.r1c0 -> cube0.r1c1 -> cube0.r1c2 -> cube0.r1c3 -> cube0.r4c3 -> cube0.r4c4 -> cube0.r5c4 -> cube0.r5c5 -> hbm_ctrl.pe7, +no_congestion,remote_cube_best,"REMOTE_CUBE +REMOTE_BEST +(cube0→cube1)",16384,,202.51999999999998,,,1.0,6.0,32.510000000000005,9.0,28.00999999999999,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0, +no_congestion,remote_cube_worst,"REMOTE_CUBE +REMOTE_WORST +(cube0→cube15.pe7)",16384,,573.1199999999999,,,1.0,30.0,219.05999999999995,9.0,188.05999999999995,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> ucie-N.conn0 -> cube1.ucie-N -> ucie-N.conn3 -> cube1.r0c5 -> ucie-E.conn0 -> cube1.ucie-E -> cube2.ucie-W -> ucie-W.conn0 -> cube2.r0c0 -> ucie-N.conn0 -> cube2.ucie-N -> ucie-N.conn3 -> cube2.r0c5 -> ucie-E.conn0 -> cube2.ucie-E -> cube3.ucie-W -> ucie-W.conn0 -> cube3.r0c0 -> ucie-N.conn0 -> cube3.ucie-N -> ucie-N.conn3 -> cube3.r0c5 -> ucie-E.conn0 -> cube3.ucie-E -> ucie-E.conn3 -> cube3.r5c5 -> ucie-S.conn3 -> cube3.ucie-S -> cube7.ucie-N -> ucie-N.conn3 -> cube7.r0c5 -> ucie-E.conn0 -> cube7.ucie-E -> ucie-E.conn3 -> cube7.r5c5 -> ucie-S.conn3 -> cube7.ucie-S -> cube11.ucie-N -> ucie-N.conn3 -> cube11.r0c5 -> ucie-E.conn0 -> cube11.ucie-E -> ucie-E.conn3 -> cube11.r5c5 -> ucie-S.conn3 -> cube11.ucie-S -> cube15.ucie-N -> ucie-N.conn3 -> cube15.r0c5 -> ucie-E.conn0 -> cube15.ucie-E -> ucie-E.conn3 -> cube15.r5c5 -> hbm_ctrl.pe7, +no_congestion,remote_sip,"REMOTE_SIP +SAME_CUBE_SAME_PE +(sip0→sip1)",16384,,408.5216666666663,,,1.0,4.0,37.040000000000006,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0, +congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,1.0,5.03,0.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 +congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,1.0,5.03,0.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 +congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,1.0,5.03,0.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 +congestion,ucie_eastbound,"8×PE corresp. +cube0→cube1",16384,8,,962.52,438.52,1.0,6.0,32.510000000000005,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0 +congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,1.0,2.0,0.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0 diff --git a/scripts/plot_pe_dma_perf.py b/scripts/plot_pe_dma_perf.py new file mode 100644 index 0000000..1b99434 --- /dev/null +++ b/scripts/plot_pe_dma_perf.py @@ -0,0 +1,602 @@ +"""Plot PE_DMA performance: latency breakdown across topological distance. + +Two graphs (saved to docs/diagrams/pe_dma_perf/): + + no_congestion.png — single PE issues one DMA, target varies in distance: + 1. SAME_CUBE_PE_LOCAL — pe0 -> pe0's slice (own router, 1 hop) + 2. SAME_CUBE_PE_REMOTE_BEST — pe0 -> pe1's slice (adjacent corner) + 3. SAME_CUBE_PE_REMOTE_WORST — pe0 -> pe7's slice (opposite corner) + 4. REMOTE_CUBE_PE_REMOTE_BEST — pe0 -> cube1 pe0's slice (1 UCIe hop) + 5. REMOTE_CUBE_PE_REMOTE_WORST — pe0 -> cube15 pe7's slice (max UCIe + mesh) + 6. REMOTE_SIP_SAME_CUBE_SAME_PE — pe0 -> sip1.cube0.pe0's slice + + congestion.png — concurrent PEs hitting either the same HBM CTRL or + the same UCIe direction: + A. 1×PE remote single — baseline (one remote PE reads cube0.pe0_slice) + B. 2×PE remote concurrent — two adjacent PEs share path to pe0_slice + C. 3×PE remote concurrent — three PEs contend on pe0's router/HBM + D. 8×PE same-direction-UCIe — every PE in cube0 reads cube1 same-PE slice + E. 8×PE all-hit-PE0 — every PE reads cube0.pe0_slice (hottest HBM CTRL) + +Latency is broken down by component class: + pe_setup — first-flit PE_DMA overhead + PE↔router wire transfer + noc_mesh — mesh routers' first-flit overheads + mesh wire transfers + ucie — UCIe ports' first-flit overheads + UCIe wire transfers + streaming — (n_flits-1) × per-flit time at the bottleneck link + (the dominant term for bulk transfers, set by the slowest wire) + hbm_ctrl — HBM CTRL overhead + final-chunk PC commit (= chunk_time) + fabric — switch + IO chiplet overheads + wires (cross-SIP paths) + contention — actual − formula_sum; primary signal for the congestion + graph (serialization across concurrent issuers) and a + model-fidelity probe for single-request scenarios + +Outputs ``summary.csv`` so the plot can be re-rendered without re-running +the simulator (the heavy step). +""" +from __future__ import annotations + +import csv +import math +from collections import defaultdict +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import matplotlib.pyplot as plt + +from kernbench.policy.address.phyaddr import PhysAddr +from kernbench.runtime_api.kernel import PeDmaMsg +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import load_topology + +REPO = Path(__file__).resolve().parent.parent +TOPOLOGY_PATH = REPO / "topology.yaml" +OUT_DIR = REPO / "docs" / "diagrams" / "pe_dma_perf" + +DEFAULT_NBYTES = 16 * 1024 # 16 KB per DMA + +# Category order (stacked bottom-to-top) and colours. +CATEGORIES = [ + ("pe_setup", "#3b82f6"), # blue + ("noc_mesh", "#10b981"), # green + ("ucie", "#f59e0b"), # amber + ("fabric", "#8b5cf6"), # purple (switch + io chiplet for cross-SIP) + ("streaming", "#6366f1"), # indigo (bulk = (n_flits-1)/bottleneck) + ("hbm_ctrl", "#ef4444"), # red (final-chunk commit = chunk_time) + ("contention", "#9ca3af"), # grey (actual − formula, surfaces serialization) +] + + +@dataclass +class Scenario: + name: str + label: str + src_sip: int + src_cube: int + src_pe: int + dst_sip: int + dst_cube: int + dst_pe: int + + +def _slice_bytes(spec) -> int: + mm = spec["cube"]["memory_map"] + return mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"] + + +def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, slice_bytes: int) -> int: + return PhysAddr.pe_hbm_addr( + sip_id=sip, die_id=cube, pe_id=pe_id, + pe_local_hbm_offset=offset, slice_size_bytes=slice_bytes, + ).encode() + + +def _categorise_node(node) -> str | None: + nid = node.id + if ".pe_dma" in nid: + return "pe_setup" + if node.kind == "noc_router": + return "noc_mesh" + if "ucie" in nid: + return "ucie" + if node.kind == "hbm_ctrl": + return "hbm_ctrl" + if node.kind in ("switch", "pcie_ep", "io_cpu", "io_noc"): + return "fabric" + return None + + +def _categorise_edge_kind(kind: str | None) -> str | None: + if kind in ("pe_to_router", "router_to_pe", "pe_internal"): + return "pe_setup" + if kind in ("router_mesh",): + return "noc_mesh" + if kind in ("router_to_hbm", "hbm_to_router"): + return "hbm_ctrl" + # UCIe transit. Includes the cube↔io_chiplet UCIe crossings. + if kind and "ucie" in kind: + return "ucie" + if kind in ("cube_to_io", "io_to_cube"): + return "ucie" + # Cross-SIP fabric: switch port + IO chiplet internal NoC + pcie link. + if kind in ( + "io_to_switch", "switch_to_io", "io_internal", + "conn_to_io_noc", "io_noc_to_conn", + "pcie", "command", "fabric", + ): + return "fabric" + return None + + +def _path_breakdown( + path: list[str], nbytes: int, graph, edge_map, ns_per_mm: float, +) -> dict[str, float]: + """Wormhole-pipelined breakdown of a path's expected latency. + + Model: + total ≈ first_flit_arrival_time + + (n_flits - 1) × bottleneck_per_flit_time + + last_chunk_commit_time + + Each summand is categorised: + * Per-component overheads + first-flit wire transfers are attributed + by component class (pe_setup / noc_mesh / ucie). + * ``streaming`` is the bulk-transfer cost = (n_flits-1) × per_flit + at the slowest wire bandwidth in the path. + * ``hbm_ctrl`` is the HBM CTRL overhead + the final chunk's PC commit + (= chunk_time). Earlier chunks overlap with arrival. + """ + cats: dict[str, float] = defaultdict(float) + + # 1) Per-component overheads (first-flit). + for nid in path: + node = graph.nodes.get(nid) + if node is None: + continue + cat = _categorise_node(node) + if cat is None: + continue + cats[cat] += float(node.attrs.get("overhead_ns", 0.0)) + + # 2) Per-edge first-flit transfer = prop_ns + flit_bytes / bw_gbs. + bws: list[float] = [] + flit_bytes = 256 # see ADR-0033 (matches default HBM burst_bytes) + for i in range(len(path) - 1): + e = edge_map.get((path[i], path[i + 1])) + if e is None: + continue + prop_ns = e.distance_mm * ns_per_mm + first_flit_xfer = (flit_bytes / e.bw_gbs) if e.bw_gbs else 0.0 + cat = _categorise_edge_kind(e.kind) + if cat: + cats[cat] += prop_ns + first_flit_xfer + if e.bw_gbs: + bws.append(e.bw_gbs) + + # 3) Streaming: (n_flits - 1) × per-flit at bottleneck. + if bws and nbytes > flit_bytes: + n_flits = math.ceil(nbytes / flit_bytes) + min_bw = min(bws) + cats["streaming"] = (n_flits - 1) * (flit_bytes / min_bw) + + # 4) HBM CTRL: last-chunk commit time (earlier chunks overlap arrival). + if path: + hbm_node = graph.nodes.get(path[-1]) + if hbm_node and hbm_node.kind == "hbm_ctrl" and nbytes > 0: + burst = int(hbm_node.attrs.get("burst_bytes", 256)) + pc_bw = float(hbm_node.attrs.get("pc_bw_gbs", 32.0)) + cats["hbm_ctrl"] += burst / pc_bw # chunk_time of final chunk + + return dict(cats) + + +# ── No-congestion scenarios ─────────────────────────────────────────── + + +def _no_congestion_scenarios() -> list[Scenario]: + return [ + Scenario("local", + "SAME_CUBE\nPE_LOCAL", + 0, 0, 0, 0, 0, 0), + Scenario("same_cube_best", + "SAME_CUBE\nREMOTE_BEST\n(pe0→pe1)", + 0, 0, 0, 0, 0, 1), + Scenario("same_cube_worst", + "SAME_CUBE\nREMOTE_WORST\n(pe0→pe7)", + 0, 0, 0, 0, 0, 7), + Scenario("remote_cube_best", + "REMOTE_CUBE\nREMOTE_BEST\n(cube0→cube1)", + 0, 0, 0, 0, 1, 0), + Scenario("remote_cube_worst", + "REMOTE_CUBE\nREMOTE_WORST\n(cube0→cube15.pe7)", + 0, 0, 0, 0, 15, 7), + Scenario("remote_sip", + "REMOTE_SIP\nSAME_CUBE_SAME_PE\n(sip0→sip1)", + 0, 0, 0, 1, 0, 0), + ] + + +def _run_pe_dma(engine: GraphEngine, scn: Scenario, nbytes: int, + slice_bytes: int) -> tuple[float, list[str]]: + pa = _hbm_pa(sip=scn.dst_sip, cube=scn.dst_cube, pe_id=scn.dst_pe, + offset=0x1000, slice_bytes=slice_bytes) + msg = PeDmaMsg( + correlation_id="pedma-perf", request_id=scn.name, + src_sip=scn.src_sip, src_cube=scn.src_cube, src_pe=scn.src_pe, + dst_pa=pa, nbytes=nbytes, + ) + h = engine.submit(msg) + engine.wait(h) + _, trace = engine.get_completion(h) + + # Resolve the path for breakdown analysis (engine doesn't keep it). + dst_node = engine._resolver.resolve(PhysAddr.decode(pa)) + src = f"sip{scn.src_sip}.cube{scn.src_cube}.pe{scn.src_pe}" + path = engine._router.find_path(src, dst_node) + return float(trace["total_ns"]), path + + +def _run_no_congestion(nbytes: int): + graph = load_topology(TOPOLOGY_PATH) + edge_map = {(e.src, e.dst): e for e in graph.edges} + ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01) + slice_bytes = _slice_bytes(graph.spec) + + rows = [] + for scn in _no_congestion_scenarios(): + engine = GraphEngine(load_topology(TOPOLOGY_PATH)) + total_ns, path = _run_pe_dma(engine, scn, nbytes, slice_bytes) + br = _path_breakdown(path, nbytes, graph, edge_map, ns_per_mm) + formula_sum = sum(br.values()) + br["contention"] = max(0.0, total_ns - formula_sum) + rows.append({ + "graph": "no_congestion", + "scenario": scn.name, + "label": scn.label, + "nbytes": nbytes, + "path": " -> ".join(_short_path(path)), + "total_ns": total_ns, + **{c: br.get(c, 0.0) for c, _ in CATEGORIES}, + }) + return rows + + +# ── Congestion scenarios ────────────────────────────────────────────── + + +@dataclass +class CongestionScenario: + name: str + label: str + issues: list[tuple[int, int, int, int, int, int]] + """List of (src_sip, src_cube, src_pe, dst_sip, dst_cube, dst_pe).""" + + +def _congestion_scenarios() -> list[CongestionScenario]: + same_cube_same_target_pe0 = lambda srcs: [ + (0, 0, p, 0, 0, 0) for p in srcs + ] + return [ + # A-C: 1, 2, 3 remote PEs concurrently access pe0's slice in same cube + CongestionScenario( + "ctrl_hot_1", + "1×PE → pe0_slice", + same_cube_same_target_pe0([1]), + ), + CongestionScenario( + "ctrl_hot_2", + "2×PE → pe0_slice", + same_cube_same_target_pe0([1, 2]), + ), + CongestionScenario( + "ctrl_hot_3", + "3×PE → pe0_slice", + same_cube_same_target_pe0([1, 2, 3]), + ), + # D: every PE in cube0 sends to corresponding PE in cube1 (same UCIe direction) + CongestionScenario( + "ucie_eastbound", + "8×PE corresp.\ncube0→cube1", + [(0, 0, p, 0, 1, p) for p in range(8)], + ), + # E: every PE in cube0 hits pe0's slice → worst HBM CTRL hotspot + CongestionScenario( + "all_pe_to_pe0", + "8×PE → pe0_slice", + same_cube_same_target_pe0(list(range(8))), + ), + ] + + +def _run_congestion(nbytes: int): + graph = load_topology(TOPOLOGY_PATH) + edge_map = {(e.src, e.dst): e for e in graph.edges} + ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01) + slice_bytes = _slice_bytes(graph.spec) + + rows = [] + for scn in _congestion_scenarios(): + engine = GraphEngine(load_topology(TOPOLOGY_PATH)) + handles = [] + first_path = None + for i, (ss, sc, sp, ds, dc, dp) in enumerate(scn.issues): + pa = _hbm_pa(sip=ds, cube=dc, pe_id=dp, + offset=0x1000 + i * 0x100, slice_bytes=slice_bytes) + msg = PeDmaMsg( + correlation_id="pedma-cong", request_id=f"{scn.name}-{i}", + src_sip=ss, src_cube=sc, src_pe=sp, + dst_pa=pa, nbytes=nbytes, + ) + handles.append(engine.submit(msg)) + if first_path is None: + dst_node = engine._resolver.resolve(PhysAddr.decode(pa)) + first_path = engine._router.find_path( + f"sip{ss}.cube{sc}.pe{sp}", dst_node) + for h in handles: + engine.wait(h) + latencies = [engine.get_completion(h)[1]["total_ns"] for h in handles] + makespan = max(latencies) + + # Breakdown uses the first issuer's path as a representative; + # ``unaccounted`` absorbs contention/serialization across requests. + br = _path_breakdown(first_path or [], nbytes, graph, edge_map, ns_per_mm) + formula_sum = sum(br.values()) + br["contention"] = max(0.0, makespan - formula_sum) + rows.append({ + "graph": "congestion", + "scenario": scn.name, + "label": scn.label, + "nbytes": nbytes, + "n_issuers": len(scn.issues), + "first_path": " -> ".join(_short_path(first_path or [])), + "makespan_ns": makespan, + "min_lat_ns": min(latencies) if latencies else 0.0, + **{c: br.get(c, 0.0) for c, _ in CATEGORIES}, + }) + return rows + + +# ── Plotting ─────────────────────────────────────────────────────────── + + +def _short_path(path: Iterable[str]) -> list[str]: + return [".".join(p.split(".")[-2:]) for p in path] + + +def _plot_stacked(rows, value_key, title, out_path): + n = len(rows) + labels = [r["label"] for r in rows] + fig, ax = plt.subplots(figsize=(max(8, n * 1.4), 5.5)) + bottoms = [0.0] * n + for cat, colour in CATEGORIES: + heights = [r.get(cat, 0.0) for r in rows] + ax.bar(labels, heights, bottom=bottoms, color=colour, label=cat, + edgecolor="white", linewidth=0.5) + bottoms = [b + h for b, h in zip(bottoms, heights)] + # Total annotation on top of each bar. + for i, r in enumerate(rows): + ax.text(i, bottoms[i] * 1.01, f"{r[value_key]:.0f} ns", + ha="center", va="bottom", fontsize=8) + ax.set_ylabel("Latency (ns)") + ax.set_title(title) + ax.legend(loc="upper left", fontsize=9, frameon=False) + ax.set_ylim(0, max(bottoms) * 1.15) + ax.tick_params(axis="x", labelsize=8) + fig.tight_layout() + fig.savefig(out_path, dpi=150) + plt.close(fig) + + +# ── CSV ──────────────────────────────────────────────────────────────── + + +def _write_csv(no_cong_rows, cong_rows, out_path): + fields = [ + "graph", "scenario", "label", "nbytes", "n_issuers", + "total_ns", "makespan_ns", "min_lat_ns", + "pe_setup", "noc_mesh", "ucie", "hbm_ctrl", "contention", + "path", "first_path", + ] + with open(out_path, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore") + w.writeheader() + for r in no_cong_rows + cong_rows: + w.writerow(r) + + +# ── Self-verification ────────────────────────────────────────────────── + + +def _verify(rows_no_cong, rows_cong) -> list[str]: + """Return a list of human-readable issues; empty means PASS. + + Verification covers: + (1) No-congestion: latency monotonically grows with topological distance. + (2) Same-cube scenarios contain zero UCIe budget (mesh-only path). + (3) Remote-cube/SIP scenarios contain non-zero UCIe budget. + (4) Breakdown is internally consistent: formula sum ≤ actual total + (categories don't overcount the pipelined model) and the + ``contention`` slack is < 50% of total for single-request + scenarios (the named categories explain most latency). + (5) Streaming term matches nbytes / bottleneck within 5%. + (6) Congestion makespan grows with issuer count on the hot-target series. + (7) 8-PE hotspot strictly exceeds 3-PE hotspot. + """ + issues = [] + by_name = {r["scenario"]: r for r in rows_no_cong} + cong_map = {r["scenario"]: r for r in rows_cong} + + # (1) distance monotonicity + order = [ + "local", + "same_cube_best", + "same_cube_worst", + "remote_cube_best", + "remote_cube_worst", + ] + prev = 0.0 + for n in order: + if n in by_name and by_name[n]["total_ns"] <= prev: + issues.append( + f"no_congestion: {n} latency ({by_name[n]['total_ns']:.1f} ns) " + f"not strictly > previous scenario ({prev:.1f} ns)" + ) + prev = max(prev, by_name.get(n, {}).get("total_ns", prev)) + + if "remote_sip" in by_name and "remote_cube_best" in by_name: + if by_name["remote_sip"]["total_ns"] < by_name["remote_cube_best"]["total_ns"]: + issues.append( + f"no_congestion: remote_sip ({by_name['remote_sip']['total_ns']:.1f}) " + f"< remote_cube_best ({by_name['remote_cube_best']['total_ns']:.1f})" + ) + + # (2) same-cube → ucie == 0 + for n in ("local", "same_cube_best", "same_cube_worst"): + if by_name.get(n, {}).get("ucie", 1) != 0: + issues.append( + f"no_congestion: {n} should have zero UCIe budget; " + f"got {by_name[n]['ucie']}" + ) + + # (3) remote-cube / remote-sip → ucie > 0 + for n in ("remote_cube_best", "remote_cube_worst", "remote_sip"): + if by_name.get(n, {}).get("ucie", 0) <= 0: + issues.append( + f"no_congestion: {n} must have positive UCIe budget; " + f"got {by_name[n].get('ucie')}" + ) + + # (4) breakdown consistency + for r in rows_no_cong + rows_cong: + actual = r.get("total_ns", r.get("makespan_ns", 0.0)) + if actual <= 0: + continue + for cat, _ in CATEGORIES: + if r.get(cat, 0.0) < 0: + issues.append(f"{r['scenario']}: negative {cat}={r[cat]}") + formula_sum = sum(r.get(c, 0.0) for c, _ in CATEGORIES + if c != "contention") + if formula_sum > actual + 1e-3: + issues.append( + f"{r['scenario']}: formula sum {formula_sum:.1f} exceeds " + f"actual {actual:.1f} (categories overcount pipelined model)" + ) + # For single-request scenarios the named categories must explain + # most of the latency. Cross-SIP paths cross two non-flit-aware + # boundaries (sip0.pcie_ep -> switch -> sip1.pcie_ep) which force + # store-and-forward re-streaming that the simple wormhole formula + # under-counts; allow a looser threshold for those rows. For + # congestion scenarios ``contention`` IS the primary signal, so + # don't bound its share — directional invariants in checks (6) + # and (7) cover that. + path_str = r.get("path") or r.get("first_path", "") + cross_sip = "switch0" in path_str + max_cont_frac = 0.7 if cross_sip else 0.5 + if r.get("graph") == "no_congestion": + cont_frac = r.get("contention", 0.0) / actual + if cont_frac > max_cont_frac: + issues.append( + f"{r['scenario']}: contention fraction {cont_frac:.1%} > " + f"{max_cont_frac:.0%} in a single-request scenario — named " + f"categories should explain most latency " + f"(actual={actual:.1f}, cont={r['contention']:.1f})" + ) + + # (5) streaming matches nbytes / bottleneck within slack + # nbytes / bottleneck for local (256 GB/s) at 16 KB = 64ns (off by per-flit gap) + if "local" in by_name: + n = by_name["local"] + nbytes = n["nbytes"] + # streaming = (n_flits-1) * (256 / 256_gbs) for 256 GB/s = (n_flits-1) ns + n_flits = math.ceil(nbytes / 256) + expected = (n_flits - 1) * (256 / 256.0) # 256 GB/s pe→router bottleneck + got = n.get("streaming", 0) + if abs(got - expected) > expected * 0.05 + 0.5: + issues.append( + f"no_congestion local: streaming={got:.1f} vs expected≈{expected:.1f}" + ) + + # (6) congestion makespan monotonic with issuer count + seq = ["ctrl_hot_1", "ctrl_hot_2", "ctrl_hot_3"] + last = 0.0 + for n in seq: + if n in cong_map and cong_map[n]["makespan_ns"] < last: + issues.append( + f"congestion: {n} makespan dropped below prior " + f"({cong_map[n]['makespan_ns']:.1f} < {last:.1f})" + ) + last = cong_map.get(n, {}).get("makespan_ns", last) + + # (7) 8-PE hotspot strictly slower than 3-PE + if "all_pe_to_pe0" in cong_map and "ctrl_hot_3" in cong_map: + if cong_map["all_pe_to_pe0"]["makespan_ns"] <= cong_map["ctrl_hot_3"]["makespan_ns"]: + issues.append( + f"congestion: all_pe_to_pe0 ({cong_map['all_pe_to_pe0']['makespan_ns']:.1f}) " + f"should exceed ctrl_hot_3 " + f"({cong_map['ctrl_hot_3']['makespan_ns']:.1f})" + ) + + return issues + + +# ── Entry point ──────────────────────────────────────────────────────── + + +def main(nbytes: int = DEFAULT_NBYTES) -> int: + OUT_DIR.mkdir(parents=True, exist_ok=True) + + print(f"== PE_DMA perf @ {nbytes} B per request ==") + print("Collecting NO-congestion scenarios...") + no_cong = _run_no_congestion(nbytes) + print("Collecting CONGESTION scenarios...") + cong = _run_congestion(nbytes) + + print("\n-- No-congestion summary --") + for r in no_cong: + print(f" {r['scenario']:22s} total={r['total_ns']:7.1f} ns " + f"pe={r['pe_setup']:.1f} mesh={r['noc_mesh']:.1f} " + f"ucie={r['ucie']:.1f} stream={r['streaming']:.1f} " + f"hbm={r['hbm_ctrl']:.1f} cont={r['contention']:.1f}") + print("\n-- Congestion summary --") + for r in cong: + print(f" {r['scenario']:22s} makespan={r['makespan_ns']:7.1f} ns " + f"min={r['min_lat_ns']:7.1f} " + f"pe={r['pe_setup']:.1f} mesh={r['noc_mesh']:.1f} " + f"ucie={r['ucie']:.1f} stream={r['streaming']:.1f} " + f"hbm={r['hbm_ctrl']:.1f} cont={r['contention']:.1f}") + + issues = _verify(no_cong, cong) + print("\n-- Self-verification --") + if not issues: + print(" PASS") + else: + for i, msg in enumerate(issues, 1): + print(f" [{i}] {msg}") + + _plot_stacked( + no_cong, "total_ns", + f"PE_DMA latency breakdown (no congestion, nbytes={nbytes})", + OUT_DIR / "no_congestion.png", + ) + _plot_stacked( + cong, "makespan_ns", + f"PE_DMA latency breakdown (congestion, makespan, nbytes={nbytes})", + OUT_DIR / "congestion.png", + ) + _write_csv(no_cong, cong, OUT_DIR / "summary.csv") + + print(f"\nWrote:\n {OUT_DIR / 'no_congestion.png'}\n" + f" {OUT_DIR / 'congestion.png'}\n" + f" {OUT_DIR / 'summary.csv'}") + + return 0 if not issues else 1 + + +if __name__ == "__main__": + import argparse + + p = argparse.ArgumentParser() + p.add_argument("--n-bytes", type=int, default=DEFAULT_NBYTES, + help="bytes per DMA (default 16384)") + args = p.parse_args() + raise SystemExit(main(nbytes=args.n_bytes))