kernbench2/scripts/plot_pe_dma_perf.py

"""Plot PE_DMA Effective BW utilization across topological distance.

Two graphs (saved to docs/diagrams/pe_dma_perf/):

  no_congestion.png — single PE issues one DMA, target varies in distance:
    1. SAME_CUBE_PE_LOCAL        — pe0 -> pe0's slice (own router, 1 hop)
    2. SAME_CUBE_PE_REMOTE_BEST  — pe0 -> pe1's slice (adjacent corner)
    3. SAME_CUBE_PE_REMOTE_WORST — pe0 -> pe7's slice (opposite corner)
    4. REMOTE_CUBE_PE_REMOTE_BEST  — pe0 -> cube1 pe0's slice (1 UCIe hop)
    5. REMOTE_CUBE_PE_REMOTE_WORST — pe0 -> cube15 pe7's slice (max UCIe + mesh)
    6. REMOTE_SIP_SAME_CUBE_SAME_PE — pe0 -> sip1.cube0.pe0's slice

  congestion.png — concurrent PEs hitting either the same HBM CTRL or
    the same UCIe direction:
    A. 1×PE remote single — baseline (one remote PE reads cube0.pe0_slice)
    B. 2×PE remote concurrent — two adjacent PEs share path to pe0_slice
    C. 3×PE remote concurrent — three PEs contend on pe0's router/HBM
    D. 8×PE same-direction-UCIe — every PE in cube0 reads cube1 same-PE slice
    E. 8×PE all-hit-PE0 — every PE reads cube0.pe0_slice (hottest HBM CTRL)

Effective BW = (total bytes transferred) / (wall-clock time)
  no_congestion: nbytes / total_ns
  congestion:    n_issuers × nbytes / makespan_ns  (aggregate throughput)

Peak BW = the path bottleneck (slowest single-edge bandwidth on the
first issuer's path). For shared-resource congestion scenarios the
aggregate effective BW can exceed this single-path peak when the
shared resource provides parallel lanes (e.g. UCIe has 4 connections
× 128 GB/s = 512 GB/s aggregate even though each connection is 128).

Utilization% = effective / peak × 100.

Outputs ``summary.csv`` (including breakdown components for any future
analysis) so the plot can be re-rendered without re-running the
simulator.
"""
from __future__ import annotations

import csv
import math
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import matplotlib.pyplot as plt

from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import PeDmaMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology

REPO = Path(__file__).resolve().parent.parent
TOPOLOGY_PATH = REPO / "topology.yaml"
OUT_DIR = REPO / "docs" / "diagrams" / "pe_dma_perf"

DEFAULT_NBYTES = 16 * 1024  # 16 KB per DMA

# Category order (stacked bottom-to-top) and colours.
CATEGORIES = [
    ("pe_setup",  "#3b82f6"),     # blue
    ("noc_mesh",  "#10b981"),     # green
    ("ucie",      "#f59e0b"),     # amber
    ("fabric",    "#8b5cf6"),     # purple   (switch + io chiplet for cross-SIP)
    ("streaming", "#6366f1"),     # indigo   (bulk = (n_flits-1)/bottleneck)
    ("hbm_ctrl",  "#ef4444"),     # red      (final-chunk commit = chunk_time)
    ("contention", "#9ca3af"),    # grey     (actual − formula, surfaces serialization)
]


@dataclass
class Scenario:
    name: str
    label: str
    src_sip: int
    src_cube: int
    src_pe: int
    dst_sip: int
    dst_cube: int
    dst_pe: int


def _slice_bytes(spec) -> int:
    mm = spec["cube"]["memory_map"]
    return mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"]


def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, slice_bytes: int) -> int:
    return PhysAddr.pe_hbm_addr(
        sip_id=sip, die_id=cube, pe_id=pe_id,
        pe_local_hbm_offset=offset, slice_size_bytes=slice_bytes,
    ).encode()


def _categorise_node(node) -> str | None:
    nid = node.id
    if ".pe_dma" in nid:
        return "pe_setup"
    if node.kind == "noc_router":
        return "noc_mesh"
    if "ucie" in nid:
        return "ucie"
    if node.kind == "hbm_ctrl":
        return "hbm_ctrl"
    if node.kind in ("switch", "pcie_ep", "io_cpu", "io_noc"):
        return "fabric"
    return None


def _categorise_edge_kind(kind: str | None) -> str | None:
    if kind in ("pe_to_router", "router_to_pe", "pe_internal"):
        return "pe_setup"
    if kind in ("router_mesh",):
        return "noc_mesh"
    if kind in ("router_to_hbm", "hbm_to_router"):
        return "hbm_ctrl"
    # UCIe transit. Includes the cube↔io_chiplet UCIe crossings.
    if kind and "ucie" in kind:
        return "ucie"
    if kind in ("cube_to_io", "io_to_cube"):
        return "ucie"
    # Cross-SIP fabric: switch port + IO chiplet internal NoC + pcie link.
    if kind in (
        "io_to_switch", "switch_to_io", "io_internal",
        "conn_to_io_noc", "io_noc_to_conn",
        "pcie", "command", "fabric",
    ):
        return "fabric"
    return None


def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Min ``bw_gbs`` over edges with positive bandwidth on the path."""
    bws = [e.bw_gbs for i in range(len(path) - 1)
            if (e := edge_map.get((path[i], path[i + 1]))) and e.bw_gbs]
    return min(bws) if bws else None


def _aggregate_peak_bw(paths: list[list[str]], edge_map: dict) -> float:
    """Max-min fair-share aggregate throughput across concurrent paths.

    Each path is one unit of demand from source to destination. For each
    edge, fair share per path = ``bw_gbs / usage_count``. A path's
    sustainable throughput is the minimum fair share along its edges,
    and the aggregate peak is the sum across paths. This produces the
    correct answer for both shared-bottleneck scenarios (all paths
    converge on one wire → aggregate = wire BW) and multi-lane shared
    resources (UCIe's 4 connections used in parallel → aggregate = 4 ×
    per-conn BW), without enumerating max-flow explicitly.

    Examples:
      * 3 paths sharing r0c0→hbm_ctrl.pe0 @ 256 GB/s
          per-path = 256/3 ≈ 85.3, aggregate = 3 × 85.3 = 256 GB/s ✓
      * 8 paths sharing 4 UCIe conns @ 128 GB/s (2 paths per conn)
          per-path = 128/2 = 64, aggregate = 8 × 64 = 512 GB/s ✓
      * 1 path through 256 GB/s bottleneck
          per-path = 256, aggregate = 256 GB/s ✓ (= single-path peak)
    """
    from collections import Counter

    edge_usage: Counter = Counter()
    for path in paths:
        for i in range(len(path) - 1):
            edge_usage[(path[i], path[i + 1])] += 1

    aggregate = 0.0
    for path in paths:
        per_path = float("inf")
        for i in range(len(path) - 1):
            key = (path[i], path[i + 1])
            e = edge_map.get(key)
            if e and e.bw_gbs:
                share = e.bw_gbs / edge_usage[key]
                if share < per_path:
                    per_path = share
        if per_path != float("inf"):
            aggregate += per_path
    return aggregate


def _path_breakdown(
    path: list[str], nbytes: int, graph, edge_map, ns_per_mm: float,
) -> dict[str, float]:
    """Wormhole-pipelined breakdown of a path's expected latency.

    Model:
      total ≈ first_flit_arrival_time
              + (n_flits - 1) × bottleneck_per_flit_time
              + last_chunk_commit_time

    Each summand is categorised:
      * Per-component overheads + first-flit wire transfers are attributed
        by component class (pe_setup / noc_mesh / ucie).
      * ``streaming`` is the bulk-transfer cost = (n_flits-1) × per_flit
        at the slowest wire bandwidth in the path.
      * ``hbm_ctrl`` is the HBM CTRL overhead + the final chunk's PC commit
        (= chunk_time). Earlier chunks overlap with arrival.
    """
    cats: dict[str, float] = defaultdict(float)

    # 1) Per-component overheads (first-flit).
    for nid in path:
        node = graph.nodes.get(nid)
        if node is None:
            continue
        cat = _categorise_node(node)
        if cat is None:
            continue
        cats[cat] += float(node.attrs.get("overhead_ns", 0.0))

    # 2) Per-edge first-flit transfer = prop_ns + flit_bytes / bw_gbs.
    bws: list[float] = []
    flit_bytes = 256  # see ADR-0033 (matches default HBM burst_bytes)
    for i in range(len(path) - 1):
        e = edge_map.get((path[i], path[i + 1]))
        if e is None:
            continue
        prop_ns = e.distance_mm * ns_per_mm
        first_flit_xfer = (flit_bytes / e.bw_gbs) if e.bw_gbs else 0.0
        cat = _categorise_edge_kind(e.kind)
        if cat:
            cats[cat] += prop_ns + first_flit_xfer
        if e.bw_gbs:
            bws.append(e.bw_gbs)

    # 3) Streaming: (n_flits - 1) × per-flit at bottleneck.
    if bws and nbytes > flit_bytes:
        n_flits = math.ceil(nbytes / flit_bytes)
        min_bw = min(bws)
        cats["streaming"] = (n_flits - 1) * (flit_bytes / min_bw)

    # 4) HBM CTRL: last-chunk commit time (earlier chunks overlap arrival).
    if path:
        hbm_node = graph.nodes.get(path[-1])
        if hbm_node and hbm_node.kind == "hbm_ctrl" and nbytes > 0:
            burst = int(hbm_node.attrs.get("burst_bytes", 256))
            pc_bw = float(hbm_node.attrs.get("pc_bw_gbs", 32.0))
            cats["hbm_ctrl"] += burst / pc_bw  # chunk_time of final chunk

    return dict(cats)


# ── No-congestion scenarios ───────────────────────────────────────────


def _no_congestion_scenarios() -> list[Scenario]:
    return [
        Scenario("local",
                 "SAME_CUBE\nPE_LOCAL",
                 0, 0, 0, 0, 0, 0),
        Scenario("same_cube_best",
                 "SAME_CUBE\nREMOTE_BEST\n(pe0→pe1)",
                 0, 0, 0, 0, 0, 1),
        Scenario("same_cube_worst",
                 "SAME_CUBE\nREMOTE_WORST\n(pe0→pe7)",
                 0, 0, 0, 0, 0, 7),
        Scenario("remote_cube_best",
                 "REMOTE_CUBE\nREMOTE_BEST\n(cube0→cube1)",
                 0, 0, 0, 0, 1, 0),
        Scenario("remote_cube_worst",
                 "REMOTE_CUBE\nREMOTE_WORST\n(cube0→cube15.pe7)",
                 0, 0, 0, 0, 15, 7),
        Scenario("remote_sip",
                 "REMOTE_SIP\nSAME_CUBE_SAME_PE\n(sip0→sip1)",
                 0, 0, 0, 1, 0, 0),
    ]


def _run_pe_dma(engine: GraphEngine, scn: Scenario, nbytes: int,
                slice_bytes: int) -> tuple[float, list[str]]:
    pa = _hbm_pa(sip=scn.dst_sip, cube=scn.dst_cube, pe_id=scn.dst_pe,
                 offset=0x1000, slice_bytes=slice_bytes)
    msg = PeDmaMsg(
        correlation_id="pedma-perf", request_id=scn.name,
        src_sip=scn.src_sip, src_cube=scn.src_cube, src_pe=scn.src_pe,
        dst_pa=pa, nbytes=nbytes,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)

    # Resolve the path for breakdown analysis (engine doesn't keep it).
    dst_node = engine._resolver.resolve(PhysAddr.decode(pa))
    src = f"sip{scn.src_sip}.cube{scn.src_cube}.pe{scn.src_pe}"
    path = engine._router.find_path(src, dst_node)
    return float(trace["total_ns"]), path


def _run_no_congestion(nbytes: int):
    graph = load_topology(TOPOLOGY_PATH)
    edge_map = {(e.src, e.dst): e for e in graph.edges}
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
    slice_bytes = _slice_bytes(graph.spec)

    rows = []
    for scn in _no_congestion_scenarios():
        engine = GraphEngine(load_topology(TOPOLOGY_PATH))
        total_ns, path = _run_pe_dma(engine, scn, nbytes, slice_bytes)
        br = _path_breakdown(path, nbytes, graph, edge_map, ns_per_mm)
        formula_sum = sum(br.values())
        br["contention"] = max(0.0, total_ns - formula_sum)
        peak_single = _bottleneck_bw(path, edge_map) or 0.0
        peak_aggregate = _aggregate_peak_bw([path], edge_map)
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
        util_single = (eff_bw / peak_single * 100.0) if peak_single > 0 else 0.0
        util_aggregate = (eff_bw / peak_aggregate * 100.0) if peak_aggregate > 0 else 0.0
        rows.append({
            "graph": "no_congestion",
            "scenario": scn.name,
            "label": scn.label,
            "nbytes": nbytes,
            "n_issuers": 1,
            "path": " -> ".join(_short_path(path)),
            "total_ns": total_ns,
            "peak_single_bw_gbs": peak_single,
            "peak_aggregate_bw_gbs": peak_aggregate,
            "effective_bw_gbs": eff_bw,
            "util_single_pct": util_single,
            "util_aggregate_pct": util_aggregate,
            **{c: br.get(c, 0.0) for c, _ in CATEGORIES},
        })
    return rows


# ── Congestion scenarios ──────────────────────────────────────────────


@dataclass
class CongestionScenario:
    name: str
    label: str
    issues: list[tuple[int, int, int, int, int, int]]
    """List of (src_sip, src_cube, src_pe, dst_sip, dst_cube, dst_pe)."""


def _congestion_scenarios() -> list[CongestionScenario]:
    same_cube_same_target_pe0 = lambda srcs: [
        (0, 0, p, 0, 0, 0) for p in srcs
    ]
    return [
        # A-C: 1, 2, 3 remote PEs concurrently access pe0's slice in same cube
        CongestionScenario(
            "ctrl_hot_1",
            "1×PE → pe0_slice",
            same_cube_same_target_pe0([1]),
        ),
        CongestionScenario(
            "ctrl_hot_2",
            "2×PE → pe0_slice",
            same_cube_same_target_pe0([1, 2]),
        ),
        CongestionScenario(
            "ctrl_hot_3",
            "3×PE → pe0_slice",
            same_cube_same_target_pe0([1, 2, 3]),
        ),
        # D: every PE in cube0 sends to corresponding PE in cube1 (same UCIe direction)
        CongestionScenario(
            "ucie_eastbound",
            "8×PE corresp.\ncube0→cube1",
            [(0, 0, p, 0, 1, p) for p in range(8)],
        ),
        # E: every PE in cube0 hits pe0's slice → worst HBM CTRL hotspot
        CongestionScenario(
            "all_pe_to_pe0",
            "8×PE → pe0_slice",
            same_cube_same_target_pe0(list(range(8))),
        ),
    ]


def _run_congestion(nbytes: int):
    graph = load_topology(TOPOLOGY_PATH)
    edge_map = {(e.src, e.dst): e for e in graph.edges}
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
    slice_bytes = _slice_bytes(graph.spec)

    rows = []
    for scn in _congestion_scenarios():
        engine = GraphEngine(load_topology(TOPOLOGY_PATH))
        handles = []
        paths: list[list[str]] = []
        for i, (ss, sc, sp, ds, dc, dp) in enumerate(scn.issues):
            pa = _hbm_pa(sip=ds, cube=dc, pe_id=dp,
                         offset=0x1000 + i * 0x100, slice_bytes=slice_bytes)
            msg = PeDmaMsg(
                correlation_id="pedma-cong", request_id=f"{scn.name}-{i}",
                src_sip=ss, src_cube=sc, src_pe=sp,
                dst_pa=pa, nbytes=nbytes,
            )
            handles.append(engine.submit(msg))
            dst_node = engine._resolver.resolve(PhysAddr.decode(pa))
            paths.append(engine._router.find_path(
                f"sip{ss}.cube{sc}.pe{sp}", dst_node))
        first_path = paths[0] if paths else []
        for h in handles:
            engine.wait(h)
        latencies = [engine.get_completion(h)[1]["total_ns"] for h in handles]
        makespan = max(latencies)

        # Breakdown uses the first issuer's path as a representative;
        # ``contention`` absorbs serialization across requests.
        br = _path_breakdown(first_path, nbytes, graph, edge_map, ns_per_mm)
        formula_sum = sum(br.values())
        br["contention"] = max(0.0, makespan - formula_sum)
        peak_single = _bottleneck_bw(first_path, edge_map) or 0.0
        peak_aggregate = _aggregate_peak_bw(paths, edge_map)
        total_bytes = nbytes * len(scn.issues)
        eff_bw = total_bytes / makespan if makespan > 0 else 0.0
        util_single = (eff_bw / peak_single * 100.0) if peak_single > 0 else 0.0
        util_aggregate = (eff_bw / peak_aggregate * 100.0) if peak_aggregate > 0 else 0.0
        rows.append({
            "graph": "congestion",
            "scenario": scn.name,
            "label": scn.label,
            "nbytes": nbytes,
            "n_issuers": len(scn.issues),
            "first_path": " -> ".join(_short_path(first_path)),
            "makespan_ns": makespan,
            "min_lat_ns": min(latencies) if latencies else 0.0,
            "peak_single_bw_gbs": peak_single,
            "peak_aggregate_bw_gbs": peak_aggregate,
            "effective_bw_gbs": eff_bw,
            "util_single_pct": util_single,
            "util_aggregate_pct": util_aggregate,
            **{c: br.get(c, 0.0) for c, _ in CATEGORIES},
        })
    return rows


# ── Plotting ───────────────────────────────────────────────────────────


def _short_path(path: Iterable[str]) -> list[str]:
    return [".".join(p.split(".")[-2:]) for p in path]


def _plot_bw_utilization(rows, title, out_path):
    """Plot Effective BW utilization (%) per scenario with TWO bars:

      util_single    = effective_bw / single-path peak × 100
      util_aggregate = effective_bw / aggregate-resource peak × 100

    The aggregate peak sums the BW of *distinct* bottleneck edges across
    all issuer paths — modelling multi-lane shared resources (e.g. UCIe's
    4 connections) correctly. For scenarios where all paths share one
    bottleneck wire the two peaks are equal and the bars match.

    The dashed line at 100 % is the saturation reference for both
    metrics. util_single can exceed 100 % when multi-lane resources are
    used; util_aggregate is bounded by 100 % by construction (since the
    aggregate peak is the upper bound on aggregate throughput).
    """
    import numpy as np

    n = len(rows)
    labels = [r["label"] for r in rows]
    util_s = [r.get("util_single_pct", 0.0) for r in rows]
    util_a = [r.get("util_aggregate_pct", 0.0) for r in rows]
    eff = [r.get("effective_bw_gbs", 0.0) for r in rows]
    peak_s = [r.get("peak_single_bw_gbs", 0.0) for r in rows]
    peak_a = [r.get("peak_aggregate_bw_gbs", 0.0) for r in rows]

    fig, ax = plt.subplots(figsize=(max(9, n * 1.6), 6.0))
    x = np.arange(n)
    w = 0.38
    ax.bar(x - w / 2, util_s, w, color="#6366f1",
           edgecolor="white", linewidth=0.5,
           label="util vs single-path peak")
    ax.bar(x + w / 2, util_a, w, color="#10b981",
           edgecolor="white", linewidth=0.5,
           label="util vs aggregate-resource peak")
    ax.axhline(100.0, color="grey", linestyle="--", linewidth=0.8,
               label="saturation (100 %)")

    y_max = max(util_s + util_a + [100.0]) * 1.30
    for i in range(n):
        ax.text(i - w / 2, util_s[i] + y_max * 0.012,
                f"{util_s[i]:.0f}%\n/{peak_s[i]:.0f}",
                ha="center", va="bottom", fontsize=7)
        ax.text(i + w / 2, util_a[i] + y_max * 0.012,
                f"{util_a[i]:.0f}%\n/{peak_a[i]:.0f}",
                ha="center", va="bottom", fontsize=7)
        # Effective BW annotation underneath each pair
        ax.text(i, -y_max * 0.04, f"eff={eff[i]:.0f} GB/s",
                ha="center", va="top", fontsize=7, color="#444444")

    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=8)
    ax.set_ylabel("Effective BW utilization (%)")
    ax.set_title(title)
    ax.set_ylim(-y_max * 0.10, y_max)
    ax.legend(loc="upper right", fontsize=9, frameon=False)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)


# ── CSV ────────────────────────────────────────────────────────────────


def _write_csv(no_cong_rows, cong_rows, out_path):
    fields = [
        "graph", "scenario", "label", "nbytes", "n_issuers",
        "total_ns", "makespan_ns", "min_lat_ns",
        "peak_single_bw_gbs", "peak_aggregate_bw_gbs", "effective_bw_gbs",
        "util_single_pct", "util_aggregate_pct",
        "pe_setup", "noc_mesh", "ucie", "fabric", "streaming",
        "hbm_ctrl", "contention",
        "path", "first_path",
    ]
    with open(out_path, "w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fields, extrasaction="ignore")
        w.writeheader()
        for r in no_cong_rows + cong_rows:
            w.writerow(r)


# ── Self-verification ──────────────────────────────────────────────────


def _verify(rows_no_cong, rows_cong) -> list[str]:
    """Return a list of human-readable issues; empty means PASS.

    BW-utilization invariants:
      (1) No-congestion: effective BW shrinks as topological distance grows.
      (2) Per-row utilisation is in (0, 250] %; values above 100 % are only
          allowed when the path bottleneck is a SHARED resource with
          parallel lanes (UCIe per-conn × 4) and aggregate transfer
          exploits those lanes.
      (3) Single-issuer utilisation cannot exceed 100 %.
      (4) Effective BW for a single request equals nbytes / latency.
      (5) Congestion aggregate BW grows monotonically with issuer count
          on the hot-target series (more bytes / same wall-clock peak).
      (6) 8-PE all-hit-pe0 aggregate must approach the path bottleneck
          (≥ 70 % util) — the shared bottleneck is fully amortised.
    """
    issues = []
    by_name = {r["scenario"]: r for r in rows_no_cong}
    cong_map = {r["scenario"]: r for r in rows_cong}

    # (1) No-congestion effective BW shrinks as distance grows
    order = [
        "local",
        "same_cube_best",
        "same_cube_worst",
        "remote_cube_best",
        "remote_cube_worst",
    ]
    prev_bw = float("inf")
    for n in order:
        if n in by_name and by_name[n]["effective_bw_gbs"] >= prev_bw:
            issues.append(
                f"no_congestion: {n} effective BW "
                f"({by_name[n]['effective_bw_gbs']:.1f} GB/s) not strictly "
                f"smaller than previous ({prev_bw:.1f})"
            )
        prev_bw = min(prev_bw, by_name.get(n, {}).get("effective_bw_gbs", prev_bw))

    # (2) util_single in (0, 250 %]; util_aggregate in (0, 100 + ε %]
    for r in rows_no_cong + rows_cong:
        us = r.get("util_single_pct", 0.0)
        ua = r.get("util_aggregate_pct", 0.0)
        if us <= 0 or ua <= 0:
            issues.append(f"{r['scenario']}: non-positive util "
                          f"(single={us}, agg={ua})")
        if us > 250:
            issues.append(
                f"{r['scenario']}: util_single={us:.1f}% > 250 % — "
                f"likely a peak or effective BW miscompute"
            )
        if ua > 100.0 + 1.0:  # 1 % numerical slack
            issues.append(
                f"{r['scenario']}: util_aggregate={ua:.1f}% > 100 % — "
                f"effective BW must not exceed the aggregate resource peak"
            )

    # (3) Single-issuer utilisation (both metrics) cannot exceed 100 %.
    for r in rows_no_cong:
        us = r.get("util_single_pct", 0.0)
        ua = r.get("util_aggregate_pct", 0.0)
        if us > 100.0 + 1e-3:
            issues.append(
                f"no_congestion {r['scenario']}: util_single={us:.1f}% > 100% "
                f"for a single-issuer scenario"
            )
        if abs(us - ua) > 1e-3:
            issues.append(
                f"no_congestion {r['scenario']}: util_single ({us:.1f}) != "
                f"util_aggregate ({ua:.1f}) — should match for single issuer"
            )

    # (4) Effective BW for a single request = nbytes / total_ns
    for r in rows_no_cong:
        expected = r["nbytes"] / r["total_ns"] if r["total_ns"] > 0 else 0
        got = r["effective_bw_gbs"]
        if abs(got - expected) > 1e-3:
            issues.append(
                f"no_congestion {r['scenario']}: eff_bw={got:.3f} != "
                f"nbytes/total_ns={expected:.3f}"
            )

    # (5) Congestion aggregate BW grows monotonically with issuer count on
    # the hot-target series.
    seq = ["ctrl_hot_1", "ctrl_hot_2", "ctrl_hot_3"]
    last = 0.0
    for n in seq:
        if n in cong_map and cong_map[n]["effective_bw_gbs"] < last - 1e-6:
            issues.append(
                f"congestion: {n} aggregate BW dropped below prior "
                f"({cong_map[n]['effective_bw_gbs']:.1f} < {last:.1f})"
            )
        last = max(last, cong_map.get(n, {}).get("effective_bw_gbs", last))

    # (6) all_pe_to_pe0 must approach the shared single-path peak.
    if "all_pe_to_pe0" in cong_map:
        u = cong_map["all_pe_to_pe0"]["util_single_pct"]
        if u < 70.0:
            issues.append(
                f"congestion all_pe_to_pe0: util_single={u:.1f}% < 70 % — "
                f"8-PE hotspot should saturate the shared HBM CTRL path"
            )

    # (7) ucie_eastbound: util_aggregate should be meaningfully smaller
    # than util_single (the multi-lane peak should pull the bar down).
    if "ucie_eastbound" in cong_map:
        e = cong_map["ucie_eastbound"]
        if e["util_aggregate_pct"] >= e["util_single_pct"] - 5.0:
            issues.append(
                f"congestion ucie_eastbound: util_aggregate "
                f"({e['util_aggregate_pct']:.1f}%) should be << "
                f"util_single ({e['util_single_pct']:.1f}%) when UCIe's "
                f"multi-lane peak applies"
            )

    return issues


# ── Entry point ────────────────────────────────────────────────────────


def main(nbytes: int = DEFAULT_NBYTES) -> int:
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print(f"== PE_DMA perf @ {nbytes} B per request ==")
    print("Collecting NO-congestion scenarios...")
    no_cong = _run_no_congestion(nbytes)
    print("Collecting CONGESTION scenarios...")
    cong = _run_congestion(nbytes)

    print("\n-- No-congestion summary --")
    for r in no_cong:
        print(f"  {r['scenario']:22s}  total={r['total_ns']:7.1f} ns  "
              f"eff={r['effective_bw_gbs']:6.1f} GB/s  "
              f"peak_s={r['peak_single_bw_gbs']:6.1f}  "
              f"peak_a={r['peak_aggregate_bw_gbs']:6.1f}  "
              f"util_s={r['util_single_pct']:5.1f}%  "
              f"util_a={r['util_aggregate_pct']:5.1f}%")
    print("\n-- Congestion summary --")
    for r in cong:
        agg_bytes = r["nbytes"] * r["n_issuers"]
        print(f"  {r['scenario']:22s}  makespan={r['makespan_ns']:7.1f} ns  "
              f"agg_bytes={agg_bytes:>7d}  "
              f"eff={r['effective_bw_gbs']:6.1f} GB/s  "
              f"peak_s={r['peak_single_bw_gbs']:6.1f}  "
              f"peak_a={r['peak_aggregate_bw_gbs']:6.1f}  "
              f"util_s={r['util_single_pct']:5.1f}%  "
              f"util_a={r['util_aggregate_pct']:5.1f}%")

    issues = _verify(no_cong, cong)
    print("\n-- Self-verification --")
    if not issues:
        print("  PASS")
    else:
        for i, msg in enumerate(issues, 1):
            print(f"  [{i}] {msg}")

    _plot_bw_utilization(
        no_cong,
        f"PE_DMA Effective BW utilization (no congestion, nbytes={nbytes})",
        OUT_DIR / "no_congestion.png",
    )
    _plot_bw_utilization(
        cong,
        f"PE_DMA Effective BW utilization (congestion, "
        f"agg = n_issuers × nbytes / makespan, nbytes={nbytes})",
        OUT_DIR / "congestion.png",
    )
    _write_csv(no_cong, cong, OUT_DIR / "summary.csv")

    print(f"\nWrote:\n  {OUT_DIR / 'no_congestion.png'}\n"
          f"  {OUT_DIR / 'congestion.png'}\n"
          f"  {OUT_DIR / 'summary.csv'}")

    return 0 if not issues else 1


if __name__ == "__main__":
    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--n-bytes", type=int, default=DEFAULT_NBYTES,
                    help="bytes per DMA (default 16384)")
    args = p.parse_args()
    raise SystemExit(main(nbytes=args.n_bytes))