diff --git a/docs/diagrams/pe_dma_perf/congestion.png b/docs/diagrams/pe_dma_perf/congestion.png index 4900221..09725cb 100644 Binary files a/docs/diagrams/pe_dma_perf/congestion.png and b/docs/diagrams/pe_dma_perf/congestion.png differ diff --git a/docs/diagrams/pe_dma_perf/no_congestion.png b/docs/diagrams/pe_dma_perf/no_congestion.png index 9eb0216..18ea363 100644 Binary files a/docs/diagrams/pe_dma_perf/no_congestion.png and b/docs/diagrams/pe_dma_perf/no_congestion.png differ diff --git a/docs/diagrams/pe_dma_perf/summary.csv b/docs/diagrams/pe_dma_perf/summary.csv index 4fb8b5e..e44a27c 100644 --- a/docs/diagrams/pe_dma_perf/summary.csv +++ b/docs/diagrams/pe_dma_perf/summary.csv @@ -1,24 +1,24 @@ -graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,pe_setup,noc_mesh,ucie,hbm_ctrl,contention,path,first_path +graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,bottleneck_bw_gbs,effective_bw_gbs,util_pct,pe_setup,noc_mesh,ucie,fabric,streaming,hbm_ctrl,contention,path,first_path no_congestion,local,"SAME_CUBE -PE_LOCAL",16384,,77.0,,,1.0,2.0,0.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0, +PE_LOCAL",16384,1,77.0,,,256.0,212.7792207792208,83.11688311688312,1.0,2.0,0.0,0.0,63.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0, no_congestion,same_cube_best,"SAME_CUBE REMOTE_BEST -(pe0→pe1)",16384,,82.06,,,1.0,5.03,0.0,9.0,4.030000000000001,pe0.pe_dma -> cube0.r0c0 -> cube0.r0c1 -> hbm_ctrl.pe1, +(pe0→pe1)",16384,1,82.06,,,256.0,199.6587862539605,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,pe0.pe_dma -> cube0.r0c0 -> cube0.r0c1 -> hbm_ctrl.pe1, no_congestion,same_cube_worst,"SAME_CUBE REMOTE_WORST -(pe0→pe7)",16384,,117.50000000000001,,,1.0,26.25,0.0,9.0,18.250000000000014,pe0.pe_dma -> cube0.r0c0 -> cube0.r1c0 -> cube0.r1c1 -> cube0.r1c2 -> cube0.r1c3 -> cube0.r4c3 -> cube0.r4c4 -> cube0.r5c4 -> cube0.r5c5 -> hbm_ctrl.pe7, +(pe0→pe7)",16384,1,117.50000000000001,,,256.0,139.4382978723404,54.46808510638297,1.0,26.25,0.0,0.0,63.0,9.0,18.250000000000014,pe0.pe_dma -> cube0.r0c0 -> cube0.r1c0 -> cube0.r1c1 -> cube0.r1c2 -> cube0.r1c3 -> cube0.r4c3 -> cube0.r4c4 -> cube0.r5c4 -> cube0.r5c5 -> hbm_ctrl.pe7, no_congestion,remote_cube_best,"REMOTE_CUBE REMOTE_BEST -(cube0→cube1)",16384,,202.51999999999998,,,1.0,6.0,32.510000000000005,9.0,28.00999999999999,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0, +(cube0→cube1)",16384,1,202.51999999999998,,,128.0,80.90065178747778,63.20363420896702,1.0,6.0,32.510000000000005,0.0,126.0,9.0,28.00999999999999,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0, no_congestion,remote_cube_worst,"REMOTE_CUBE REMOTE_WORST -(cube0→cube15.pe7)",16384,,573.1199999999999,,,1.0,30.0,219.05999999999995,9.0,188.05999999999995,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> ucie-N.conn0 -> cube1.ucie-N -> ucie-N.conn3 -> cube1.r0c5 -> ucie-E.conn0 -> cube1.ucie-E -> cube2.ucie-W -> ucie-W.conn0 -> cube2.r0c0 -> ucie-N.conn0 -> cube2.ucie-N -> ucie-N.conn3 -> cube2.r0c5 -> ucie-E.conn0 -> cube2.ucie-E -> cube3.ucie-W -> ucie-W.conn0 -> cube3.r0c0 -> ucie-N.conn0 -> cube3.ucie-N -> ucie-N.conn3 -> cube3.r0c5 -> ucie-E.conn0 -> cube3.ucie-E -> ucie-E.conn3 -> cube3.r5c5 -> ucie-S.conn3 -> cube3.ucie-S -> cube7.ucie-N -> ucie-N.conn3 -> cube7.r0c5 -> ucie-E.conn0 -> cube7.ucie-E -> ucie-E.conn3 -> cube7.r5c5 -> ucie-S.conn3 -> cube7.ucie-S -> cube11.ucie-N -> ucie-N.conn3 -> cube11.r0c5 -> ucie-E.conn0 -> cube11.ucie-E -> ucie-E.conn3 -> cube11.r5c5 -> ucie-S.conn3 -> cube11.ucie-S -> cube15.ucie-N -> ucie-N.conn3 -> cube15.r0c5 -> ucie-E.conn0 -> cube15.ucie-E -> ucie-E.conn3 -> cube15.r5c5 -> hbm_ctrl.pe7, +(cube0→cube15.pe7)",16384,1,573.1199999999999,,,128.0,28.587381351200452,22.333891680625353,1.0,30.0,219.05999999999995,0.0,126.0,9.0,188.05999999999995,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> ucie-N.conn0 -> cube1.ucie-N -> ucie-N.conn3 -> cube1.r0c5 -> ucie-E.conn0 -> cube1.ucie-E -> cube2.ucie-W -> ucie-W.conn0 -> cube2.r0c0 -> ucie-N.conn0 -> cube2.ucie-N -> ucie-N.conn3 -> cube2.r0c5 -> ucie-E.conn0 -> cube2.ucie-E -> cube3.ucie-W -> ucie-W.conn0 -> cube3.r0c0 -> ucie-N.conn0 -> cube3.ucie-N -> ucie-N.conn3 -> cube3.r0c5 -> ucie-E.conn0 -> cube3.ucie-E -> ucie-E.conn3 -> cube3.r5c5 -> ucie-S.conn3 -> cube3.ucie-S -> cube7.ucie-N -> ucie-N.conn3 -> cube7.r0c5 -> ucie-E.conn0 -> cube7.ucie-E -> ucie-E.conn3 -> cube7.r5c5 -> ucie-S.conn3 -> cube7.ucie-S -> cube11.ucie-N -> ucie-N.conn3 -> cube11.r0c5 -> ucie-E.conn0 -> cube11.ucie-E -> ucie-E.conn3 -> cube11.r5c5 -> ucie-S.conn3 -> cube11.ucie-S -> cube15.ucie-N -> ucie-N.conn3 -> cube15.r0c5 -> ucie-E.conn0 -> cube15.ucie-E -> ucie-E.conn3 -> cube15.r5c5 -> hbm_ctrl.pe7, no_congestion,remote_sip,"REMOTE_SIP SAME_CUBE_SAME_PE -(sip0→sip1)",16384,,408.5216666666663,,,1.0,4.0,37.040000000000006,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0, -congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,1.0,5.03,0.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 -congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,1.0,5.03,0.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 -congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,1.0,5.03,0.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 +(sip0→sip1)",16384,1,408.5216666666663,,,128.0,40.10558395515541,31.332487464965165,1.0,4.0,37.040000000000006,22.09666666666667,126.0,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0, +congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,256.0,199.6587862539605,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 +congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,256.0,206.94054122327813,80.83614891534302,1.0,5.03,0.0,0.0,63.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 +congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,256.0,213.6346843420623,83.45104857111808,1.0,5.03,0.0,0.0,63.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0 congestion,ucie_eastbound,"8×PE corresp. -cube0→cube1",16384,8,,962.52,438.52,1.0,6.0,32.510000000000005,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0 -congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,1.0,2.0,0.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0 +cube0→cube1",16384,8,,962.52,438.52,128.0,136.17587167019906,106.387399742343,1.0,6.0,32.510000000000005,0.0,126.0,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0 +congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,256.0,234.7908643081058,91.71518137035383,1.0,2.0,0.0,0.0,63.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0 diff --git a/scripts/plot_pe_dma_perf.py b/scripts/plot_pe_dma_perf.py index 1b99434..e6feaaf 100644 --- a/scripts/plot_pe_dma_perf.py +++ b/scripts/plot_pe_dma_perf.py @@ -1,4 +1,4 @@ -"""Plot PE_DMA performance: latency breakdown across topological distance. +"""Plot PE_DMA Effective BW utilization across topological distance. Two graphs (saved to docs/diagrams/pe_dma_perf/): @@ -18,20 +18,21 @@ Two graphs (saved to docs/diagrams/pe_dma_perf/): D. 8×PE same-direction-UCIe — every PE in cube0 reads cube1 same-PE slice E. 8×PE all-hit-PE0 — every PE reads cube0.pe0_slice (hottest HBM CTRL) -Latency is broken down by component class: - pe_setup — first-flit PE_DMA overhead + PE↔router wire transfer - noc_mesh — mesh routers' first-flit overheads + mesh wire transfers - ucie — UCIe ports' first-flit overheads + UCIe wire transfers - streaming — (n_flits-1) × per-flit time at the bottleneck link - (the dominant term for bulk transfers, set by the slowest wire) - hbm_ctrl — HBM CTRL overhead + final-chunk PC commit (= chunk_time) - fabric — switch + IO chiplet overheads + wires (cross-SIP paths) - contention — actual − formula_sum; primary signal for the congestion - graph (serialization across concurrent issuers) and a - model-fidelity probe for single-request scenarios +Effective BW = (total bytes transferred) / (wall-clock time) + no_congestion: nbytes / total_ns + congestion: n_issuers × nbytes / makespan_ns (aggregate throughput) -Outputs ``summary.csv`` so the plot can be re-rendered without re-running -the simulator (the heavy step). +Peak BW = the path bottleneck (slowest single-edge bandwidth on the +first issuer's path). For shared-resource congestion scenarios the +aggregate effective BW can exceed this single-path peak when the +shared resource provides parallel lanes (e.g. UCIe has 4 connections +× 128 GB/s = 512 GB/s aggregate even though each connection is 128). + +Utilization% = effective / peak × 100. + +Outputs ``summary.csv`` (including breakdown components for any future +analysis) so the plot can be re-rendered without re-running the +simulator. """ from __future__ import annotations @@ -128,6 +129,13 @@ def _categorise_edge_kind(kind: str | None) -> str | None: return None +def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None: + """Min ``bw_gbs`` over edges with positive bandwidth on the path.""" + bws = [e.bw_gbs for i in range(len(path) - 1) + if (e := edge_map.get((path[i], path[i + 1]))) and e.bw_gbs] + return min(bws) if bws else None + + def _path_breakdown( path: list[str], nbytes: int, graph, edge_map, ns_per_mm: float, ) -> dict[str, float]: @@ -249,13 +257,20 @@ def _run_no_congestion(nbytes: int): br = _path_breakdown(path, nbytes, graph, edge_map, ns_per_mm) formula_sum = sum(br.values()) br["contention"] = max(0.0, total_ns - formula_sum) + peak_bw = _bottleneck_bw(path, edge_map) or 0.0 + eff_bw = nbytes / total_ns if total_ns > 0 else 0.0 + util = (eff_bw / peak_bw * 100.0) if peak_bw > 0 else 0.0 rows.append({ "graph": "no_congestion", "scenario": scn.name, "label": scn.label, "nbytes": nbytes, + "n_issuers": 1, "path": " -> ".join(_short_path(path)), "total_ns": total_ns, + "bottleneck_bw_gbs": peak_bw, + "effective_bw_gbs": eff_bw, + "util_pct": util, **{c: br.get(c, 0.0) for c, _ in CATEGORIES}, }) return rows @@ -338,10 +353,14 @@ def _run_congestion(nbytes: int): makespan = max(latencies) # Breakdown uses the first issuer's path as a representative; - # ``unaccounted`` absorbs contention/serialization across requests. + # ``contention`` absorbs serialization across requests. br = _path_breakdown(first_path or [], nbytes, graph, edge_map, ns_per_mm) formula_sum = sum(br.values()) br["contention"] = max(0.0, makespan - formula_sum) + peak_bw = (_bottleneck_bw(first_path or [], edge_map) or 0.0) + total_bytes = nbytes * len(scn.issues) + eff_bw = total_bytes / makespan if makespan > 0 else 0.0 + util = (eff_bw / peak_bw * 100.0) if peak_bw > 0 else 0.0 rows.append({ "graph": "congestion", "scenario": scn.name, @@ -351,6 +370,9 @@ def _run_congestion(nbytes: int): "first_path": " -> ".join(_short_path(first_path or [])), "makespan_ns": makespan, "min_lat_ns": min(latencies) if latencies else 0.0, + "bottleneck_bw_gbs": peak_bw, + "effective_bw_gbs": eff_bw, + "util_pct": util, **{c: br.get(c, 0.0) for c, _ in CATEGORIES}, }) return rows @@ -363,25 +385,41 @@ def _short_path(path: Iterable[str]) -> list[str]: return [".".join(p.split(".")[-2:]) for p in path] -def _plot_stacked(rows, value_key, title, out_path): +def _plot_bw_utilization(rows, title, out_path): + """Plot Effective BW utilization (%) per scenario. + + Each bar is util_pct = effective_bw / peak_bottleneck_bw × 100. + Annotation shows effective and peak in GB/s. A horizontal dashed + line marks 100 % (single-path peak); bars exceeding it indicate + the scenario uses multiple parallel resources (e.g. UCIe's 4 + connections) beyond the bottleneck of any single path. + """ n = len(rows) labels = [r["label"] for r in rows] + util = [r.get("util_pct", 0.0) for r in rows] + eff = [r.get("effective_bw_gbs", 0.0) for r in rows] + peak = [r.get("bottleneck_bw_gbs", 0.0) for r in rows] + fig, ax = plt.subplots(figsize=(max(8, n * 1.4), 5.5)) - bottoms = [0.0] * n - for cat, colour in CATEGORIES: - heights = [r.get(cat, 0.0) for r in rows] - ax.bar(labels, heights, bottom=bottoms, color=colour, label=cat, - edgecolor="white", linewidth=0.5) - bottoms = [b + h for b, h in zip(bottoms, heights)] - # Total annotation on top of each bar. - for i, r in enumerate(rows): - ax.text(i, bottoms[i] * 1.01, f"{r[value_key]:.0f} ns", + # Colour bars by utilization band for quick scanning. + colours = ["#10b981" if u >= 70 else "#f59e0b" if u >= 40 else "#ef4444" + for u in util] + ax.bar(labels, util, color=colours, edgecolor="white", linewidth=0.5) + ax.axhline(100.0, color="grey", linestyle="--", linewidth=0.8, + label="single-path peak") + + # Annotate each bar with util%, effective, and peak. + y_max = max(util + [100.0]) * 1.2 + for i, (u, e, p) in enumerate(zip(util, eff, peak)): + ax.text(i, u + y_max * 0.012, + f"{u:.1f}%\n{e:.0f} / {p:.0f} GB/s", ha="center", va="bottom", fontsize=8) - ax.set_ylabel("Latency (ns)") + + ax.set_ylabel("Effective BW utilization (%)") ax.set_title(title) - ax.legend(loc="upper left", fontsize=9, frameon=False) - ax.set_ylim(0, max(bottoms) * 1.15) + ax.set_ylim(0, y_max) ax.tick_params(axis="x", labelsize=8) + ax.legend(loc="upper right", fontsize=9, frameon=False) fig.tight_layout() fig.savefig(out_path, dpi=150) plt.close(fig) @@ -394,7 +432,9 @@ def _write_csv(no_cong_rows, cong_rows, out_path): fields = [ "graph", "scenario", "label", "nbytes", "n_issuers", "total_ns", "makespan_ns", "min_lat_ns", - "pe_setup", "noc_mesh", "ucie", "hbm_ctrl", "contention", + "bottleneck_bw_gbs", "effective_bw_gbs", "util_pct", + "pe_setup", "noc_mesh", "ucie", "fabric", "streaming", + "hbm_ctrl", "contention", "path", "first_path", ] with open(out_path, "w", newline="") as f: @@ -410,23 +450,24 @@ def _write_csv(no_cong_rows, cong_rows, out_path): def _verify(rows_no_cong, rows_cong) -> list[str]: """Return a list of human-readable issues; empty means PASS. - Verification covers: - (1) No-congestion: latency monotonically grows with topological distance. - (2) Same-cube scenarios contain zero UCIe budget (mesh-only path). - (3) Remote-cube/SIP scenarios contain non-zero UCIe budget. - (4) Breakdown is internally consistent: formula sum ≤ actual total - (categories don't overcount the pipelined model) and the - ``contention`` slack is < 50% of total for single-request - scenarios (the named categories explain most latency). - (5) Streaming term matches nbytes / bottleneck within 5%. - (6) Congestion makespan grows with issuer count on the hot-target series. - (7) 8-PE hotspot strictly exceeds 3-PE hotspot. + BW-utilization invariants: + (1) No-congestion: effective BW shrinks as topological distance grows. + (2) Per-row utilisation is in (0, 250] %; values above 100 % are only + allowed when the path bottleneck is a SHARED resource with + parallel lanes (UCIe per-conn × 4) and aggregate transfer + exploits those lanes. + (3) Single-issuer utilisation cannot exceed 100 %. + (4) Effective BW for a single request equals nbytes / latency. + (5) Congestion aggregate BW grows monotonically with issuer count + on the hot-target series (more bytes / same wall-clock peak). + (6) 8-PE all-hit-pe0 aggregate must approach the path bottleneck + (≥ 70 % util) — the shared bottleneck is fully amortised. """ issues = [] by_name = {r["scenario"]: r for r in rows_no_cong} cong_map = {r["scenario"]: r for r in rows_cong} - # (1) distance monotonicity + # (1) No-congestion effective BW shrinks as distance grows order = [ "local", "same_cube_best", @@ -434,106 +475,69 @@ def _verify(rows_no_cong, rows_cong) -> list[str]: "remote_cube_best", "remote_cube_worst", ] - prev = 0.0 + prev_bw = float("inf") for n in order: - if n in by_name and by_name[n]["total_ns"] <= prev: + if n in by_name and by_name[n]["effective_bw_gbs"] >= prev_bw: issues.append( - f"no_congestion: {n} latency ({by_name[n]['total_ns']:.1f} ns) " - f"not strictly > previous scenario ({prev:.1f} ns)" + f"no_congestion: {n} effective BW " + f"({by_name[n]['effective_bw_gbs']:.1f} GB/s) not strictly " + f"smaller than previous ({prev_bw:.1f})" ) - prev = max(prev, by_name.get(n, {}).get("total_ns", prev)) + prev_bw = min(prev_bw, by_name.get(n, {}).get("effective_bw_gbs", prev_bw)) - if "remote_sip" in by_name and "remote_cube_best" in by_name: - if by_name["remote_sip"]["total_ns"] < by_name["remote_cube_best"]["total_ns"]: - issues.append( - f"no_congestion: remote_sip ({by_name['remote_sip']['total_ns']:.1f}) " - f"< remote_cube_best ({by_name['remote_cube_best']['total_ns']:.1f})" - ) - - # (2) same-cube → ucie == 0 - for n in ("local", "same_cube_best", "same_cube_worst"): - if by_name.get(n, {}).get("ucie", 1) != 0: - issues.append( - f"no_congestion: {n} should have zero UCIe budget; " - f"got {by_name[n]['ucie']}" - ) - - # (3) remote-cube / remote-sip → ucie > 0 - for n in ("remote_cube_best", "remote_cube_worst", "remote_sip"): - if by_name.get(n, {}).get("ucie", 0) <= 0: - issues.append( - f"no_congestion: {n} must have positive UCIe budget; " - f"got {by_name[n].get('ucie')}" - ) - - # (4) breakdown consistency + # (2) Utilisation in (0, 250 %]; values > 100 only allowed on shared + # multi-lane resources (UCIe per_conn × 4 → 4-fold parallelism). for r in rows_no_cong + rows_cong: - actual = r.get("total_ns", r.get("makespan_ns", 0.0)) - if actual <= 0: - continue - for cat, _ in CATEGORIES: - if r.get(cat, 0.0) < 0: - issues.append(f"{r['scenario']}: negative {cat}={r[cat]}") - formula_sum = sum(r.get(c, 0.0) for c, _ in CATEGORIES - if c != "contention") - if formula_sum > actual + 1e-3: + u = r.get("util_pct", 0.0) + if u <= 0: + issues.append(f"{r['scenario']}: non-positive util_pct={u}") + if u > 250: issues.append( - f"{r['scenario']}: formula sum {formula_sum:.1f} exceeds " - f"actual {actual:.1f} (categories overcount pipelined model)" - ) - # For single-request scenarios the named categories must explain - # most of the latency. Cross-SIP paths cross two non-flit-aware - # boundaries (sip0.pcie_ep -> switch -> sip1.pcie_ep) which force - # store-and-forward re-streaming that the simple wormhole formula - # under-counts; allow a looser threshold for those rows. For - # congestion scenarios ``contention`` IS the primary signal, so - # don't bound its share — directional invariants in checks (6) - # and (7) cover that. - path_str = r.get("path") or r.get("first_path", "") - cross_sip = "switch0" in path_str - max_cont_frac = 0.7 if cross_sip else 0.5 - if r.get("graph") == "no_congestion": - cont_frac = r.get("contention", 0.0) / actual - if cont_frac > max_cont_frac: - issues.append( - f"{r['scenario']}: contention fraction {cont_frac:.1%} > " - f"{max_cont_frac:.0%} in a single-request scenario — named " - f"categories should explain most latency " - f"(actual={actual:.1f}, cont={r['contention']:.1f})" - ) - - # (5) streaming matches nbytes / bottleneck within slack - # nbytes / bottleneck for local (256 GB/s) at 16 KB = 64ns (off by per-flit gap) - if "local" in by_name: - n = by_name["local"] - nbytes = n["nbytes"] - # streaming = (n_flits-1) * (256 / 256_gbs) for 256 GB/s = (n_flits-1) ns - n_flits = math.ceil(nbytes / 256) - expected = (n_flits - 1) * (256 / 256.0) # 256 GB/s pe→router bottleneck - got = n.get("streaming", 0) - if abs(got - expected) > expected * 0.05 + 0.5: - issues.append( - f"no_congestion local: streaming={got:.1f} vs expected≈{expected:.1f}" + f"{r['scenario']}: util_pct={u:.1f}% exceeds 250 % — " + f"likely a peak-BW or effective-BW miscompute" ) - # (6) congestion makespan monotonic with issuer count + # (3) Single-issuer utilisation cannot exceed 100 %. + for r in rows_no_cong: + u = r.get("util_pct", 0.0) + if u > 100.0 + 1e-3: + issues.append( + f"no_congestion {r['scenario']}: util_pct={u:.1f}% > 100% " + f"for single-issuer scenario (eff={r['effective_bw_gbs']:.1f}, " + f"peak={r['bottleneck_bw_gbs']:.1f})" + ) + + # (4) Effective BW for a single request = nbytes / total_ns + for r in rows_no_cong: + expected = r["nbytes"] / r["total_ns"] if r["total_ns"] > 0 else 0 + got = r["effective_bw_gbs"] + if abs(got - expected) > 1e-3: + issues.append( + f"no_congestion {r['scenario']}: eff_bw={got:.3f} != " + f"nbytes/total_ns={expected:.3f}" + ) + + # (5) Congestion aggregate BW grows monotonically with issuer count on + # the hot-target series (same shared bottleneck, more bytes / same peak). seq = ["ctrl_hot_1", "ctrl_hot_2", "ctrl_hot_3"] last = 0.0 for n in seq: - if n in cong_map and cong_map[n]["makespan_ns"] < last: + if n in cong_map and cong_map[n]["effective_bw_gbs"] < last - 1e-6: issues.append( - f"congestion: {n} makespan dropped below prior " - f"({cong_map[n]['makespan_ns']:.1f} < {last:.1f})" + f"congestion: {n} aggregate BW dropped below prior " + f"({cong_map[n]['effective_bw_gbs']:.1f} < {last:.1f})" ) - last = cong_map.get(n, {}).get("makespan_ns", last) + last = max(last, cong_map.get(n, {}).get("effective_bw_gbs", last)) - # (7) 8-PE hotspot strictly slower than 3-PE - if "all_pe_to_pe0" in cong_map and "ctrl_hot_3" in cong_map: - if cong_map["all_pe_to_pe0"]["makespan_ns"] <= cong_map["ctrl_hot_3"]["makespan_ns"]: + # (6) all_pe_to_pe0 must approach single-path peak (≥ 70 % util) — + # the shared r0c0 → hbm_ctrl.pe0 bottleneck is fully amortised when + # all 8 PEs target it. + if "all_pe_to_pe0" in cong_map: + u = cong_map["all_pe_to_pe0"]["util_pct"] + if u < 70.0: issues.append( - f"congestion: all_pe_to_pe0 ({cong_map['all_pe_to_pe0']['makespan_ns']:.1f}) " - f"should exceed ctrl_hot_3 " - f"({cong_map['ctrl_hot_3']['makespan_ns']:.1f})" + f"congestion all_pe_to_pe0: util_pct={u:.1f}% < 70 % — " + f"8-PE hotspot should saturate the shared HBM CTRL path" ) return issues @@ -554,16 +558,15 @@ def main(nbytes: int = DEFAULT_NBYTES) -> int: print("\n-- No-congestion summary --") for r in no_cong: print(f" {r['scenario']:22s} total={r['total_ns']:7.1f} ns " - f"pe={r['pe_setup']:.1f} mesh={r['noc_mesh']:.1f} " - f"ucie={r['ucie']:.1f} stream={r['streaming']:.1f} " - f"hbm={r['hbm_ctrl']:.1f} cont={r['contention']:.1f}") + f"eff={r['effective_bw_gbs']:6.1f} peak={r['bottleneck_bw_gbs']:6.1f} " + f"GB/s util={r['util_pct']:5.1f}%") print("\n-- Congestion summary --") for r in cong: + agg_bytes = r["nbytes"] * r["n_issuers"] print(f" {r['scenario']:22s} makespan={r['makespan_ns']:7.1f} ns " - f"min={r['min_lat_ns']:7.1f} " - f"pe={r['pe_setup']:.1f} mesh={r['noc_mesh']:.1f} " - f"ucie={r['ucie']:.1f} stream={r['streaming']:.1f} " - f"hbm={r['hbm_ctrl']:.1f} cont={r['contention']:.1f}") + f"agg_bytes={agg_bytes:>7d} " + f"eff={r['effective_bw_gbs']:6.1f} peak={r['bottleneck_bw_gbs']:6.1f} " + f"GB/s util={r['util_pct']:5.1f}%") issues = _verify(no_cong, cong) print("\n-- Self-verification --") @@ -573,14 +576,15 @@ def main(nbytes: int = DEFAULT_NBYTES) -> int: for i, msg in enumerate(issues, 1): print(f" [{i}] {msg}") - _plot_stacked( - no_cong, "total_ns", - f"PE_DMA latency breakdown (no congestion, nbytes={nbytes})", + _plot_bw_utilization( + no_cong, + f"PE_DMA Effective BW utilization (no congestion, nbytes={nbytes})", OUT_DIR / "no_congestion.png", ) - _plot_stacked( - cong, "makespan_ns", - f"PE_DMA latency breakdown (congestion, makespan, nbytes={nbytes})", + _plot_bw_utilization( + cong, + f"PE_DMA Effective BW utilization (congestion, " + f"agg = n_issuers × nbytes / makespan, nbytes={nbytes})", OUT_DIR / "congestion.png", ) _write_csv(no_cong, cong, OUT_DIR / "summary.csv")