PE_DMA perf: SIP-wide scenarios + dual outputs + clearer naming
User asked to surface system-wide congestion (more accurate than
single-cube), bring back the latency-breakdown plot under a separate
filename, and rename the obscure ``streaming`` category.
Scenarios:
Renamed all_pe_to_pe0 → all_pe_cube0_to_pe0 (clarify cube scope).
Added two SIP-wide scenarios:
sip_local_all — every PE in sip0 (128 total) accesses its own
local slice. All paths disjoint (each PE owns
its own hbm_ctrl.peX), so the model should
scale linearly with cube count.
sip_hotspot_pe0 — every PE in sip0 (128 total) targets
sip0.cube0.pe0_slice. Worst-case hotspot:
UCIe inbound + r0c0→hbm_ctrl.pe0 saturated.
Each bar now carries an ``N=...`` annotation showing the issuer
count, and the chart titles say the scope explicitly.
Effective BW + util at 16 KB:
sip_local_all N=128 eff= 27.2 TB/s util_a= 83 %
sip_hotspot_pe0 N=128 eff= 134 GB/s util_a= 93 %
(UCIe-into-cube0 saturated)
Plots:
no_congestion.png + congestion.png — Effective BW utilization
(two bars: single vs aggregate peak)
breakdown_no_congestion.png +
breakdown_congestion.png — stacked latency breakdown
(renamed from previous)
summary.csv with columns for both views.
The visual y-cap on BW utilization is 150 %. Bars exceeding it (e.g.
sip_local_all's util_single = 10,639 %) are drawn at the cap with an
upward arrow and the real value annotated. The verification rule for
``util_single`` is loosened to ``≤ n_issuers × 100 % + 5 %`` so
massively-parallel disjoint scenarios pass.
Category renamed: ``streaming`` → ``wire_transfer``. It is the
bulk-transfer time = (n_flits − 1) × flit_bytes / bottleneck_bw — the
cost of streaming the rest of the payload through the slowest wire
after the first flit has arrived.
All checks PASS.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 64 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 68 KiB After Width: | Height: | Size: 88 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 73 KiB After Width: | Height: | Size: 74 KiB |
@@ -1,4 +1,4 @@
|
|||||||
graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,peak_single_bw_gbs,peak_aggregate_bw_gbs,effective_bw_gbs,util_single_pct,util_aggregate_pct,pe_setup,noc_mesh,ucie,fabric,streaming,hbm_ctrl,contention,path,first_path
|
graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,peak_single_bw_gbs,peak_aggregate_bw_gbs,effective_bw_gbs,util_single_pct,util_aggregate_pct,pe_setup,noc_mesh,ucie,fabric,wire_transfer,hbm_ctrl,contention,path,first_path
|
||||||
no_congestion,local,"SAME_CUBE
|
no_congestion,local,"SAME_CUBE
|
||||||
PE_LOCAL",16384,1,77.0,,,256.0,256.0,212.7792207792208,83.11688311688312,83.11688311688312,1.0,2.0,0.0,0.0,63.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0,
|
PE_LOCAL",16384,1,77.0,,,256.0,256.0,212.7792207792208,83.11688311688312,83.11688311688312,1.0,2.0,0.0,0.0,63.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0,
|
||||||
no_congestion,same_cube_best,"SAME_CUBE
|
no_congestion,same_cube_best,"SAME_CUBE
|
||||||
@@ -16,9 +16,18 @@ REMOTE_WORST
|
|||||||
no_congestion,remote_sip,"REMOTE_SIP
|
no_congestion,remote_sip,"REMOTE_SIP
|
||||||
SAME_CUBE_SAME_PE
|
SAME_CUBE_SAME_PE
|
||||||
(sip0→sip1)",16384,1,408.5216666666663,,,128.0,128.0,40.10558395515541,31.332487464965165,31.332487464965165,1.0,4.0,37.040000000000006,22.09666666666667,126.0,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0,
|
(sip0→sip1)",16384,1,408.5216666666663,,,128.0,128.0,40.10558395515541,31.332487464965165,31.332487464965165,1.0,4.0,37.040000000000006,22.09666666666667,126.0,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0,
|
||||||
congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,256.0,256.0,199.6587862539605,77.99171338045332,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
congestion,ctrl_hot_1,"cube0
|
||||||
congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,256.0,256.0,206.94054122327813,80.83614891534302,80.83614891534302,1.0,5.03,0.0,0.0,63.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
1×PE → pe0_slice",16384,1,,82.06,82.06,256.0,256.0,199.6587862539605,77.99171338045332,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,256.0,256.0,213.6346843420623,83.45104857111808,83.45104857111808,1.0,5.03,0.0,0.0,63.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
congestion,ctrl_hot_2,"cube0
|
||||||
congestion,ucie_eastbound,"8×PE corresp.
|
2×PE → pe0_slice",16384,2,,158.3450000000001,134.2400000000001,256.0,256.0,206.94054122327813,80.83614891534302,80.83614891534302,1.0,5.03,0.0,0.0,63.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
cube0→cube1",16384,8,,962.52,438.52,128.0,159.99999999999997,136.17587167019906,106.387399742343,85.10991979387443,1.0,6.0,32.510000000000005,0.0,126.0,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0
|
congestion,ctrl_hot_3,"cube0
|
||||||
congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,256.0,256.0,234.7908643081058,91.71518137035383,91.71518137035383,1.0,2.0,0.0,0.0,63.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
3×PE → pe0_slice",16384,3,,230.0750000000001,139.94000000000008,256.0,256.0,213.6346843420623,83.45104857111808,83.45104857111808,1.0,5.03,0.0,0.0,63.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,ucie_eastbound,"cube0
|
||||||
|
8×PE corresp.
|
||||||
|
→ cube1",16384,8,,962.52,438.52,128.0,159.99999999999997,136.17587167019906,106.387399742343,85.10991979387443,1.0,6.0,32.510000000000005,0.0,126.0,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,all_pe_cube0_to_pe0,"cube0
|
||||||
|
8×PE → pe0_slice",16384,8,,558.2499999999998,195.0,256.0,256.0,234.7908643081058,91.71518137035383,91.71518137035383,1.0,2.0,0.0,0.0,63.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,sip_local_all,"sip0
|
||||||
|
128×PE → own slice",16384,128,,77.0,77.0,256.0,32768.0,27235.74025974026,10638.961038961039,83.11688311688312,1.0,2.0,0.0,0.0,63.0,9.0,2.0,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
congestion,sip_hotspot_pe0,"sip0
|
||||||
|
128×PE → cube0.pe0_slice",16384,128,,15618.595000000001,204.0,256.0,143.9999999999998,134.2727690935068,52.4503004271511,93.24497853715764,1.0,2.0,0.0,0.0,63.0,9.0,15543.595000000001,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
|||||||
|
+157
-56
@@ -18,21 +18,32 @@ Two graphs (saved to docs/diagrams/pe_dma_perf/):
|
|||||||
D. 8×PE same-direction-UCIe — every PE in cube0 reads cube1 same-PE slice
|
D. 8×PE same-direction-UCIe — every PE in cube0 reads cube1 same-PE slice
|
||||||
E. 8×PE all-hit-PE0 — every PE reads cube0.pe0_slice (hottest HBM CTRL)
|
E. 8×PE all-hit-PE0 — every PE reads cube0.pe0_slice (hottest HBM CTRL)
|
||||||
|
|
||||||
Effective BW = (total bytes transferred) / (wall-clock time)
|
Outputs (under ``docs/diagrams/pe_dma_perf/``):
|
||||||
no_congestion: nbytes / total_ns
|
no_congestion.png — BW utilization, single-issuer scenarios
|
||||||
congestion: n_issuers × nbytes / makespan_ns (aggregate throughput)
|
congestion.png — BW utilization, multi-issuer scenarios
|
||||||
|
breakdown_no_congestion.png — latency stacked breakdown, single-issuer
|
||||||
|
breakdown_congestion.png — latency stacked breakdown, multi-issuer
|
||||||
|
summary.csv — all rows + columns for either re-plot
|
||||||
|
|
||||||
Peak BW = the path bottleneck (slowest single-edge bandwidth on the
|
BW utilization plot (per scenario, two bars):
|
||||||
first issuer's path). For shared-resource congestion scenarios the
|
util_single = effective_bw / single-path peak × 100
|
||||||
aggregate effective BW can exceed this single-path peak when the
|
(peak = slowest edge bw on the first issuer's path)
|
||||||
shared resource provides parallel lanes (e.g. UCIe has 4 connections
|
util_aggregate = effective_bw / aggregate-resource peak × 100
|
||||||
× 128 GB/s = 512 GB/s aggregate even though each connection is 128).
|
(peak from max-min fair share over concurrent paths)
|
||||||
|
|
||||||
Utilization% = effective / peak × 100.
|
Effective BW = (total bytes transferred) / wall-clock time
|
||||||
|
no_congestion: nbytes / total_ns
|
||||||
|
congestion: n_issuers × nbytes / makespan_ns
|
||||||
|
|
||||||
Outputs ``summary.csv`` (including breakdown components for any future
|
util_aggregate is bounded by 100 % by construction. util_single can
|
||||||
analysis) so the plot can be re-rendered without re-running the
|
exceed 100 % when concurrent paths use multiple parallel lanes of a
|
||||||
simulator.
|
shared resource (e.g. UCIe's 4 connections) or are fully disjoint, because the single-path
|
||||||
|
peak under-counts the aggregate capacity. The bar is visually capped
|
||||||
|
at 150 % with an upward arrow if it would exceed.
|
||||||
|
|
||||||
|
Latency breakdown plot (per scenario, stacked bar) — see
|
||||||
|
``_path_breakdown`` for the per-category accounting and the
|
||||||
|
wormhole-pipelined formula used.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
@@ -62,7 +73,7 @@ CATEGORIES = [
|
|||||||
("noc_mesh", "#10b981"), # green
|
("noc_mesh", "#10b981"), # green
|
||||||
("ucie", "#f59e0b"), # amber
|
("ucie", "#f59e0b"), # amber
|
||||||
("fabric", "#8b5cf6"), # purple (switch + io chiplet for cross-SIP)
|
("fabric", "#8b5cf6"), # purple (switch + io chiplet for cross-SIP)
|
||||||
("streaming", "#6366f1"), # indigo (bulk = (n_flits-1)/bottleneck)
|
("wire_transfer", "#6366f1"), # indigo (bulk = (n_flits-1)/bottleneck)
|
||||||
("hbm_ctrl", "#ef4444"), # red (final-chunk commit = chunk_time)
|
("hbm_ctrl", "#ef4444"), # red (final-chunk commit = chunk_time)
|
||||||
("contention", "#9ca3af"), # grey (actual − formula, surfaces serialization)
|
("contention", "#9ca3af"), # grey (actual − formula, surfaces serialization)
|
||||||
]
|
]
|
||||||
@@ -190,9 +201,12 @@ def _path_breakdown(
|
|||||||
|
|
||||||
Each summand is categorised:
|
Each summand is categorised:
|
||||||
* Per-component overheads + first-flit wire transfers are attributed
|
* Per-component overheads + first-flit wire transfers are attributed
|
||||||
by component class (pe_setup / noc_mesh / ucie).
|
by component class (pe_setup / noc_mesh / ucie / fabric).
|
||||||
* ``streaming`` is the bulk-transfer cost = (n_flits-1) × per_flit
|
* ``wire_transfer`` is the bulk-transfer cost
|
||||||
at the slowest wire bandwidth in the path.
|
= (n_flits − 1) × flit_bytes / bottleneck_bw — the time the
|
||||||
|
rest of the payload spends streaming through the slowest link
|
||||||
|
after the first flit has arrived. Renamed from ``streaming``
|
||||||
|
for clarity.
|
||||||
* ``hbm_ctrl`` is the HBM CTRL overhead + the final chunk's PC commit
|
* ``hbm_ctrl`` is the HBM CTRL overhead + the final chunk's PC commit
|
||||||
(= chunk_time). Earlier chunks overlap with arrival.
|
(= chunk_time). Earlier chunks overlap with arrival.
|
||||||
"""
|
"""
|
||||||
@@ -227,7 +241,7 @@ def _path_breakdown(
|
|||||||
if bws and nbytes > flit_bytes:
|
if bws and nbytes > flit_bytes:
|
||||||
n_flits = math.ceil(nbytes / flit_bytes)
|
n_flits = math.ceil(nbytes / flit_bytes)
|
||||||
min_bw = min(bws)
|
min_bw = min(bws)
|
||||||
cats["streaming"] = (n_flits - 1) * (flit_bytes / min_bw)
|
cats["wire_transfer"] = (n_flits - 1) * (flit_bytes / min_bw)
|
||||||
|
|
||||||
# 4) HBM CTRL: last-chunk commit time (earlier chunks overlap arrival).
|
# 4) HBM CTRL: last-chunk commit time (earlier chunks overlap arrival).
|
||||||
if path:
|
if path:
|
||||||
@@ -337,35 +351,57 @@ def _congestion_scenarios() -> list[CongestionScenario]:
|
|||||||
same_cube_same_target_pe0 = lambda srcs: [
|
same_cube_same_target_pe0 = lambda srcs: [
|
||||||
(0, 0, p, 0, 0, 0) for p in srcs
|
(0, 0, p, 0, 0, 0) for p in srcs
|
||||||
]
|
]
|
||||||
|
# Build (sip, cube, pe, dst_sip, dst_cube, dst_pe) tuples for every
|
||||||
|
# PE in sip0 (16 cubes × 8 PEs = 128 PEs total).
|
||||||
|
sip0_all_pes = [(0, c, p) for c in range(16) for p in range(8)]
|
||||||
|
|
||||||
return [
|
return [
|
||||||
# A-C: 1, 2, 3 remote PEs concurrently access pe0's slice in same cube
|
# A-C: 1, 2, 3 cube-local PEs target pe0's slice (incremental load within cube0)
|
||||||
CongestionScenario(
|
CongestionScenario(
|
||||||
"ctrl_hot_1",
|
"ctrl_hot_1",
|
||||||
"1×PE → pe0_slice",
|
"cube0\n1×PE → pe0_slice",
|
||||||
same_cube_same_target_pe0([1]),
|
same_cube_same_target_pe0([1]),
|
||||||
),
|
),
|
||||||
CongestionScenario(
|
CongestionScenario(
|
||||||
"ctrl_hot_2",
|
"ctrl_hot_2",
|
||||||
"2×PE → pe0_slice",
|
"cube0\n2×PE → pe0_slice",
|
||||||
same_cube_same_target_pe0([1, 2]),
|
same_cube_same_target_pe0([1, 2]),
|
||||||
),
|
),
|
||||||
CongestionScenario(
|
CongestionScenario(
|
||||||
"ctrl_hot_3",
|
"ctrl_hot_3",
|
||||||
"3×PE → pe0_slice",
|
"cube0\n3×PE → pe0_slice",
|
||||||
same_cube_same_target_pe0([1, 2, 3]),
|
same_cube_same_target_pe0([1, 2, 3]),
|
||||||
),
|
),
|
||||||
# D: every PE in cube0 sends to corresponding PE in cube1 (same UCIe direction)
|
# D: every PE in cube0 sends to corresponding PE in cube1
|
||||||
|
# (same UCIe direction, single-cube source)
|
||||||
CongestionScenario(
|
CongestionScenario(
|
||||||
"ucie_eastbound",
|
"ucie_eastbound",
|
||||||
"8×PE corresp.\ncube0→cube1",
|
"cube0\n8×PE corresp.\n→ cube1",
|
||||||
[(0, 0, p, 0, 1, p) for p in range(8)],
|
[(0, 0, p, 0, 1, p) for p in range(8)],
|
||||||
),
|
),
|
||||||
# E: every PE in cube0 hits pe0's slice → worst HBM CTRL hotspot
|
# E: every PE in cube0 hits pe0's slice (cube-local HBM hotspot)
|
||||||
CongestionScenario(
|
CongestionScenario(
|
||||||
"all_pe_to_pe0",
|
"all_pe_cube0_to_pe0",
|
||||||
"8×PE → pe0_slice",
|
"cube0\n8×PE → pe0_slice",
|
||||||
same_cube_same_target_pe0(list(range(8))),
|
same_cube_same_target_pe0(list(range(8))),
|
||||||
),
|
),
|
||||||
|
# F: every PE in sip0 (128 PEs) accesses its own local slice.
|
||||||
|
# All paths disjoint (each PE has its own hbm_ctrl.peX) — tests
|
||||||
|
# whether the aggregate cube HBM BW scales linearly with cube
|
||||||
|
# count (16 × 8 × 256 = 32768 GB/s peak).
|
||||||
|
CongestionScenario(
|
||||||
|
"sip_local_all",
|
||||||
|
"sip0\n128×PE → own slice",
|
||||||
|
[(s, c, p, s, c, p) for (s, c, p) in sip0_all_pes],
|
||||||
|
),
|
||||||
|
# G: every PE in sip0 targets sip0.cube0.pe0_slice (system-wide
|
||||||
|
# hotspot). Tests UCIe inbound saturation into cube0 +
|
||||||
|
# convergence on r0c0 → hbm_ctrl.pe0.
|
||||||
|
CongestionScenario(
|
||||||
|
"sip_hotspot_pe0",
|
||||||
|
"sip0\n128×PE → cube0.pe0_slice",
|
||||||
|
[(s, c, p, 0, 0, 0) for (s, c, p) in sip0_all_pes],
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -441,18 +477,19 @@ def _plot_bw_utilization(rows, title, out_path):
|
|||||||
util_single = effective_bw / single-path peak × 100
|
util_single = effective_bw / single-path peak × 100
|
||||||
util_aggregate = effective_bw / aggregate-resource peak × 100
|
util_aggregate = effective_bw / aggregate-resource peak × 100
|
||||||
|
|
||||||
The aggregate peak sums the BW of *distinct* bottleneck edges across
|
The aggregate peak sums fair-share BW across all concurrent paths
|
||||||
all issuer paths — modelling multi-lane shared resources (e.g. UCIe's
|
(max-min fair share) — modelling shared resources correctly.
|
||||||
4 connections) correctly. For scenarios where all paths share one
|
|
||||||
bottleneck wire the two peaks are equal and the bars match.
|
|
||||||
|
|
||||||
The dashed line at 100 % is the saturation reference for both
|
Y-axis is capped at ``Y_CAP_PCT`` so the chart stays readable when
|
||||||
metrics. util_single can exceed 100 % when multi-lane resources are
|
a disjoint-path scenario (e.g. all 128 SIP PEs accessing their own
|
||||||
used; util_aggregate is bounded by 100 % by construction (since the
|
slice) drives util_single far above 100 % (up to n_issuers × 100 %). Any bar that
|
||||||
aggregate peak is the upper bound on aggregate throughput).
|
exceeds the cap is drawn at the cap with an upward arrow and the
|
||||||
|
real value annotated.
|
||||||
"""
|
"""
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
Y_CAP_PCT = 150.0 # visual ceiling
|
||||||
|
|
||||||
n = len(rows)
|
n = len(rows)
|
||||||
labels = [r["label"] for r in rows]
|
labels = [r["label"] for r in rows]
|
||||||
util_s = [r.get("util_single_pct", 0.0) for r in rows]
|
util_s = [r.get("util_single_pct", 0.0) for r in rows]
|
||||||
@@ -460,35 +497,42 @@ def _plot_bw_utilization(rows, title, out_path):
|
|||||||
eff = [r.get("effective_bw_gbs", 0.0) for r in rows]
|
eff = [r.get("effective_bw_gbs", 0.0) for r in rows]
|
||||||
peak_s = [r.get("peak_single_bw_gbs", 0.0) for r in rows]
|
peak_s = [r.get("peak_single_bw_gbs", 0.0) for r in rows]
|
||||||
peak_a = [r.get("peak_aggregate_bw_gbs", 0.0) for r in rows]
|
peak_a = [r.get("peak_aggregate_bw_gbs", 0.0) for r in rows]
|
||||||
|
n_iss = [r.get("n_issuers", 1) for r in rows]
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(max(9, n * 1.6), 6.0))
|
fig, ax = plt.subplots(figsize=(max(9, n * 1.6), 6.0))
|
||||||
x = np.arange(n)
|
x = np.arange(n)
|
||||||
w = 0.38
|
w = 0.38
|
||||||
ax.bar(x - w / 2, util_s, w, color="#6366f1",
|
util_s_capped = [min(u, Y_CAP_PCT) for u in util_s]
|
||||||
|
util_a_capped = [min(u, Y_CAP_PCT) for u in util_a]
|
||||||
|
ax.bar(x - w / 2, util_s_capped, w, color="#6366f1",
|
||||||
edgecolor="white", linewidth=0.5,
|
edgecolor="white", linewidth=0.5,
|
||||||
label="util vs single-path peak")
|
label="util vs single-path peak")
|
||||||
ax.bar(x + w / 2, util_a, w, color="#10b981",
|
ax.bar(x + w / 2, util_a_capped, w, color="#10b981",
|
||||||
edgecolor="white", linewidth=0.5,
|
edgecolor="white", linewidth=0.5,
|
||||||
label="util vs aggregate-resource peak")
|
label="util vs aggregate-resource peak")
|
||||||
ax.axhline(100.0, color="grey", linestyle="--", linewidth=0.8,
|
ax.axhline(100.0, color="grey", linestyle="--", linewidth=0.8,
|
||||||
label="saturation (100 %)")
|
label="saturation (100 %)")
|
||||||
|
|
||||||
y_max = max(util_s + util_a + [100.0]) * 1.30
|
y_max = Y_CAP_PCT * 1.20
|
||||||
for i in range(n):
|
for i in range(n):
|
||||||
ax.text(i - w / 2, util_s[i] + y_max * 0.012,
|
# util_single bar annotation: show ↑ if exceeded the cap
|
||||||
f"{util_s[i]:.0f}%\n/{peak_s[i]:.0f}",
|
marker_s = "↑ " if util_s[i] > Y_CAP_PCT + 1e-3 else ""
|
||||||
|
ax.text(i - w / 2, util_s_capped[i] + y_max * 0.012,
|
||||||
|
f"{marker_s}{util_s[i]:.0f}%\n/{peak_s[i]:.0f}",
|
||||||
ha="center", va="bottom", fontsize=7)
|
ha="center", va="bottom", fontsize=7)
|
||||||
ax.text(i + w / 2, util_a[i] + y_max * 0.012,
|
marker_a = "↑ " if util_a[i] > Y_CAP_PCT + 1e-3 else ""
|
||||||
f"{util_a[i]:.0f}%\n/{peak_a[i]:.0f}",
|
ax.text(i + w / 2, util_a_capped[i] + y_max * 0.012,
|
||||||
|
f"{marker_a}{util_a[i]:.0f}%\n/{peak_a[i]:.0f}",
|
||||||
ha="center", va="bottom", fontsize=7)
|
ha="center", va="bottom", fontsize=7)
|
||||||
# Effective BW annotation underneath each pair
|
# Effective BW + n_issuers annotation underneath each pair.
|
||||||
ax.text(i, -y_max * 0.04, f"eff={eff[i]:.0f} GB/s",
|
ax.text(i, -y_max * 0.04,
|
||||||
|
f"N={n_iss[i]}\neff={eff[i]:.0f} GB/s",
|
||||||
ha="center", va="top", fontsize=7, color="#444444")
|
ha="center", va="top", fontsize=7, color="#444444")
|
||||||
|
|
||||||
ax.set_xticks(x)
|
ax.set_xticks(x)
|
||||||
ax.set_xticklabels(labels, fontsize=8)
|
ax.set_xticklabels(labels, fontsize=7)
|
||||||
ax.set_ylabel("Effective BW utilization (%)")
|
ax.set_ylabel("Effective BW utilization (%)")
|
||||||
ax.set_title(title)
|
ax.set_title(title, fontsize=11)
|
||||||
ax.set_ylim(-y_max * 0.10, y_max)
|
ax.set_ylim(-y_max * 0.10, y_max)
|
||||||
ax.legend(loc="upper right", fontsize=9, frameon=False)
|
ax.legend(loc="upper right", fontsize=9, frameon=False)
|
||||||
fig.tight_layout()
|
fig.tight_layout()
|
||||||
@@ -499,13 +543,49 @@ def _plot_bw_utilization(rows, title, out_path):
|
|||||||
# ── CSV ────────────────────────────────────────────────────────────────
|
# ── CSV ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _plot_breakdown(rows, value_key, title, out_path):
|
||||||
|
"""Stacked-bar latency breakdown per scenario (one stack per row).
|
||||||
|
|
||||||
|
Each category from ``CATEGORIES`` (except ``contention``) contributes
|
||||||
|
a coloured segment proportional to its computed time in ns;
|
||||||
|
``contention`` is the residual ``actual − formula_sum`` and absorbs
|
||||||
|
serialisation across concurrent issuers plus any model-fidelity gap.
|
||||||
|
The total bar height = actual ``total_ns`` (no_congestion) or
|
||||||
|
``makespan_ns`` (congestion).
|
||||||
|
"""
|
||||||
|
n = len(rows)
|
||||||
|
labels = [r["label"] for r in rows]
|
||||||
|
fig, ax = plt.subplots(figsize=(max(9, n * 1.6), 6.0))
|
||||||
|
bottoms = [0.0] * n
|
||||||
|
for cat, colour in CATEGORIES:
|
||||||
|
heights = [r.get(cat, 0.0) for r in rows]
|
||||||
|
ax.bar(labels, heights, bottom=bottoms, color=colour, label=cat,
|
||||||
|
edgecolor="white", linewidth=0.5)
|
||||||
|
bottoms = [b + h for b, h in zip(bottoms, heights)]
|
||||||
|
for i, r in enumerate(rows):
|
||||||
|
ax.text(i, bottoms[i] * 1.01,
|
||||||
|
f"{r[value_key]:.0f} ns", ha="center", va="bottom",
|
||||||
|
fontsize=8)
|
||||||
|
# n_issuers annotation under the label
|
||||||
|
ax.text(i, -max(bottoms) * 0.04, f"N={r.get('n_issuers', 1)}",
|
||||||
|
ha="center", va="top", fontsize=7, color="#444444")
|
||||||
|
ax.set_ylabel("Latency (ns)")
|
||||||
|
ax.set_title(title, fontsize=11)
|
||||||
|
ax.legend(loc="upper left", fontsize=9, frameon=False)
|
||||||
|
ax.set_ylim(-max(bottoms) * 0.10, max(bottoms) * 1.18)
|
||||||
|
ax.tick_params(axis="x", labelsize=7)
|
||||||
|
fig.tight_layout()
|
||||||
|
fig.savefig(out_path, dpi=150)
|
||||||
|
plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
def _write_csv(no_cong_rows, cong_rows, out_path):
|
def _write_csv(no_cong_rows, cong_rows, out_path):
|
||||||
fields = [
|
fields = [
|
||||||
"graph", "scenario", "label", "nbytes", "n_issuers",
|
"graph", "scenario", "label", "nbytes", "n_issuers",
|
||||||
"total_ns", "makespan_ns", "min_lat_ns",
|
"total_ns", "makespan_ns", "min_lat_ns",
|
||||||
"peak_single_bw_gbs", "peak_aggregate_bw_gbs", "effective_bw_gbs",
|
"peak_single_bw_gbs", "peak_aggregate_bw_gbs", "effective_bw_gbs",
|
||||||
"util_single_pct", "util_aggregate_pct",
|
"util_single_pct", "util_aggregate_pct",
|
||||||
"pe_setup", "noc_mesh", "ucie", "fabric", "streaming",
|
"pe_setup", "noc_mesh", "ucie", "fabric", "wire_transfer",
|
||||||
"hbm_ctrl", "contention",
|
"hbm_ctrl", "contention",
|
||||||
"path", "first_path",
|
"path", "first_path",
|
||||||
]
|
]
|
||||||
@@ -557,19 +637,23 @@ def _verify(rows_no_cong, rows_cong) -> list[str]:
|
|||||||
)
|
)
|
||||||
prev_bw = min(prev_bw, by_name.get(n, {}).get("effective_bw_gbs", prev_bw))
|
prev_bw = min(prev_bw, by_name.get(n, {}).get("effective_bw_gbs", prev_bw))
|
||||||
|
|
||||||
# (2) util_single in (0, 250 %]; util_aggregate in (0, 100 + ε %]
|
# (2) util_single positive and bounded by n_issuers × 100 % (the
|
||||||
|
# max possible when all paths are disjoint and each saturates the
|
||||||
|
# single-path peak). util_aggregate bounded by 100 % by definition.
|
||||||
for r in rows_no_cong + rows_cong:
|
for r in rows_no_cong + rows_cong:
|
||||||
us = r.get("util_single_pct", 0.0)
|
us = r.get("util_single_pct", 0.0)
|
||||||
ua = r.get("util_aggregate_pct", 0.0)
|
ua = r.get("util_aggregate_pct", 0.0)
|
||||||
|
n = r.get("n_issuers", 1)
|
||||||
if us <= 0 or ua <= 0:
|
if us <= 0 or ua <= 0:
|
||||||
issues.append(f"{r['scenario']}: non-positive util "
|
issues.append(f"{r['scenario']}: non-positive util "
|
||||||
f"(single={us}, agg={ua})")
|
f"(single={us}, agg={ua})")
|
||||||
if us > 250:
|
# 5 % slack for measurement/pipeline noise.
|
||||||
|
if us > n * 100.0 + 5.0:
|
||||||
issues.append(
|
issues.append(
|
||||||
f"{r['scenario']}: util_single={us:.1f}% > 250 % — "
|
f"{r['scenario']}: util_single={us:.1f}% > n_issuers×100% "
|
||||||
f"likely a peak or effective BW miscompute"
|
f"({n * 100:.0f}%) — likely a peak or effective BW miscompute"
|
||||||
)
|
)
|
||||||
if ua > 100.0 + 1.0: # 1 % numerical slack
|
if ua > 100.0 + 1.0:
|
||||||
issues.append(
|
issues.append(
|
||||||
f"{r['scenario']}: util_aggregate={ua:.1f}% > 100 % — "
|
f"{r['scenario']}: util_aggregate={ua:.1f}% > 100 % — "
|
||||||
f"effective BW must not exceed the aggregate resource peak"
|
f"effective BW must not exceed the aggregate resource peak"
|
||||||
@@ -613,8 +697,8 @@ def _verify(rows_no_cong, rows_cong) -> list[str]:
|
|||||||
last = max(last, cong_map.get(n, {}).get("effective_bw_gbs", last))
|
last = max(last, cong_map.get(n, {}).get("effective_bw_gbs", last))
|
||||||
|
|
||||||
# (6) all_pe_to_pe0 must approach the shared single-path peak.
|
# (6) all_pe_cube0_to_pe0 must approach the shared single-path peak.
|
||||||
if "all_pe_to_pe0" in cong_map:
|
if "all_pe_cube0_to_pe0" in cong_map:
|
||||||
u = cong_map["all_pe_to_pe0"]["util_single_pct"]
|
u = cong_map["all_pe_cube0_to_pe0"]["util_single_pct"]
|
||||||
if u < 70.0:
|
if u < 70.0:
|
||||||
issues.append(
|
issues.append(
|
||||||
f"congestion all_pe_to_pe0: util_single={u:.1f}% < 70 % — "
|
f"congestion all_pe_to_pe0: util_single={u:.1f}% < 70 % — "
|
||||||
@@ -677,19 +761,36 @@ def main(nbytes: int = DEFAULT_NBYTES) -> int:
|
|||||||
|
|
||||||
_plot_bw_utilization(
|
_plot_bw_utilization(
|
||||||
no_cong,
|
no_cong,
|
||||||
f"PE_DMA Effective BW utilization (no congestion, nbytes={nbytes})",
|
f"PE_DMA Effective BW utilization — no congestion\n"
|
||||||
|
f"1 PE issuer per scenario, nbytes={nbytes}",
|
||||||
OUT_DIR / "no_congestion.png",
|
OUT_DIR / "no_congestion.png",
|
||||||
)
|
)
|
||||||
_plot_bw_utilization(
|
_plot_bw_utilization(
|
||||||
cong,
|
cong,
|
||||||
f"PE_DMA Effective BW utilization (congestion, "
|
f"PE_DMA Effective BW utilization — congestion\n"
|
||||||
f"agg = n_issuers × nbytes / makespan, nbytes={nbytes})",
|
f"N concurrent PE issuers (N shown under each label); "
|
||||||
|
f"agg = N × nbytes / makespan, nbytes={nbytes}",
|
||||||
OUT_DIR / "congestion.png",
|
OUT_DIR / "congestion.png",
|
||||||
)
|
)
|
||||||
|
_plot_breakdown(
|
||||||
|
no_cong, "total_ns",
|
||||||
|
f"PE_DMA latency breakdown — no congestion\n"
|
||||||
|
f"1 PE issuer per scenario, nbytes={nbytes}",
|
||||||
|
OUT_DIR / "breakdown_no_congestion.png",
|
||||||
|
)
|
||||||
|
_plot_breakdown(
|
||||||
|
cong, "makespan_ns",
|
||||||
|
f"PE_DMA latency breakdown — congestion (makespan)\n"
|
||||||
|
f"N concurrent PE issuers (N shown under each label), "
|
||||||
|
f"nbytes={nbytes}",
|
||||||
|
OUT_DIR / "breakdown_congestion.png",
|
||||||
|
)
|
||||||
_write_csv(no_cong, cong, OUT_DIR / "summary.csv")
|
_write_csv(no_cong, cong, OUT_DIR / "summary.csv")
|
||||||
|
|
||||||
print(f"\nWrote:\n {OUT_DIR / 'no_congestion.png'}\n"
|
print(f"\nWrote:\n {OUT_DIR / 'no_congestion.png'}\n"
|
||||||
f" {OUT_DIR / 'congestion.png'}\n"
|
f" {OUT_DIR / 'congestion.png'}\n"
|
||||||
|
f" {OUT_DIR / 'breakdown_no_congestion.png'}\n"
|
||||||
|
f" {OUT_DIR / 'breakdown_congestion.png'}\n"
|
||||||
f" {OUT_DIR / 'summary.csv'}")
|
f" {OUT_DIR / 'summary.csv'}")
|
||||||
|
|
||||||
return 0 if not issues else 1
|
return 0 if not issues else 1
|
||||||
|
|||||||
Reference in New Issue
Block a user