PE_DMA perf: dual-peak utilisation (single-path + aggregate)
Each scenario now shows TWO bars:
util_single = effective_bw / single-path peak × 100
(peak = min bw_gbs on first issuer's path)
util_aggregate = effective_bw / aggregate-resource peak × 100
(peak = max-min fair share across concurrent paths)
Aggregate peak uses a max-min fair-share computation: each concurrent
path's sustainable share on an edge is bw_gbs / usage_count, the
per-path throughput is the min share along its edges, and the aggregate
peak is the sum across paths. This produces the correct answer for both
shared-bottleneck scenarios (N paths converge on one wire → aggregate =
wire BW) and multi-lane shared resources (UCIe's 4 connections used in
parallel → aggregate ≈ 4 × per-conn BW), without enumerating max-flow.
Single-issuer (no_congestion) → util_single == util_aggregate by
definition. Congestion exposes the divergence:
ctrl_hot_{1,2,3}, all_pe_to_pe0 → both metrics agree (one shared
bottleneck: r0c0→hbm_ctrl.pe0 @ 256 GB/s)
8×PE eastbound → util_single=106 % (single conn @ 128 GB/s) but
util_aggregate=85 % (UCIe-W.conn0 @ 7-way shared,
aggregate peak ≈ 160 GB/s under the current
cross-cube routing that funnels via cube1.r0c0).
Verification updated to assert:
(2) util_aggregate ≤ 100 % (effective BW can't exceed the aggregate
resource peak, by construction).
(3) single-issuer util_single == util_aggregate.
(7) ucie_eastbound: util_aggregate is meaningfully smaller than
util_single (the multi-lane peak correction is observable).
CSV grows with peak_aggregate_bw_gbs and util_aggregate_pct columns;
breakdown columns retained.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
|
Before Width: | Height: | Size: 58 KiB After Width: | Height: | Size: 68 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 67 KiB After Width: | Height: | Size: 73 KiB |
@@ -1,24 +1,24 @@
|
|||||||
graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,bottleneck_bw_gbs,effective_bw_gbs,util_pct,pe_setup,noc_mesh,ucie,fabric,streaming,hbm_ctrl,contention,path,first_path
|
graph,scenario,label,nbytes,n_issuers,total_ns,makespan_ns,min_lat_ns,peak_single_bw_gbs,peak_aggregate_bw_gbs,effective_bw_gbs,util_single_pct,util_aggregate_pct,pe_setup,noc_mesh,ucie,fabric,streaming,hbm_ctrl,contention,path,first_path
|
||||||
no_congestion,local,"SAME_CUBE
|
no_congestion,local,"SAME_CUBE
|
||||||
PE_LOCAL",16384,1,77.0,,,256.0,212.7792207792208,83.11688311688312,1.0,2.0,0.0,0.0,63.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0,
|
PE_LOCAL",16384,1,77.0,,,256.0,256.0,212.7792207792208,83.11688311688312,83.11688311688312,1.0,2.0,0.0,0.0,63.0,9.0,2.0,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0,
|
||||||
no_congestion,same_cube_best,"SAME_CUBE
|
no_congestion,same_cube_best,"SAME_CUBE
|
||||||
REMOTE_BEST
|
REMOTE_BEST
|
||||||
(pe0→pe1)",16384,1,82.06,,,256.0,199.6587862539605,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,pe0.pe_dma -> cube0.r0c0 -> cube0.r0c1 -> hbm_ctrl.pe1,
|
(pe0→pe1)",16384,1,82.06,,,256.0,256.0,199.6587862539605,77.99171338045332,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,pe0.pe_dma -> cube0.r0c0 -> cube0.r0c1 -> hbm_ctrl.pe1,
|
||||||
no_congestion,same_cube_worst,"SAME_CUBE
|
no_congestion,same_cube_worst,"SAME_CUBE
|
||||||
REMOTE_WORST
|
REMOTE_WORST
|
||||||
(pe0→pe7)",16384,1,117.50000000000001,,,256.0,139.4382978723404,54.46808510638297,1.0,26.25,0.0,0.0,63.0,9.0,18.250000000000014,pe0.pe_dma -> cube0.r0c0 -> cube0.r1c0 -> cube0.r1c1 -> cube0.r1c2 -> cube0.r1c3 -> cube0.r4c3 -> cube0.r4c4 -> cube0.r5c4 -> cube0.r5c5 -> hbm_ctrl.pe7,
|
(pe0→pe7)",16384,1,117.50000000000001,,,256.0,256.0,139.4382978723404,54.46808510638297,54.46808510638297,1.0,26.25,0.0,0.0,63.0,9.0,18.250000000000014,pe0.pe_dma -> cube0.r0c0 -> cube0.r1c0 -> cube0.r1c1 -> cube0.r1c2 -> cube0.r1c3 -> cube0.r4c3 -> cube0.r4c4 -> cube0.r5c4 -> cube0.r5c5 -> hbm_ctrl.pe7,
|
||||||
no_congestion,remote_cube_best,"REMOTE_CUBE
|
no_congestion,remote_cube_best,"REMOTE_CUBE
|
||||||
REMOTE_BEST
|
REMOTE_BEST
|
||||||
(cube0→cube1)",16384,1,202.51999999999998,,,128.0,80.90065178747778,63.20363420896702,1.0,6.0,32.510000000000005,0.0,126.0,9.0,28.00999999999999,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0,
|
(cube0→cube1)",16384,1,202.51999999999998,,,128.0,128.0,80.90065178747778,63.20363420896702,63.20363420896702,1.0,6.0,32.510000000000005,0.0,126.0,9.0,28.00999999999999,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0,
|
||||||
no_congestion,remote_cube_worst,"REMOTE_CUBE
|
no_congestion,remote_cube_worst,"REMOTE_CUBE
|
||||||
REMOTE_WORST
|
REMOTE_WORST
|
||||||
(cube0→cube15.pe7)",16384,1,573.1199999999999,,,128.0,28.587381351200452,22.333891680625353,1.0,30.0,219.05999999999995,0.0,126.0,9.0,188.05999999999995,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> ucie-N.conn0 -> cube1.ucie-N -> ucie-N.conn3 -> cube1.r0c5 -> ucie-E.conn0 -> cube1.ucie-E -> cube2.ucie-W -> ucie-W.conn0 -> cube2.r0c0 -> ucie-N.conn0 -> cube2.ucie-N -> ucie-N.conn3 -> cube2.r0c5 -> ucie-E.conn0 -> cube2.ucie-E -> cube3.ucie-W -> ucie-W.conn0 -> cube3.r0c0 -> ucie-N.conn0 -> cube3.ucie-N -> ucie-N.conn3 -> cube3.r0c5 -> ucie-E.conn0 -> cube3.ucie-E -> ucie-E.conn3 -> cube3.r5c5 -> ucie-S.conn3 -> cube3.ucie-S -> cube7.ucie-N -> ucie-N.conn3 -> cube7.r0c5 -> ucie-E.conn0 -> cube7.ucie-E -> ucie-E.conn3 -> cube7.r5c5 -> ucie-S.conn3 -> cube7.ucie-S -> cube11.ucie-N -> ucie-N.conn3 -> cube11.r0c5 -> ucie-E.conn0 -> cube11.ucie-E -> ucie-E.conn3 -> cube11.r5c5 -> ucie-S.conn3 -> cube11.ucie-S -> cube15.ucie-N -> ucie-N.conn3 -> cube15.r0c5 -> ucie-E.conn0 -> cube15.ucie-E -> ucie-E.conn3 -> cube15.r5c5 -> hbm_ctrl.pe7,
|
(cube0→cube15.pe7)",16384,1,573.1199999999999,,,128.0,128.0,28.587381351200452,22.333891680625353,22.333891680625353,1.0,30.0,219.05999999999995,0.0,126.0,9.0,188.05999999999995,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> ucie-N.conn0 -> cube1.ucie-N -> ucie-N.conn3 -> cube1.r0c5 -> ucie-E.conn0 -> cube1.ucie-E -> cube2.ucie-W -> ucie-W.conn0 -> cube2.r0c0 -> ucie-N.conn0 -> cube2.ucie-N -> ucie-N.conn3 -> cube2.r0c5 -> ucie-E.conn0 -> cube2.ucie-E -> cube3.ucie-W -> ucie-W.conn0 -> cube3.r0c0 -> ucie-N.conn0 -> cube3.ucie-N -> ucie-N.conn3 -> cube3.r0c5 -> ucie-E.conn0 -> cube3.ucie-E -> ucie-E.conn3 -> cube3.r5c5 -> ucie-S.conn3 -> cube3.ucie-S -> cube7.ucie-N -> ucie-N.conn3 -> cube7.r0c5 -> ucie-E.conn0 -> cube7.ucie-E -> ucie-E.conn3 -> cube7.r5c5 -> ucie-S.conn3 -> cube7.ucie-S -> cube11.ucie-N -> ucie-N.conn3 -> cube11.r0c5 -> ucie-E.conn0 -> cube11.ucie-E -> ucie-E.conn3 -> cube11.r5c5 -> ucie-S.conn3 -> cube11.ucie-S -> cube15.ucie-N -> ucie-N.conn3 -> cube15.r0c5 -> ucie-E.conn0 -> cube15.ucie-E -> ucie-E.conn3 -> cube15.r5c5 -> hbm_ctrl.pe7,
|
||||||
no_congestion,remote_sip,"REMOTE_SIP
|
no_congestion,remote_sip,"REMOTE_SIP
|
||||||
SAME_CUBE_SAME_PE
|
SAME_CUBE_SAME_PE
|
||||||
(sip0→sip1)",16384,1,408.5216666666663,,,128.0,40.10558395515541,31.332487464965165,1.0,4.0,37.040000000000006,22.09666666666667,126.0,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0,
|
(sip0→sip1)",16384,1,408.5216666666663,,,128.0,128.0,40.10558395515541,31.332487464965165,31.332487464965165,1.0,4.0,37.040000000000006,22.09666666666667,126.0,9.0,209.38499999999962,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> io0.ucie-P0 -> ucie-P0.conn0 -> io0.noc -> io0.pcie_ep -> fabric.switch0 -> io0.pcie_ep -> io0.noc -> ucie-P0.conn0 -> io0.ucie-P0 -> cube0.ucie-N -> ucie-N.conn0 -> cube0.r0c0 -> hbm_ctrl.pe0,
|
||||||
congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,256.0,199.6587862539605,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
congestion,ctrl_hot_1,1×PE → pe0_slice,16384,1,,82.06,82.06,256.0,256.0,199.6587862539605,77.99171338045332,77.99171338045332,1.0,5.03,0.0,0.0,63.0,9.0,4.030000000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,256.0,206.94054122327813,80.83614891534302,1.0,5.03,0.0,0.0,63.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
congestion,ctrl_hot_2,2×PE → pe0_slice,16384,2,,158.3450000000001,134.2400000000001,256.0,256.0,206.94054122327813,80.83614891534302,80.83614891534302,1.0,5.03,0.0,0.0,63.0,9.0,80.31500000000011,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,256.0,213.6346843420623,83.45104857111808,1.0,5.03,0.0,0.0,63.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
congestion,ctrl_hot_3,3×PE → pe0_slice,16384,3,,230.0750000000001,139.94000000000008,256.0,256.0,213.6346843420623,83.45104857111808,83.45104857111808,1.0,5.03,0.0,0.0,63.0,9.0,152.0450000000001,,pe1.pe_dma -> cube0.r0c1 -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
congestion,ucie_eastbound,"8×PE corresp.
|
congestion,ucie_eastbound,"8×PE corresp.
|
||||||
cube0→cube1",16384,8,,962.52,438.52,128.0,136.17587167019906,106.387399742343,1.0,6.0,32.510000000000005,0.0,126.0,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0
|
cube0→cube1",16384,8,,962.52,438.52,128.0,159.99999999999997,136.17587167019906,106.387399742343,85.10991979387443,1.0,6.0,32.510000000000005,0.0,126.0,9.0,788.01,,pe0.pe_dma -> cube0.r0c0 -> ucie-N.conn0 -> cube0.ucie-N -> ucie-N.conn3 -> cube0.r0c5 -> ucie-E.conn0 -> cube0.ucie-E -> cube1.ucie-W -> ucie-W.conn0 -> cube1.r0c0 -> hbm_ctrl.pe0
|
||||||
congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,256.0,234.7908643081058,91.71518137035383,1.0,2.0,0.0,0.0,63.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
congestion,all_pe_to_pe0,8×PE → pe0_slice,16384,8,,558.2499999999998,195.0,256.0,256.0,234.7908643081058,91.71518137035383,91.71518137035383,1.0,2.0,0.0,0.0,63.0,9.0,483.2499999999998,,pe0.pe_dma -> cube0.r0c0 -> hbm_ctrl.pe0
|
||||||
|
|||||||
|
+160
-61
@@ -136,6 +136,48 @@ def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
|
|||||||
return min(bws) if bws else None
|
return min(bws) if bws else None
|
||||||
|
|
||||||
|
|
||||||
|
def _aggregate_peak_bw(paths: list[list[str]], edge_map: dict) -> float:
    """Estimate the aggregate peak bandwidth (GB/s) of concurrent paths.

    Equal-split fair-share estimate: every directed edge divides its
    ``bw_gbs`` evenly between the path traversals that cross it, a path's
    sustainable throughput is the smallest such share along its edges,
    and the aggregate peak is the sum of the per-path throughputs.

    This gives the expected answer for shared-bottleneck scenarios (all
    paths converge on one wire → aggregate = wire BW) and for multi-lane
    shared resources (paths spread over several parallel connections →
    aggregate = sum of the lanes in use), without enumerating max-flow.

    NOTE: capacity that a path cannot use because it is bottlenecked on
    another edge is *not* redistributed to the remaining paths, so the
    result can be lower than a progressive-filling max-min allocation.

    Args:
        paths: One node-name sequence per concurrent issuer. ``None`` or
            empty entries (no route found) are skipped rather than
            raising, and paths with fewer than two nodes contribute 0.
        edge_map: Maps ``(src_node, dst_node)`` to an edge object exposing
            ``bw_gbs``. Hops missing from the map, or whose ``bw_gbs`` is
            falsy (``None`` / ``0``), place no constraint on the path.

    Returns:
        Sum of per-path sustainable throughputs; ``0.0`` when no path
        crosses an edge with a known bandwidth.

    Examples:
        * 3 paths sharing r0c0→hbm_ctrl.pe0 @ 256 GB/s
          per-path = 256/3 ≈ 85.3, aggregate = 3 × 85.3 = 256 GB/s
        * 8 paths sharing 4 UCIe conns @ 128 GB/s (2 paths per conn)
          per-path = 128/2 = 64, aggregate = 8 × 64 = 512 GB/s
        * 1 path through 256 GB/s bottleneck
          per-path = 256, aggregate = 256 GB/s (= single-path peak)
    """
    from collections import Counter

    # Materialise each usable path as its list of directed hops once, so
    # the counting pass and the share pass walk exactly the same edges.
    # ``if path`` drops None / empty entries produced by a failed route
    # lookup instead of crashing on them.
    hop_lists = [list(zip(path, path[1:])) for path in paths if path]

    # Number of path traversals crossing each directed edge.
    edge_usage = Counter(hop for hops in hop_lists for hop in hops)

    aggregate = 0.0
    for hops in hop_lists:
        shares = []
        for hop in hops:
            edge = edge_map.get(hop)
            if edge and edge.bw_gbs:
                shares.append(edge.bw_gbs / edge_usage[hop])
        # A path with no bandwidth-annotated edge contributes nothing.
        if shares:
            aggregate += min(shares)
    return aggregate
|
||||||
|
|
||||||
|
|
||||||
def _path_breakdown(
|
def _path_breakdown(
|
||||||
path: list[str], nbytes: int, graph, edge_map, ns_per_mm: float,
|
path: list[str], nbytes: int, graph, edge_map, ns_per_mm: float,
|
||||||
) -> dict[str, float]:
|
) -> dict[str, float]:
|
||||||
@@ -257,9 +299,11 @@ def _run_no_congestion(nbytes: int):
|
|||||||
br = _path_breakdown(path, nbytes, graph, edge_map, ns_per_mm)
|
br = _path_breakdown(path, nbytes, graph, edge_map, ns_per_mm)
|
||||||
formula_sum = sum(br.values())
|
formula_sum = sum(br.values())
|
||||||
br["contention"] = max(0.0, total_ns - formula_sum)
|
br["contention"] = max(0.0, total_ns - formula_sum)
|
||||||
peak_bw = _bottleneck_bw(path, edge_map) or 0.0
|
peak_single = _bottleneck_bw(path, edge_map) or 0.0
|
||||||
|
peak_aggregate = _aggregate_peak_bw([path], edge_map)
|
||||||
eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
|
eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
|
||||||
util = (eff_bw / peak_bw * 100.0) if peak_bw > 0 else 0.0
|
util_single = (eff_bw / peak_single * 100.0) if peak_single > 0 else 0.0
|
||||||
|
util_aggregate = (eff_bw / peak_aggregate * 100.0) if peak_aggregate > 0 else 0.0
|
||||||
rows.append({
|
rows.append({
|
||||||
"graph": "no_congestion",
|
"graph": "no_congestion",
|
||||||
"scenario": scn.name,
|
"scenario": scn.name,
|
||||||
@@ -268,9 +312,11 @@ def _run_no_congestion(nbytes: int):
|
|||||||
"n_issuers": 1,
|
"n_issuers": 1,
|
||||||
"path": " -> ".join(_short_path(path)),
|
"path": " -> ".join(_short_path(path)),
|
||||||
"total_ns": total_ns,
|
"total_ns": total_ns,
|
||||||
"bottleneck_bw_gbs": peak_bw,
|
"peak_single_bw_gbs": peak_single,
|
||||||
|
"peak_aggregate_bw_gbs": peak_aggregate,
|
||||||
"effective_bw_gbs": eff_bw,
|
"effective_bw_gbs": eff_bw,
|
||||||
"util_pct": util,
|
"util_single_pct": util_single,
|
||||||
|
"util_aggregate_pct": util_aggregate,
|
||||||
**{c: br.get(c, 0.0) for c, _ in CATEGORIES},
|
**{c: br.get(c, 0.0) for c, _ in CATEGORIES},
|
||||||
})
|
})
|
||||||
return rows
|
return rows
|
||||||
@@ -333,7 +379,7 @@ def _run_congestion(nbytes: int):
|
|||||||
for scn in _congestion_scenarios():
|
for scn in _congestion_scenarios():
|
||||||
engine = GraphEngine(load_topology(TOPOLOGY_PATH))
|
engine = GraphEngine(load_topology(TOPOLOGY_PATH))
|
||||||
handles = []
|
handles = []
|
||||||
first_path = None
|
paths: list[list[str]] = []
|
||||||
for i, (ss, sc, sp, ds, dc, dp) in enumerate(scn.issues):
|
for i, (ss, sc, sp, ds, dc, dp) in enumerate(scn.issues):
|
||||||
pa = _hbm_pa(sip=ds, cube=dc, pe_id=dp,
|
pa = _hbm_pa(sip=ds, cube=dc, pe_id=dp,
|
||||||
offset=0x1000 + i * 0x100, slice_bytes=slice_bytes)
|
offset=0x1000 + i * 0x100, slice_bytes=slice_bytes)
|
||||||
@@ -343,10 +389,10 @@ def _run_congestion(nbytes: int):
|
|||||||
dst_pa=pa, nbytes=nbytes,
|
dst_pa=pa, nbytes=nbytes,
|
||||||
)
|
)
|
||||||
handles.append(engine.submit(msg))
|
handles.append(engine.submit(msg))
|
||||||
if first_path is None:
|
|
||||||
dst_node = engine._resolver.resolve(PhysAddr.decode(pa))
|
dst_node = engine._resolver.resolve(PhysAddr.decode(pa))
|
||||||
first_path = engine._router.find_path(
|
paths.append(engine._router.find_path(
|
||||||
f"sip{ss}.cube{sc}.pe{sp}", dst_node)
|
f"sip{ss}.cube{sc}.pe{sp}", dst_node))
|
||||||
|
first_path = paths[0] if paths else []
|
||||||
for h in handles:
|
for h in handles:
|
||||||
engine.wait(h)
|
engine.wait(h)
|
||||||
latencies = [engine.get_completion(h)[1]["total_ns"] for h in handles]
|
latencies = [engine.get_completion(h)[1]["total_ns"] for h in handles]
|
||||||
@@ -354,25 +400,29 @@ def _run_congestion(nbytes: int):
|
|||||||
|
|
||||||
# Breakdown uses the first issuer's path as a representative;
|
# Breakdown uses the first issuer's path as a representative;
|
||||||
# ``contention`` absorbs serialization across requests.
|
# ``contention`` absorbs serialization across requests.
|
||||||
br = _path_breakdown(first_path or [], nbytes, graph, edge_map, ns_per_mm)
|
br = _path_breakdown(first_path, nbytes, graph, edge_map, ns_per_mm)
|
||||||
formula_sum = sum(br.values())
|
formula_sum = sum(br.values())
|
||||||
br["contention"] = max(0.0, makespan - formula_sum)
|
br["contention"] = max(0.0, makespan - formula_sum)
|
||||||
peak_bw = (_bottleneck_bw(first_path or [], edge_map) or 0.0)
|
peak_single = _bottleneck_bw(first_path, edge_map) or 0.0
|
||||||
|
peak_aggregate = _aggregate_peak_bw(paths, edge_map)
|
||||||
total_bytes = nbytes * len(scn.issues)
|
total_bytes = nbytes * len(scn.issues)
|
||||||
eff_bw = total_bytes / makespan if makespan > 0 else 0.0
|
eff_bw = total_bytes / makespan if makespan > 0 else 0.0
|
||||||
util = (eff_bw / peak_bw * 100.0) if peak_bw > 0 else 0.0
|
util_single = (eff_bw / peak_single * 100.0) if peak_single > 0 else 0.0
|
||||||
|
util_aggregate = (eff_bw / peak_aggregate * 100.0) if peak_aggregate > 0 else 0.0
|
||||||
rows.append({
|
rows.append({
|
||||||
"graph": "congestion",
|
"graph": "congestion",
|
||||||
"scenario": scn.name,
|
"scenario": scn.name,
|
||||||
"label": scn.label,
|
"label": scn.label,
|
||||||
"nbytes": nbytes,
|
"nbytes": nbytes,
|
||||||
"n_issuers": len(scn.issues),
|
"n_issuers": len(scn.issues),
|
||||||
"first_path": " -> ".join(_short_path(first_path or [])),
|
"first_path": " -> ".join(_short_path(first_path)),
|
||||||
"makespan_ns": makespan,
|
"makespan_ns": makespan,
|
||||||
"min_lat_ns": min(latencies) if latencies else 0.0,
|
"min_lat_ns": min(latencies) if latencies else 0.0,
|
||||||
"bottleneck_bw_gbs": peak_bw,
|
"peak_single_bw_gbs": peak_single,
|
||||||
|
"peak_aggregate_bw_gbs": peak_aggregate,
|
||||||
"effective_bw_gbs": eff_bw,
|
"effective_bw_gbs": eff_bw,
|
||||||
"util_pct": util,
|
"util_single_pct": util_single,
|
||||||
|
"util_aggregate_pct": util_aggregate,
|
||||||
**{c: br.get(c, 0.0) for c, _ in CATEGORIES},
|
**{c: br.get(c, 0.0) for c, _ in CATEGORIES},
|
||||||
})
|
})
|
||||||
return rows
|
return rows
|
||||||
@@ -386,39 +436,60 @@ def _short_path(path: Iterable[str]) -> list[str]:
|
|||||||
|
|
||||||
|
|
||||||
def _plot_bw_utilization(rows, title, out_path):
|
def _plot_bw_utilization(rows, title, out_path):
|
||||||
"""Plot Effective BW utilization (%) per scenario.
|
"""Plot Effective BW utilization (%) per scenario with TWO bars:
|
||||||
|
|
||||||
Each bar is util_pct = effective_bw / peak_bottleneck_bw × 100.
|
util_single = effective_bw / single-path peak × 100
|
||||||
Annotation shows effective and peak in GB/s. A horizontal dashed
|
util_aggregate = effective_bw / aggregate-resource peak × 100
|
||||||
line marks 100 % (single-path peak); bars exceeding it indicate
|
|
||||||
the scenario uses multiple parallel resources (e.g. UCIe's 4
|
The aggregate peak sums the BW of *distinct* bottleneck edges across
|
||||||
connections) beyond the bottleneck of any single path.
|
all issuer paths — modelling multi-lane shared resources (e.g. UCIe's
|
||||||
|
4 connections) correctly. For scenarios where all paths share one
|
||||||
|
bottleneck wire the two peaks are equal and the bars match.
|
||||||
|
|
||||||
|
The dashed line at 100 % is the saturation reference for both
|
||||||
|
metrics. util_single can exceed 100 % when multi-lane resources are
|
||||||
|
used; util_aggregate is bounded by 100 % by construction (since the
|
||||||
|
aggregate peak is the upper bound on aggregate throughput).
|
||||||
"""
|
"""
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
n = len(rows)
|
n = len(rows)
|
||||||
labels = [r["label"] for r in rows]
|
labels = [r["label"] for r in rows]
|
||||||
util = [r.get("util_pct", 0.0) for r in rows]
|
util_s = [r.get("util_single_pct", 0.0) for r in rows]
|
||||||
|
util_a = [r.get("util_aggregate_pct", 0.0) for r in rows]
|
||||||
eff = [r.get("effective_bw_gbs", 0.0) for r in rows]
|
eff = [r.get("effective_bw_gbs", 0.0) for r in rows]
|
||||||
peak = [r.get("bottleneck_bw_gbs", 0.0) for r in rows]
|
peak_s = [r.get("peak_single_bw_gbs", 0.0) for r in rows]
|
||||||
|
peak_a = [r.get("peak_aggregate_bw_gbs", 0.0) for r in rows]
|
||||||
|
|
||||||
fig, ax = plt.subplots(figsize=(max(8, n * 1.4), 5.5))
|
fig, ax = plt.subplots(figsize=(max(9, n * 1.6), 6.0))
|
||||||
# Colour bars by utilization band for quick scanning.
|
x = np.arange(n)
|
||||||
colours = ["#10b981" if u >= 70 else "#f59e0b" if u >= 40 else "#ef4444"
|
w = 0.38
|
||||||
for u in util]
|
ax.bar(x - w / 2, util_s, w, color="#6366f1",
|
||||||
ax.bar(labels, util, color=colours, edgecolor="white", linewidth=0.5)
|
edgecolor="white", linewidth=0.5,
|
||||||
|
label="util vs single-path peak")
|
||||||
|
ax.bar(x + w / 2, util_a, w, color="#10b981",
|
||||||
|
edgecolor="white", linewidth=0.5,
|
||||||
|
label="util vs aggregate-resource peak")
|
||||||
ax.axhline(100.0, color="grey", linestyle="--", linewidth=0.8,
|
ax.axhline(100.0, color="grey", linestyle="--", linewidth=0.8,
|
||||||
label="single-path peak")
|
label="saturation (100 %)")
|
||||||
|
|
||||||
# Annotate each bar with util%, effective, and peak.
|
y_max = max(util_s + util_a + [100.0]) * 1.30
|
||||||
y_max = max(util + [100.0]) * 1.2
|
for i in range(n):
|
||||||
for i, (u, e, p) in enumerate(zip(util, eff, peak)):
|
ax.text(i - w / 2, util_s[i] + y_max * 0.012,
|
||||||
ax.text(i, u + y_max * 0.012,
|
f"{util_s[i]:.0f}%\n/{peak_s[i]:.0f}",
|
||||||
f"{u:.1f}%\n{e:.0f} / {p:.0f} GB/s",
|
ha="center", va="bottom", fontsize=7)
|
||||||
ha="center", va="bottom", fontsize=8)
|
ax.text(i + w / 2, util_a[i] + y_max * 0.012,
|
||||||
|
f"{util_a[i]:.0f}%\n/{peak_a[i]:.0f}",
|
||||||
|
ha="center", va="bottom", fontsize=7)
|
||||||
|
# Effective BW annotation underneath each pair
|
||||||
|
ax.text(i, -y_max * 0.04, f"eff={eff[i]:.0f} GB/s",
|
||||||
|
ha="center", va="top", fontsize=7, color="#444444")
|
||||||
|
|
||||||
|
ax.set_xticks(x)
|
||||||
|
ax.set_xticklabels(labels, fontsize=8)
|
||||||
ax.set_ylabel("Effective BW utilization (%)")
|
ax.set_ylabel("Effective BW utilization (%)")
|
||||||
ax.set_title(title)
|
ax.set_title(title)
|
||||||
ax.set_ylim(0, y_max)
|
ax.set_ylim(-y_max * 0.10, y_max)
|
||||||
ax.tick_params(axis="x", labelsize=8)
|
|
||||||
ax.legend(loc="upper right", fontsize=9, frameon=False)
|
ax.legend(loc="upper right", fontsize=9, frameon=False)
|
||||||
fig.tight_layout()
|
fig.tight_layout()
|
||||||
fig.savefig(out_path, dpi=150)
|
fig.savefig(out_path, dpi=150)
|
||||||
@@ -432,7 +503,8 @@ def _write_csv(no_cong_rows, cong_rows, out_path):
|
|||||||
fields = [
|
fields = [
|
||||||
"graph", "scenario", "label", "nbytes", "n_issuers",
|
"graph", "scenario", "label", "nbytes", "n_issuers",
|
||||||
"total_ns", "makespan_ns", "min_lat_ns",
|
"total_ns", "makespan_ns", "min_lat_ns",
|
||||||
"bottleneck_bw_gbs", "effective_bw_gbs", "util_pct",
|
"peak_single_bw_gbs", "peak_aggregate_bw_gbs", "effective_bw_gbs",
|
||||||
|
"util_single_pct", "util_aggregate_pct",
|
||||||
"pe_setup", "noc_mesh", "ucie", "fabric", "streaming",
|
"pe_setup", "noc_mesh", "ucie", "fabric", "streaming",
|
||||||
"hbm_ctrl", "contention",
|
"hbm_ctrl", "contention",
|
||||||
"path", "first_path",
|
"path", "first_path",
|
||||||
@@ -485,26 +557,37 @@ def _verify(rows_no_cong, rows_cong) -> list[str]:
|
|||||||
)
|
)
|
||||||
prev_bw = min(prev_bw, by_name.get(n, {}).get("effective_bw_gbs", prev_bw))
|
prev_bw = min(prev_bw, by_name.get(n, {}).get("effective_bw_gbs", prev_bw))
|
||||||
|
|
||||||
# (2) Utilisation in (0, 250 %]; values > 100 only allowed on shared
|
# (2) util_single in (0, 250 %]; util_aggregate in (0, 100 + ε %]
|
||||||
# multi-lane resources (UCIe per_conn × 4 → 4-fold parallelism).
|
|
||||||
for r in rows_no_cong + rows_cong:
|
for r in rows_no_cong + rows_cong:
|
||||||
u = r.get("util_pct", 0.0)
|
us = r.get("util_single_pct", 0.0)
|
||||||
if u <= 0:
|
ua = r.get("util_aggregate_pct", 0.0)
|
||||||
issues.append(f"{r['scenario']}: non-positive util_pct={u}")
|
if us <= 0 or ua <= 0:
|
||||||
if u > 250:
|
issues.append(f"{r['scenario']}: non-positive util "
|
||||||
|
f"(single={us}, agg={ua})")
|
||||||
|
if us > 250:
|
||||||
issues.append(
|
issues.append(
|
||||||
f"{r['scenario']}: util_pct={u:.1f}% exceeds 250 % — "
|
f"{r['scenario']}: util_single={us:.1f}% > 250 % — "
|
||||||
f"likely a peak-BW or effective-BW miscompute"
|
f"likely a peak or effective BW miscompute"
|
||||||
|
)
|
||||||
|
if ua > 100.0 + 1.0: # 1 % numerical slack
|
||||||
|
issues.append(
|
||||||
|
f"{r['scenario']}: util_aggregate={ua:.1f}% > 100 % — "
|
||||||
|
f"effective BW must not exceed the aggregate resource peak"
|
||||||
)
|
)
|
||||||
|
|
||||||
# (3) Single-issuer utilisation cannot exceed 100 %.
|
# (3) Single-issuer utilisation (both metrics) cannot exceed 100 %.
|
||||||
for r in rows_no_cong:
|
for r in rows_no_cong:
|
||||||
u = r.get("util_pct", 0.0)
|
us = r.get("util_single_pct", 0.0)
|
||||||
if u > 100.0 + 1e-3:
|
ua = r.get("util_aggregate_pct", 0.0)
|
||||||
|
if us > 100.0 + 1e-3:
|
||||||
issues.append(
|
issues.append(
|
||||||
f"no_congestion {r['scenario']}: util_pct={u:.1f}% > 100% "
|
f"no_congestion {r['scenario']}: util_single={us:.1f}% > 100% "
|
||||||
f"for single-issuer scenario (eff={r['effective_bw_gbs']:.1f}, "
|
f"for a single-issuer scenario"
|
||||||
f"peak={r['bottleneck_bw_gbs']:.1f})"
|
)
|
||||||
|
if abs(us - ua) > 1e-3:
|
||||||
|
issues.append(
|
||||||
|
f"no_congestion {r['scenario']}: util_single ({us:.1f}) != "
|
||||||
|
f"util_aggregate ({ua:.1f}) — should match for single issuer"
|
||||||
)
|
)
|
||||||
|
|
||||||
# (4) Effective BW for a single request = nbytes / total_ns
|
# (4) Effective BW for a single request = nbytes / total_ns
|
||||||
@@ -518,7 +601,7 @@ def _verify(rows_no_cong, rows_cong) -> list[str]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# (5) Congestion aggregate BW grows monotonically with issuer count on
|
# (5) Congestion aggregate BW grows monotonically with issuer count on
|
||||||
# the hot-target series (same shared bottleneck, more bytes / same peak).
|
# the hot-target series.
|
||||||
seq = ["ctrl_hot_1", "ctrl_hot_2", "ctrl_hot_3"]
|
seq = ["ctrl_hot_1", "ctrl_hot_2", "ctrl_hot_3"]
|
||||||
last = 0.0
|
last = 0.0
|
||||||
for n in seq:
|
for n in seq:
|
||||||
@@ -529,17 +612,27 @@ def _verify(rows_no_cong, rows_cong) -> list[str]:
|
|||||||
)
|
)
|
||||||
last = max(last, cong_map.get(n, {}).get("effective_bw_gbs", last))
|
last = max(last, cong_map.get(n, {}).get("effective_bw_gbs", last))
|
||||||
|
|
||||||
# (6) all_pe_to_pe0 must approach single-path peak (≥ 70 % util) —
|
# (6) all_pe_to_pe0 must approach the shared single-path peak.
|
||||||
# the shared r0c0 → hbm_ctrl.pe0 bottleneck is fully amortised when
|
|
||||||
# all 8 PEs target it.
|
|
||||||
if "all_pe_to_pe0" in cong_map:
|
if "all_pe_to_pe0" in cong_map:
|
||||||
u = cong_map["all_pe_to_pe0"]["util_pct"]
|
u = cong_map["all_pe_to_pe0"]["util_single_pct"]
|
||||||
if u < 70.0:
|
if u < 70.0:
|
||||||
issues.append(
|
issues.append(
|
||||||
f"congestion all_pe_to_pe0: util_pct={u:.1f}% < 70 % — "
|
f"congestion all_pe_to_pe0: util_single={u:.1f}% < 70 % — "
|
||||||
f"8-PE hotspot should saturate the shared HBM CTRL path"
|
f"8-PE hotspot should saturate the shared HBM CTRL path"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# (7) ucie_eastbound: util_aggregate should be meaningfully smaller
|
||||||
|
# than util_single (the multi-lane peak should pull the bar down).
|
||||||
|
if "ucie_eastbound" in cong_map:
|
||||||
|
e = cong_map["ucie_eastbound"]
|
||||||
|
if e["util_aggregate_pct"] >= e["util_single_pct"] - 5.0:
|
||||||
|
issues.append(
|
||||||
|
f"congestion ucie_eastbound: util_aggregate "
|
||||||
|
f"({e['util_aggregate_pct']:.1f}%) should be << "
|
||||||
|
f"util_single ({e['util_single_pct']:.1f}%) when UCIe's "
|
||||||
|
f"multi-lane peak applies"
|
||||||
|
)
|
||||||
|
|
||||||
return issues
|
return issues
|
||||||
|
|
||||||
|
|
||||||
@@ -558,15 +651,21 @@ def main(nbytes: int = DEFAULT_NBYTES) -> int:
|
|||||||
print("\n-- No-congestion summary --")
|
print("\n-- No-congestion summary --")
|
||||||
for r in no_cong:
|
for r in no_cong:
|
||||||
print(f" {r['scenario']:22s} total={r['total_ns']:7.1f} ns "
|
print(f" {r['scenario']:22s} total={r['total_ns']:7.1f} ns "
|
||||||
f"eff={r['effective_bw_gbs']:6.1f} peak={r['bottleneck_bw_gbs']:6.1f} "
|
f"eff={r['effective_bw_gbs']:6.1f} GB/s "
|
||||||
f"GB/s util={r['util_pct']:5.1f}%")
|
f"peak_s={r['peak_single_bw_gbs']:6.1f} "
|
||||||
|
f"peak_a={r['peak_aggregate_bw_gbs']:6.1f} "
|
||||||
|
f"util_s={r['util_single_pct']:5.1f}% "
|
||||||
|
f"util_a={r['util_aggregate_pct']:5.1f}%")
|
||||||
print("\n-- Congestion summary --")
|
print("\n-- Congestion summary --")
|
||||||
for r in cong:
|
for r in cong:
|
||||||
agg_bytes = r["nbytes"] * r["n_issuers"]
|
agg_bytes = r["nbytes"] * r["n_issuers"]
|
||||||
print(f" {r['scenario']:22s} makespan={r['makespan_ns']:7.1f} ns "
|
print(f" {r['scenario']:22s} makespan={r['makespan_ns']:7.1f} ns "
|
||||||
f"agg_bytes={agg_bytes:>7d} "
|
f"agg_bytes={agg_bytes:>7d} "
|
||||||
f"eff={r['effective_bw_gbs']:6.1f} peak={r['bottleneck_bw_gbs']:6.1f} "
|
f"eff={r['effective_bw_gbs']:6.1f} GB/s "
|
||||||
f"GB/s util={r['util_pct']:5.1f}%")
|
f"peak_s={r['peak_single_bw_gbs']:6.1f} "
|
||||||
|
f"peak_a={r['peak_aggregate_bw_gbs']:6.1f} "
|
||||||
|
f"util_s={r['util_single_pct']:5.1f}% "
|
||||||
|
f"util_a={r['util_aggregate_pct']:5.1f}%")
|
||||||
|
|
||||||
issues = _verify(no_cong, cong)
|
issues = _verify(no_cong, cong)
|
||||||
print("\n-- Self-verification --")
|
print("\n-- Self-verification --")
|
||||||
|
|||||||
Reference in New Issue
Block a user