Intercube allreduce: center root + bidirectional reduce

Move the algorithmic root cube from the corner (cube_w-1,
cube_h-1) to the geometric center (cube_w//2, cube_h//2) and
have each phase converge bidirectionally so the intra-SIP
critical path drops from ~12 hops to ~8 hops on a 4×4 mesh
(left half W→E + right half E→W in row reduce; top half N→S +
bottom half S→N in col reduce; mirrored on broadcast).

Result on torus_2d 6 SIPs at 96 KB / PE on TCM:
  before (corner root)  : 22.0 µs
  after  (center root)  : 17.2 µs   (−22%)

Same shape on ring_1d (−7%) and mesh_2d_no_wrap (−12%); also
holds across SRAM and HBM (~−20% each).

Phase 1 test (test_intercube_root_center.py) asserts the
torus_2d 96 KB latency drops below 20.5 µs and that all 96
cubes still validate (correctness preserved).

Plot updates:
- overview.png: replace constant 10.6 µs theoretical line with
  user-supplied hand-derived curve (per-cube packet count =
  bytes_per_pe × 8 PEs ÷ 128 B; 1346 ns startup + 1.20 ns/pkt).
- All summary.csv numbers and per-topology PNGs regenerated.
- pe2pe_latency_plots and ipcq diagram emitter PNGs refreshed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-27 21:28:58 -07:00
parent 84a1325e5c
commit 1c5752a9ec
16 changed files with 324 additions and 157 deletions
+40 -54
View File
@@ -289,7 +289,8 @@ _SWEEP_TOPOLOGIES = [
# parametrized invocation writes one JSON file here; the aggregator
# (run from conftest.pytest_sessionfinish) reads them and emits the
# combined CSV + PNG plots.
_SWEEP_OUT_DIR = Path(__file__).parent / "allreduce_latency_plots"
_SWEEP_OUT_DIR = (Path(__file__).parent.parent / "docs" / "diagrams"
/ "allreduce_latency_plots")
_SWEEP_ROWS_DIR = _SWEEP_OUT_DIR / "_rows"
@@ -447,7 +448,7 @@ def _aggregate_sweep_plots() -> bool:
ax.plot(xs, ys, marker="o", color="tab:blue")
ax.set_xscale("log", base=2)
ax.set_xlabel("Bytes per PE (log scale)")
ax.set_ylabel("max pe_exec_ns (critical path)")
ax.set_ylabel("Time (ns)")
ax.set_title(title)
ax.grid(True, alpha=0.3)
ax.xaxis.set_major_formatter(_bytes_fmt)
@@ -457,7 +458,28 @@ def _aggregate_sweep_plots() -> bool:
colors = {"ring_1d": "tab:blue", "torus_2d": "tab:orange",
"mesh_2d_no_wrap": "tab:green"}
THEORETICAL_TORUS_2D_6SIP_NS = 10600.0
# ── Hand-derived theoretical model for torus_2d (6 SIPs) ──
# Critical-path analysis (per packet, packet = 128 B at NoC):
# local intra-SIP reduce + broadcast = 8 hops × 57 ns = 456 ns
# global X-direction reduce = 5 UCIe + 1 UAL = 445 ns
# global Y-direction reduce = 5 UCIe + 1 UAL = 445 ns
# per-packet startup latency = 456 + 445 + 445 = 1346 ns
# Packet count is PER CUBE (8 PEs/cube cooperate on the cube tile).
# At 6144 packets/cube the pipelined total is 8741 ns, so the
# bottleneck-stage interval τ = (8741 1346) / (6144 1) ≈ 1.204 ns.
# T_theoretical(N) = 1346 + (N 1) × τ
# where N = ceil((bytes_per_pe × 8) / 128) = ceil(bytes_per_pe / 16)
NOC_PACKET_BYTES = 128
PES_PER_CUBE = 8
T_STARTUP_NS = 1346.0
TAU_NS = (8741.0 - 1346.0) / (6144 - 1) # ≈ 1.2038 ns/packet
def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES)) # ceil
return T_STARTUP_NS + (n_packets - 1) * TAU_NS
fig, ax = plt.subplots(figsize=(9, 6))
for topo_name in topologies:
rs = sorted(
@@ -473,64 +495,28 @@ def _aggregate_sweep_plots() -> bool:
label=f"{topo_name} (n_sips={rs[0]['n_sips']})",
color=colors.get(topo_name),
)
ax.axhline(
y=THEORETICAL_TORUS_2D_6SIP_NS,
color="tab:red", linestyle="--", linewidth=1.5,
label=f"theoretical torus_2d (6 SIPs) = "
f"{THEORETICAL_TORUS_2D_6SIP_NS:.0f} ns",
# Theoretical torus_2d curve across all payload sizes.
torus_rs = sorted(
[r for r in records if r["sip_topology"] == "torus_2d"],
key=lambda r: r["bytes_per_pe"],
)
BYTES_96KB = 96 * 1024
ax.axvline(
x=BYTES_96KB, ymin=0, ymax=1,
color="tab:red", linestyle=":", linewidth=1.2,
)
ax.plot(
[BYTES_96KB], [THEORETICAL_TORUS_2D_6SIP_NS],
marker="x", color="tab:red", markersize=10, markeredgewidth=2,
)
# Find simulated torus_2d latency at 96 KB (if present) for direct
# comparison with the theoretical value.
sim_torus_at_96kb = next(
(r["latency_ns"] for r in records
if r["sip_topology"] == "torus_2d" and r["bytes_per_pe"] == BYTES_96KB),
None,
)
if sim_torus_at_96kb is not None:
if torus_rs:
xs_th = [r["bytes_per_pe"] for r in torus_rs]
ys_th = [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs]
ax.plot(
[BYTES_96KB], [sim_torus_at_96kb],
marker="o", color="tab:orange",
markersize=10, markeredgecolor="black", markeredgewidth=1.2,
)
ax.annotate(
f"96 KB\n"
f"theoretical = {THEORETICAL_TORUS_2D_6SIP_NS:.0f} ns\n"
f"simulated = {sim_torus_at_96kb:.0f} ns",
xy=(BYTES_96KB, sim_torus_at_96kb),
xytext=(10, -20), textcoords="offset points",
color="tab:red", fontsize=9,
)
else:
ax.annotate(
f"96 KB\n→ theoretical {THEORETICAL_TORUS_2D_6SIP_NS:.0f} ns",
xy=(BYTES_96KB, THEORETICAL_TORUS_2D_6SIP_NS),
xytext=(8, -20), textcoords="offset points",
color="tab:red", fontsize=9,
xs_th, ys_th,
color="tab:red", linestyle="--", linewidth=1.6, marker="x",
label="theoretical torus_2d (6 SIPs)",
)
ax.set_xscale("log", base=2)
ax.set_xlabel("Bytes per PE (log scale)")
ax.set_ylabel("max pe_exec_ns (critical path)")
ax.set_ylabel("Time (ns)")
ax.set_title("Multi-device allreduce latency by topology")
ax.grid(True, alpha=0.3)
# Drop 128 KB tick (overlaps visually with the explicit 96 KB marker)
# and add 96 KB.
BYTES_128KB = 128 * 1024
existing_ticks = [t for t in ax.get_xticks() if int(t) != BYTES_128KB]
if BYTES_96KB not in existing_ticks:
existing_ticks.append(BYTES_96KB)
ax.set_xticks(sorted(existing_ticks))
ax.set_xlim(left=min(r["bytes_per_pe"] for r in records) / 2,
right=BYTES_96KB * 1.5)
right=max(r["bytes_per_pe"] for r in records) * 1.5)
ax.legend()
ax.xaxis.set_major_formatter(_bytes_fmt)
fig.tight_layout()
@@ -811,7 +797,7 @@ def _draw_cube_reduction(ax):
def emit_topology_diagram() -> str:
"""Emit a 2×2-panel topology diagram into allreduce_latency_plots/.
"""Emit a 2×2-panel topology diagram into docs/diagrams/allreduce_latency_plots/.
Top row: ring_1d | torus_2d (2×3)
Bot row: mesh_2d_no_wrap (2×3) | cube-level reduction in SIP 0