CCL allreduce: rename to lrab_hierarchical_allreduce + descriptive plots

Rename the intercube all-reduce identity to lrab_hierarchical_allreduce
(module, config key, distributed test) so the name reflects both levels
it implements: LRAB intra-SIP (local reduce to center root + broadcast)
and the hierarchical inter-SIP topology exchange (ring/torus/mesh).
ADR-0032 slug kept as the stable decision id; pure rename, no logic change.

Also in this batch:
- ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce
  (the doc still described the stale corner-root scheme); annotate the
  ccl.yaml root_cube key as a placeholder.
- Rename allreduce + pe2pe latency plots to descriptive, title-matching
  filenames and retitle the in-plot headings; drop overview/overview_log.
- Point the PPTX image refs at the new plot names.

Doc + derived-artifact + rename only; no simulation behavior changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 20:50:48 -07:00
parent e77e4a1703
commit ff7d727ddd
38 changed files with 259 additions and 272 deletions
+28 -77
View File
@@ -189,15 +189,15 @@ TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
CONFIGS = [
pytest.param(
"intercube_allreduce", "ring_1d", 6, None, None,
"lrab_hierarchical_allreduce", "ring_1d", 6, None, None,
id="ring_6sip",
),
pytest.param(
"intercube_allreduce", "torus_2d", 6, 2, 3,
"lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3,
id="torus_6sip_2x3",
),
pytest.param(
"intercube_allreduce", "mesh_2d_no_wrap", 6, 2, 3,
"lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3,
id="mesh_6sip_2x3",
),
]
@@ -280,9 +280,9 @@ _SWEEP_N_ELEM = [
_ELEM_BYTES_F16 = 2
_SWEEP_TOPOLOGIES = [
("intercube_allreduce", "ring_1d", 6, None, None),
("intercube_allreduce", "torus_2d", 6, 2, 3),
("intercube_allreduce", "mesh_2d_no_wrap", 6, 2, 3),
("lrab_hierarchical_allreduce", "ring_1d", 6, None, None),
("lrab_hierarchical_allreduce", "torus_2d", 6, 2, 3),
("lrab_hierarchical_allreduce", "mesh_2d_no_wrap", 6, 2, 3),
]
# Shared on-disk staging dir for parametrized sweep rows. Each
@@ -440,10 +440,22 @@ def _aggregate_sweep_plots() -> bool:
continue
xs = [r["bytes_per_pe"] for r in rs]
ys = [r["latency_ns"] for r in rs]
title = (
f"Allreduce latency — {topo_name} "
f"(n_sips={rs[0]['n_sips']})"
_per_topo_titles = {
"ring_1d": "AllReduce_LRAB_Ring1D_6SiP(1x6)",
"torus_2d": "AllReduce_LRAB_2Dtorus_6SiP(2x3)",
"mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP(2x3)",
}
# Descriptive output filenames (parens → underscores for
# markdown/URL safety; topo key stays the summary.csv value).
_per_topo_files = {
"ring_1d": "AllReduce_LRAB_Ring1D_6SiP_1x6",
"torus_2d": "AllReduce_LRAB_2Dtorus_6SiP_2x3",
"mesh_2d_no_wrap": "AllReduce_LRAB_2DMesh_6SiP_2x3",
}
title = _per_topo_titles.get(
topo_name, f"Allreduce latency — {topo_name}"
)
out_stem = _per_topo_files.get(topo_name, topo_name)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(xs, ys, marker="o", color="tab:blue")
ax.set_xscale("log", base=2)
@@ -453,75 +465,14 @@ def _aggregate_sweep_plots() -> bool:
ax.grid(True, alpha=0.3)
ax.xaxis.set_major_formatter(_bytes_fmt)
fig.tight_layout()
fig.savefig(_SWEEP_OUT_DIR / f"{topo_name}.png", dpi=120)
fig.savefig(_SWEEP_OUT_DIR / f"{out_stem}.png", dpi=120)
plt.close(fig)
colors = {"ring_1d": "tab:blue", "torus_2d": "tab:orange",
"mesh_2d_no_wrap": "tab:green"}
# ── Hand-derived theoretical model for torus_2d (6 SIPs) ──
# Critical-path analysis (per packet, packet = 128 B at NoC):
# local intra-SIP reduce + broadcast = 8 hops × 57 ns = 456 ns
# global X-direction reduce = 5 UCIe + 1 UAL = 445 ns
# global Y-direction reduce = 5 UCIe + 1 UAL = 445 ns
# per-packet startup latency = 456 + 445 + 445 = 1346 ns
# Packet count is PER CUBE (8 PEs/cube cooperate on the cube tile).
# At 6144 packets/cube the pipelined total is 8741 ns, so the
# bottleneck-stage interval τ = (8741 − 1346) / (6144 − 1) ≈ 1.204 ns.
# T_theoretical(N) = 1346 + (N − 1) × τ
# where N = ceil((bytes_per_pe × 8) / 128) = ceil(bytes_per_pe / 16)
NOC_PACKET_BYTES = 128
PES_PER_CUBE = 8
T_STARTUP_NS = 1346.0
TAU_NS = (8741.0 - 1346.0) / (6144 - 1) # ≈ 1.2038 ns/packet
def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES)) # ceil
return T_STARTUP_NS + (n_packets - 1) * TAU_NS
fig, ax = plt.subplots(figsize=(9, 6))
for topo_name in topologies:
rs = sorted(
[r for r in records if r["sip_topology"] == topo_name],
key=lambda r: r["bytes_per_pe"],
)
if not rs:
continue
ax.plot(
[r["bytes_per_pe"] for r in rs],
[r["latency_ns"] for r in rs],
marker="o",
label=f"{topo_name} (n_sips={rs[0]['n_sips']})",
color=colors.get(topo_name),
)
# Theoretical torus_2d curve across all payload sizes.
torus_rs = sorted(
[r for r in records if r["sip_topology"] == "torus_2d"],
key=lambda r: r["bytes_per_pe"],
)
if torus_rs:
xs_th = [r["bytes_per_pe"] for r in torus_rs]
ys_th = [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs]
ax.plot(
xs_th, ys_th,
color="tab:red", linestyle="--", linewidth=1.6, marker="x",
label="theoretical torus_2d (6 SIPs)",
)
ax.set_xscale("log", base=2)
ax.set_xlabel("Bytes per PE (log scale)")
ax.set_ylabel("Time (ns)")
ax.set_title("Multi-device allreduce latency by topology")
ax.grid(True, alpha=0.3)
ax.set_xlim(left=min(r["bytes_per_pe"] for r in records) / 2,
right=max(r["bytes_per_pe"] for r in records) * 1.5)
ax.legend()
ax.xaxis.set_major_formatter(_bytes_fmt)
fig.tight_layout()
fig.savefig(_SWEEP_OUT_DIR / "overview.png", dpi=120)
plt.close(fig)
# Combined overview.png is no longer emitted — the broken-y-axis
# comparison (scripts/emit_overview_with_external_ref.py →
# comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png)
# supersedes it. Per-topology plots above and summary.csv are still
# produced.
# Cleanup row staging dir so a partial future run doesn't pick up
# stale rows.
@@ -535,7 +486,7 @@ def _aggregate_sweep_plots() -> bool:
except OSError:
pass
print(f"\nWrote {_SWEEP_OUT_DIR / 'overview.png'} "
print(f"\nWrote per-topology plots + summary.csv to {_SWEEP_OUT_DIR} "
f"from {len(records)} rows")
return True