CCL allreduce: rename to lrab_hierarchical_allreduce + descriptive plots
Rename the intercube all-reduce identity to lrab_hierarchical_allreduce (module, config key, distributed test) so the name reflects both levels it implements: LRAB intra-SIP (local reduce to center root + broadcast) and the hierarchical inter-SIP topology exchange (ring/torus/mesh). The ADR-0032 slug is kept as the stable decision id; this is a pure rename with no logic change.

Also in this batch:
- ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce (the doc was stale and still described corner-root); annotate ccl.yaml root_cube as a placeholder.
- Rename the allreduce + pe2pe latency plots to descriptive, title-matching filenames and retitle the in-plot headings; drop overview/overview_log.
- Point the PPTX image refs at the new plot names.

Doc + derived-artifact + rename only; no simulation behavior changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,8 +4,8 @@ Slides:
|
||||
1. Overall architecture — how PEs are connected (cube_mesh_view)
|
||||
2. Model correctness — DMA vs P2P latency (pe2pe overview)
|
||||
3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
|
||||
4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken)
|
||||
5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep)
|
||||
4. 6-device allreduce — model vs theoretical vs FSIM (comparison_…_fsim)
|
||||
5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (…_with_TCM_SRAM_HBM)
|
||||
6. PE_accelerator data path (composite GEMM pipeline structure)
|
||||
7. matmul(32, 128, 32) — composite GEMM execution sequence
|
||||
8. matmul(32, 128, 128) — pipeline scaling and HBM contention
|
||||
@@ -63,7 +63,7 @@ SLIDES = [
|
||||
},
|
||||
{
|
||||
"title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
|
||||
"image": DIAG / "allreduce_latency_plots" / "overview_broken.png",
|
||||
"image": DIAG / "allreduce_latency_plots" / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png",
|
||||
"bullets": [
|
||||
"Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
|
||||
"Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
|
||||
@@ -73,7 +73,7 @@ SLIDES = [
|
||||
},
|
||||
{
|
||||
"title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
|
||||
"image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png",
|
||||
"image": DIAG / "allreduce_latency_plots" / "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png",
|
||||
"bullets": [
|
||||
"Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
|
||||
"Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""One-shot: render overview.png with an external 366 µs reference, in two
|
||||
variants — log scale and broken y-axis. Reads docs/diagrams/allreduce_latency_plots/summary.csv
|
||||
and writes overview_log.png and overview_broken.png alongside it.
|
||||
"""One-shot: render the broken-y-axis allreduce comparison with the FSIM
|
||||
single-device reference. Reads docs/diagrams/allreduce_latency_plots/summary.csv
|
||||
and writes comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png
|
||||
alongside it.
|
||||
|
||||
This is a derived-artifact generator (per CLAUDE.md): plotting only, no production
|
||||
or test logic touched.
|
||||
@@ -17,7 +18,7 @@ ROOT = Path(__file__).resolve().parent.parent
|
||||
PLOT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots"
|
||||
CSV_PATH = PLOT_DIR / "summary.csv"
|
||||
|
||||
EXT_LABEL = "ext-sim single-device reduce: 366 µs"
|
||||
EXT_LABEL = "FSIM (single device): 366 µs"
|
||||
EXT_LATENCY_NS = 366_000.0
|
||||
|
||||
COLORS = {
|
||||
@@ -26,6 +27,15 @@ COLORS = {
|
||||
"mesh_2d_no_wrap": "tab:green",
|
||||
}
|
||||
|
||||
# Display labels (data keys above stay as the summary.csv sip_topology
|
||||
# values; these are only the human-readable legend strings). All non-FSIM
|
||||
# runs use 6 devices; the grid differs per topology.
|
||||
DISPLAY = {
|
||||
"ring_1d": "Ring 1x6 (6 devices)",
|
||||
"torus_2d": "2D Torus 2x3 (6 devices)",
|
||||
"mesh_2d_no_wrap": "2D Mesh 2x3 (6 devices)",
|
||||
}
|
||||
|
||||
# Hand-derived theoretical model for torus_2d (6 SIPs). Mirrors
|
||||
# _aggregate_sweep_plots in tests/test_allreduce_multidevice.py.
|
||||
NOC_PACKET_BYTES = 128
|
||||
@@ -51,7 +61,7 @@ def _plot_theoretical(ax, records):
|
||||
[r["bytes_per_pe"] for r in torus_rs],
|
||||
[_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs],
|
||||
color="tab:red", linestyle="--", linewidth=1.6, marker="x",
|
||||
label="theoretical torus_2d (6 SIPs)",
|
||||
label="Theoretical 2D Torus 2x3",
|
||||
)
|
||||
|
||||
|
||||
@@ -91,36 +101,11 @@ def _plot_curves(ax, records, topologies):
|
||||
[r["bytes_per_pe"] for r in rs],
|
||||
[r["latency_ns"] for r in rs],
|
||||
marker="o",
|
||||
label=f"{topo}",
|
||||
label=DISPLAY.get(topo, topo),
|
||||
color=COLORS.get(topo),
|
||||
)
|
||||
|
||||
|
||||
def emit_log(records):
|
||||
topologies = sorted({r["sip_topology"] for r in records})
|
||||
fig, ax = plt.subplots(figsize=(9, 6))
|
||||
_plot_curves(ax, records, topologies)
|
||||
_plot_theoretical(ax, records)
|
||||
ax.scatter(
|
||||
[_ext_x(records)], [EXT_LATENCY_NS],
|
||||
marker="*", s=220, color="tab:red", zorder=5,
|
||||
label=EXT_LABEL,
|
||||
)
|
||||
ax.set_xscale("log", base=2)
|
||||
ax.set_yscale("log")
|
||||
ax.set_xlabel("Bytes per PE (log scale)")
|
||||
ax.set_ylabel("Time (ns) — log scale")
|
||||
ax.set_title("Multi-device allreduce latency vs external single-device reference")
|
||||
ax.grid(True, which="both", alpha=0.3)
|
||||
ax.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
|
||||
ax.legend(loc="upper left")
|
||||
fig.tight_layout()
|
||||
out = PLOT_DIR / "overview_log.png"
|
||||
fig.savefig(out, dpi=120)
|
||||
plt.close(fig)
|
||||
print(f"wrote {out}")
|
||||
|
||||
|
||||
def emit_broken(records):
|
||||
topologies = sorted({r["sip_topology"] for r in records})
|
||||
max_local = max(r["latency_ns"] for r in records)
|
||||
@@ -172,9 +157,9 @@ def emit_broken(records):
|
||||
ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top,
|
||||
loc="upper left")
|
||||
|
||||
fig.suptitle("Multi-device allreduce latency vs external single-device reference (broken y-axis)")
|
||||
fig.suptitle("Multidevice allreduce (ring, Mesh, 2DTorus) vs FSIM latency")
|
||||
fig.tight_layout()
|
||||
out = PLOT_DIR / "overview_broken.png"
|
||||
out = PLOT_DIR / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png"
|
||||
fig.savefig(out, dpi=120)
|
||||
plt.close(fig)
|
||||
print(f"wrote {out}")
|
||||
@@ -184,7 +169,6 @@ def main():
|
||||
records = _load_records()
|
||||
if not records:
|
||||
raise SystemExit(f"no rows in {CSV_PATH}")
|
||||
emit_log(records)
|
||||
emit_broken(records)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user