CCL allreduce: rename to lrab_hierarchical_allreduce + descriptive plots

Rename the intercube all-reduce identity to lrab_hierarchical_allreduce (module, config key, distributed test) so the name reflects both levels it implements: LRAB intra-SIP (local reduce to center root + broadcast) and the hierarchical inter-SIP topology exchange (ring/torus/mesh). ADR-0032 slug kept as the stable decision id; pure rename, no logic change.

Also in this batch:
- ADR-0032 (EN+KO): document the shipped center-root bidirectional reduce (doc was stale corner-root); annotate ccl.yaml root_cube as a placeholder.
- Rename allreduce + pe2pe latency plots to descriptive, title-matching filenames and retitle the in-plot headings; drop overview/overview_log.
- Point the PPTX image refs at the new plot names.

Doc + derived-artifact + rename only; no simulation behavior changed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 20:50:48 -07:00
parent e77e4a1703
commit ff7d727ddd
38 changed files with 259 additions and 272 deletions
@@ -4,8 +4,8 @@ Slides:
  1. Overall architecture — how PEs are connected (cube_mesh_view)
  2. Model correctness — DMA vs P2P latency (pe2pe overview)
  3. PE-to-PE IPCQ communication (ipcq_two_pe_dma)
-  4. 6-device allreduce — model vs theoretical vs ext-sim (overview_broken)
-  5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (buffer_kind_sweep)
+  4. 6-device allreduce — model vs theoretical vs FSIM (comparison_…_fsim)
+  5. IPCQ buffer-kind sweep — TCM vs SRAM vs HBM (…_with_TCM_SRAM_HBM)
  6. PE_accelerator data path (composite GEMM pipeline structure)
  7. matmul(32, 128, 32) — composite GEMM execution sequence
  8. matmul(32, 128, 128) — pipeline scaling and HBM contention
@@ -63,7 +63,7 @@ SLIDES = [
    },
    {
        "title": "4. 6-Device Allreduce: Model vs Theoretical vs External Simulator",
-        "image": DIAG / "allreduce_latency_plots" / "overview_broken.png",
+        "image": DIAG / "allreduce_latency_plots" / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png",
        "bullets": [
            "Three SIP topologies (ring / torus / mesh) swept 16 B → 96 KB per PE",
            "Dashed red curve: hand-derived theoretical model for torus_2d (6 SIPs)",
@@ -73,7 +73,7 @@ SLIDES = [
    },
    {
        "title": "5. IPCQ Slot Memory: TCM vs SRAM vs HBM",
-        "image": DIAG / "allreduce_latency_plots" / "buffer_kind_sweep.png",
+        "image": DIAG / "allreduce_latency_plots" / "AllReduce_LRAB_2Dtorus_6SiP_2x3_with_TCM_SRAM_HBM.png",
        "bullets": [
            "Same allreduce with slot memory swapped: TCM (per-PE local) / SRAM / HBM (cube-shared, behind router link)",
            "Cost = NoC drain + slot-IO + PE↔bank hop; only TCM skips the bank hop",
@@ -1,6 +1,7 @@
-"""One-shot: render overview.png with an external 366 µs reference, in two
-variants — log scale and broken y-axis. Reads docs/diagrams/allreduce_latency_plots/summary.csv
-and writes overview_log.png and overview_broken.png alongside it.
+"""One-shot: render the broken-y-axis allreduce comparison with the FSIM
+single-device reference. Reads docs/diagrams/allreduce_latency_plots/summary.csv
+and writes comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png
+alongside it.

 This is a derived-artifact generator (per CLAUDE.md): plotting only, no production
 or test logic touched.
@@ -17,7 +18,7 @@ ROOT = Path(__file__).resolve().parent.parent
 PLOT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots"
 CSV_PATH = PLOT_DIR / "summary.csv"

-EXT_LABEL = "ext-sim single-device reduce: 366 µs"
+EXT_LABEL = "FSIM (single device): 366 µs"
 EXT_LATENCY_NS = 366_000.0

 COLORS = {
@@ -26,6 +27,15 @@ COLORS = {
    "mesh_2d_no_wrap": "tab:green",
 }

+# Display labels (data keys above stay as the summary.csv sip_topology
+# values; these are only the human-readable legend strings). All non-FSIM
+# runs use 6 devices; the grid differs per topology.
+DISPLAY = {
+    "ring_1d": "Ring 1x6 (6 devices)",
+    "torus_2d": "2D Torus 2x3 (6 devices)",
+    "mesh_2d_no_wrap": "2D Mesh 2x3 (6 devices)",
+}
+
 # Hand-derived theoretical model for torus_2d (6 SIPs). Mirrors
 # _aggregate_sweep_plots in tests/test_allreduce_multidevice.py.
 NOC_PACKET_BYTES = 128
@@ -51,7 +61,7 @@ def _plot_theoretical(ax, records):
        [r["bytes_per_pe"] for r in torus_rs],
        [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs],
        color="tab:red", linestyle="--", linewidth=1.6, marker="x",
-        label="theoretical torus_2d (6 SIPs)",
+        label="Theoretical 2D Torus 2x3",
    )


@@ -91,36 +101,11 @@ def _plot_curves(ax, records, topologies):
            [r["bytes_per_pe"] for r in rs],
            [r["latency_ns"] for r in rs],
            marker="o",
-            label=f"{topo}",
+            label=DISPLAY.get(topo, topo),
            color=COLORS.get(topo),
        )


-def emit_log(records):
-    topologies = sorted({r["sip_topology"] for r in records})
-    fig, ax = plt.subplots(figsize=(9, 6))
-    _plot_curves(ax, records, topologies)
-    _plot_theoretical(ax, records)
-    ax.scatter(
-        [_ext_x(records)], [EXT_LATENCY_NS],
-        marker="*", s=220, color="tab:red", zorder=5,
-        label=EXT_LABEL,
-    )
-    ax.set_xscale("log", base=2)
-    ax.set_yscale("log")
-    ax.set_xlabel("Bytes per PE (log scale)")
-    ax.set_ylabel("Time (ns) — log scale")
-    ax.set_title("Multi-device allreduce latency vs external single-device reference")
-    ax.grid(True, which="both", alpha=0.3)
-    ax.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
-    ax.legend(loc="upper left")
-    fig.tight_layout()
-    out = PLOT_DIR / "overview_log.png"
-    fig.savefig(out, dpi=120)
-    plt.close(fig)
-    print(f"wrote {out}")
-
-
 def emit_broken(records):
    topologies = sorted({r["sip_topology"] for r in records})
    max_local = max(r["latency_ns"] for r in records)
@@ -172,9 +157,9 @@ def emit_broken(records):
    ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top,
                  loc="upper left")

-    fig.suptitle("Multi-device allreduce latency vs external single-device reference (broken y-axis)")
+    fig.suptitle("Multidevice allreduce (ring, Mesh, 2DTorus) vs FSIM latency")
    fig.tight_layout()
-    out = PLOT_DIR / "overview_broken.png"
+    out = PLOT_DIR / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png"
    fig.savefig(out, dpi=120)
    plt.close(fig)
    print(f"wrote {out}")
@@ -184,7 +169,6 @@ def main():
    records = _load_records()
    if not records:
        raise SystemExit(f"no rows in {CSV_PATH}")
-    emit_log(records)
    emit_broken(records)