sccl: drive allreduce tests via torch.distributed; reorganize into tests/sccl/

Convert the multidevice allreduce correctness + latency/buffer-kind sweeps to run through the real PyTorch-distributed path (init_process_group(backend="ahbm") -> mp.spawn -> dist.all_reduce) instead of direct ctx.launch, and reorganize the CCL/allreduce tests into a tests/sccl/ package split one test per file.

Production change (required for the distributed path on non-square SIP grids):
- AhbmCCLBackend now reads explicit system.sips.w/h from the spec, with a square-only sqrt fallback that raises on ambiguity, instead of silently guessing round(sqrt(count)). This fixes the 2x3 / 3x2 torus + mesh cases, which previously resolved to a wrong 2x2 grid. Mirrors the test helper's _sip_topo_dims precedence (explicit w/h > square fallback > raise).

Test reorganization (tests/sccl/):
- _allreduce_helpers.py: shared plumbing (distributed driver, config writers, direct-launch run_allreduce parity reference, sweep/buffer-kind constants, plot aggregators, topology-diagram + FSIM-comparison emitters).
- test_allreduce_ring_torus_mesh.py: correctness across ring/torus/mesh.
- test_distributed_default_topology.py: full distributed path on topology.yaml.
- test_plot_latency_sweep.py / test_plot_buffer_kind_sweep.py: sweep rows.
- test_plot_topology_diagram.py / test_plot_comparison_fsim.py: plot emitters.
- test_intercube_root_center.py: moved in (ADR-0032 center-root latency guard).

Also:
- Move the FSIM comparison plot generator out of scripts/ into the sccl suite.
- Delete superseded test files (test_allreduce_multidevice, test_distributed_lrab_hierarchical_allreduce, test_allreduce_buffer_kind_sweep) and repoint conftest aggregators + the ipcq buffer-kind importers.
- Regenerate the allreduce_latency_plots derived artifacts from the full sweep.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 22:24:43 -07:00
parent ff7d727ddd
commit b610cb0d9a
22 changed files with 745 additions and 759 deletions
@@ -1,176 +0,0 @@
-"""One-shot: render the broken-y-axis allreduce comparison with the FSIM
-single-device reference. Reads docs/diagrams/allreduce_latency_plots/summary.csv
-and writes comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png
-alongside it.
-
-This is a derived-artifact generator (per CLAUDE.md): plotting only, no production
-or test logic touched.
-"""
-from __future__ import annotations
-
-import csv
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import matplotlib.ticker as mticker
-
-ROOT = Path(__file__).resolve().parent.parent
-PLOT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots"
-CSV_PATH = PLOT_DIR / "summary.csv"
-
-EXT_LABEL = "FSIM (single device): 366 µs"
-EXT_LATENCY_NS = 366_000.0
-
-COLORS = {
-    "ring_1d": "tab:blue",
-    "torus_2d": "tab:orange",
-    "mesh_2d_no_wrap": "tab:green",
-}
-
-# Display labels (data keys above stay as the summary.csv sip_topology
-# values; these are only the human-readable legend strings). All non-FSIM
-# runs use 6 devices; the grid differs per topology.
-DISPLAY = {
-    "ring_1d": "Ring 1x6 (6 devices)",
-    "torus_2d": "2D Torus 2x3 (6 devices)",
-    "mesh_2d_no_wrap": "2D Mesh 2x3 (6 devices)",
-}
-
-# Hand-derived theoretical model for torus_2d (6 SIPs). Mirrors
-# _aggregate_sweep_plots in tests/test_allreduce_multidevice.py.
-NOC_PACKET_BYTES = 128
-PES_PER_CUBE = 8
-T_STARTUP_NS = 1346.0
-TAU_NS = (8741.0 - 1346.0) / (6144 - 1)
-
-
-def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
-    bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
-    n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES))
-    return T_STARTUP_NS + (n_packets - 1) * TAU_NS
-
-
-def _plot_theoretical(ax, records):
-    torus_rs = sorted(
-        [r for r in records if r["sip_topology"] == "torus_2d"],
-        key=lambda r: r["bytes_per_pe"],
-    )
-    if not torus_rs:
-        return
-    ax.plot(
-        [r["bytes_per_pe"] for r in torus_rs],
-        [_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs],
-        color="tab:red", linestyle="--", linewidth=1.6, marker="x",
-        label="Theoretical 2D Torus 2x3",
-    )
-
-
-def _bytes_fmt(x, _pos):
-    if x >= 1024 * 1024:
-        return f"{x / (1024 * 1024):.0f}M"
-    if x >= 1024:
-        return f"{x / 1024:.0f}K"
-    return f"{int(x)}"
-
-
-def _load_records():
-    rows = []
-    with open(CSV_PATH, newline="") as f:
-        r = csv.DictReader(f)
-        for row in r:
-            rows.append({
-                "sip_topology": row["sip_topology"],
-                "bytes_per_pe": int(row["bytes_per_pe"]),
-                "latency_ns": float(row["latency_ns"]),
-            })
-    return rows
-
-
-def _ext_x(records):
-    """Anchor the external reference at the largest payload (96 KB / PE)."""
-    return max(r["bytes_per_pe"] for r in records)
-
-
-def _plot_curves(ax, records, topologies):
-    for topo in topologies:
-        rs = sorted([r for r in records if r["sip_topology"] == topo],
-                    key=lambda r: r["bytes_per_pe"])
-        if not rs:
-            continue
-        ax.plot(
-            [r["bytes_per_pe"] for r in rs],
-            [r["latency_ns"] for r in rs],
-            marker="o",
-            label=DISPLAY.get(topo, topo),
-            color=COLORS.get(topo),
-        )
-
-
-def emit_broken(records):
-    topologies = sorted({r["sip_topology"] for r in records})
-    max_local = max(r["latency_ns"] for r in records)
-
-    fig, (ax_top, ax_bot) = plt.subplots(
-        2, 1, sharex=True,
-        gridspec_kw={"height_ratios": [1, 4], "hspace": 0.05},
-        figsize=(9, 6.5),
-    )
-
-    # Bottom panel: today's three curves + theoretical, linear y.
-    _plot_curves(ax_bot, records, topologies)
-    _plot_theoretical(ax_bot, records)
-    ax_bot.set_ylim(0, max_local * 1.10)
-
-    # Top panel: only the external reference marker, linear y around 366 µs.
-    ax_top.scatter(
-        [_ext_x(records)], [EXT_LATENCY_NS],
-        marker="*", s=240, color="tab:red", zorder=5,
-        label=EXT_LABEL,
-    )
-    ax_top.set_ylim(EXT_LATENCY_NS * 0.93, EXT_LATENCY_NS * 1.05)
-
-    # Hide the spine between the two panels and draw diagonal "break" ticks.
-    ax_top.spines["bottom"].set_visible(False)
-    ax_bot.spines["top"].set_visible(False)
-    ax_top.tick_params(labeltop=False, bottom=False)
-    ax_bot.xaxis.tick_bottom()
-
-    d = 0.012  # diagonal-tick size, in axis-fraction
-    kw = dict(transform=ax_top.transAxes, color="k", clip_on=False, lw=1)
-    ax_top.plot((-d, +d), (-d, +d), **kw)
-    ax_top.plot((1 - d, 1 + d), (-d, +d), **kw)
-    kw.update(transform=ax_bot.transAxes)
-    ax_bot.plot((-d, +d), (1 - d * 4, 1 + d * 4), **kw)
-    ax_bot.plot((1 - d, 1 + d), (1 - d * 4, 1 + d * 4), **kw)
-
-    ax_bot.set_xscale("log", base=2)
-    ax_bot.set_xlabel("Bytes per PE (log scale)")
-    ax_bot.set_ylabel("Time (ns)")
-    ax_top.set_ylabel("Time (ns)")
-    ax_bot.grid(True, alpha=0.3)
-    ax_top.grid(True, alpha=0.3)
-    ax_bot.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
-
-    # One legend covering both axes.
-    handles_bot, labels_bot = ax_bot.get_legend_handles_labels()
-    handles_top, labels_top = ax_top.get_legend_handles_labels()
-    ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top,
-                  loc="upper left")
-
-    fig.suptitle("Multidevice allreduce (ring, Mesh, 2DTorus) vs FSIM latency")
-    fig.tight_layout()
-    out = PLOT_DIR / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png"
-    fig.savefig(out, dpi=120)
-    plt.close(fig)
-    print(f"wrote {out}")
-
-
-def main():
-    records = _load_records()
-    if not records:
-        raise SystemExit(f"no rows in {CSV_PATH}")
-    emit_broken(records)
-
-
-if __name__ == "__main__":
-    main()