sccl: drive allreduce tests via torch.distributed; reorganize into tests/sccl/

Convert the multidevice allreduce correctness + latency/buffer-kind sweeps
to run through the real PyTorch-distributed path
(init_process_group(backend="ahbm") -> mp.spawn -> dist.all_reduce) instead
of direct ctx.launch, and reorganize the CCL/allreduce tests into a
tests/sccl/ package split one test per file.

Production change (required for the distributed path on non-square SIP grids):
- AhbmCCLBackend now reads explicit system.sips.w/h from the spec, with a
  square-only sqrt fallback that raises on ambiguity, instead of silently
  guessing round(sqrt(count)). This fixes the 2x3 / 3x2 torus + mesh cases,
  which previously resolved to a wrong 2x2 grid. Mirrors the test helper's
  _sip_topo_dims precedence (explicit w/h > square fallback > raise).

Test reorganization (tests/sccl/):
- _allreduce_helpers.py: shared plumbing (distributed driver, config writers,
  direct-launch run_allreduce parity reference, sweep/buffer-kind constants,
  plot aggregators, topology-diagram + FSIM-comparison emitters).
- test_allreduce_ring_torus_mesh.py: correctness across ring/torus/mesh.
- test_distributed_default_topology.py: full distributed path on topology.yaml.
- test_plot_latency_sweep.py / test_plot_buffer_kind_sweep.py: sweep rows.
- test_plot_topology_diagram.py / test_plot_comparison_fsim.py: plot emitters.
- test_intercube_root_center.py: moved in (ADR-0032 center-root latency guard).

Also:
- Move the FSIM comparison plot generator out of scripts/ into the sccl suite.
- Delete superseded test files (test_allreduce_multidevice,
  test_distributed_lrab_hierarchical_allreduce, test_allreduce_buffer_kind_sweep)
  and repoint conftest aggregators + the ipcq buffer-kind importers.
- Regenerate the allreduce_latency_plots derived artifacts from the full sweep.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-20 22:24:43 -07:00
parent ff7d727ddd
commit b610cb0d9a
22 changed files with 745 additions and 759 deletions
-176
View File
@@ -1,176 +0,0 @@
"""One-shot: render the broken-y-axis allreduce comparison with the FSIM
single-device reference. Reads docs/diagrams/allreduce_latency_plots/summary.csv
and writes comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png
alongside it.
This is a derived-artifact generator (per CLAUDE.md): plotting only, no production
or test logic touched.
"""
from __future__ import annotations
import csv
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
ROOT = Path(__file__).resolve().parent.parent
PLOT_DIR = ROOT / "docs" / "diagrams" / "allreduce_latency_plots"
CSV_PATH = PLOT_DIR / "summary.csv"
EXT_LABEL = "FSIM (single device): 366 µs"
EXT_LATENCY_NS = 366_000.0
COLORS = {
"ring_1d": "tab:blue",
"torus_2d": "tab:orange",
"mesh_2d_no_wrap": "tab:green",
}
# Display labels (data keys above stay as the summary.csv sip_topology
# values; these are only the human-readable legend strings). All non-FSIM
# runs use 6 devices; the grid differs per topology.
DISPLAY = {
"ring_1d": "Ring 1x6 (6 devices)",
"torus_2d": "2D Torus 2x3 (6 devices)",
"mesh_2d_no_wrap": "2D Mesh 2x3 (6 devices)",
}
# Hand-derived theoretical model for torus_2d (6 SIPs). Mirrors
# _aggregate_sweep_plots in tests/test_allreduce_multidevice.py.
NOC_PACKET_BYTES = 128
PES_PER_CUBE = 8
T_STARTUP_NS = 1346.0
TAU_NS = (8741.0 - 1346.0) / (6144 - 1)
def _theoretical_torus_2d_ns(bytes_per_pe: int) -> float:
bytes_per_cube = int(bytes_per_pe) * PES_PER_CUBE
n_packets = max(1, -(-bytes_per_cube // NOC_PACKET_BYTES))
return T_STARTUP_NS + (n_packets - 1) * TAU_NS
def _plot_theoretical(ax, records):
torus_rs = sorted(
[r for r in records if r["sip_topology"] == "torus_2d"],
key=lambda r: r["bytes_per_pe"],
)
if not torus_rs:
return
ax.plot(
[r["bytes_per_pe"] for r in torus_rs],
[_theoretical_torus_2d_ns(r["bytes_per_pe"]) for r in torus_rs],
color="tab:red", linestyle="--", linewidth=1.6, marker="x",
label="Theoretical 2D Torus 2x3",
)
def _bytes_fmt(x, _pos):
if x >= 1024 * 1024:
return f"{x / (1024 * 1024):.0f}M"
if x >= 1024:
return f"{x / 1024:.0f}K"
return f"{int(x)}"
def _load_records():
rows = []
with open(CSV_PATH, newline="") as f:
r = csv.DictReader(f)
for row in r:
rows.append({
"sip_topology": row["sip_topology"],
"bytes_per_pe": int(row["bytes_per_pe"]),
"latency_ns": float(row["latency_ns"]),
})
return rows
def _ext_x(records):
"""Anchor the external reference at the largest payload (96 KB / PE)."""
return max(r["bytes_per_pe"] for r in records)
def _plot_curves(ax, records, topologies):
for topo in topologies:
rs = sorted([r for r in records if r["sip_topology"] == topo],
key=lambda r: r["bytes_per_pe"])
if not rs:
continue
ax.plot(
[r["bytes_per_pe"] for r in rs],
[r["latency_ns"] for r in rs],
marker="o",
label=DISPLAY.get(topo, topo),
color=COLORS.get(topo),
)
def emit_broken(records):
topologies = sorted({r["sip_topology"] for r in records})
max_local = max(r["latency_ns"] for r in records)
fig, (ax_top, ax_bot) = plt.subplots(
2, 1, sharex=True,
gridspec_kw={"height_ratios": [1, 4], "hspace": 0.05},
figsize=(9, 6.5),
)
# Bottom panel: today's three curves + theoretical, linear y.
_plot_curves(ax_bot, records, topologies)
_plot_theoretical(ax_bot, records)
ax_bot.set_ylim(0, max_local * 1.10)
# Top panel: only the external reference marker, linear y around 366 µs.
ax_top.scatter(
[_ext_x(records)], [EXT_LATENCY_NS],
marker="*", s=240, color="tab:red", zorder=5,
label=EXT_LABEL,
)
ax_top.set_ylim(EXT_LATENCY_NS * 0.93, EXT_LATENCY_NS * 1.05)
# Hide the spine between the two panels and draw diagonal "break" ticks.
ax_top.spines["bottom"].set_visible(False)
ax_bot.spines["top"].set_visible(False)
ax_top.tick_params(labeltop=False, bottom=False)
ax_bot.xaxis.tick_bottom()
d = 0.012 # diagonal-tick size, in axis-fraction
kw = dict(transform=ax_top.transAxes, color="k", clip_on=False, lw=1)
ax_top.plot((-d, +d), (-d, +d), **kw)
ax_top.plot((1 - d, 1 + d), (-d, +d), **kw)
kw.update(transform=ax_bot.transAxes)
ax_bot.plot((-d, +d), (1 - d * 4, 1 + d * 4), **kw)
ax_bot.plot((1 - d, 1 + d), (1 - d * 4, 1 + d * 4), **kw)
ax_bot.set_xscale("log", base=2)
ax_bot.set_xlabel("Bytes per PE (log scale)")
ax_bot.set_ylabel("Time (ns)")
ax_top.set_ylabel("Time (ns)")
ax_bot.grid(True, alpha=0.3)
ax_top.grid(True, alpha=0.3)
ax_bot.xaxis.set_major_formatter(mticker.FuncFormatter(_bytes_fmt))
# One legend covering both axes.
handles_bot, labels_bot = ax_bot.get_legend_handles_labels()
handles_top, labels_top = ax_top.get_legend_handles_labels()
ax_bot.legend(handles_bot + handles_top, labels_bot + labels_top,
loc="upper left")
fig.suptitle("Multidevice allreduce (ring, Mesh, 2DTorus) vs FSIM latency")
fig.tight_layout()
out = PLOT_DIR / "comparison_mesh_vs_ring_vs_2DTorus_vs_theoretical_vs_fsim.png"
fig.savefig(out, dpi=120)
plt.close(fig)
print(f"wrote {out}")
def main():
records = _load_records()
if not records:
raise SystemExit(f"no rows in {CSV_PATH}")
emit_broken(records)
if __name__ == "__main__":
main()