Rectangular SIP topology + 6-device allreduce sweep

mesh_2d, torus_2d, and mesh_2d_no_wrap accept optional w, h kwargs; the sqrt fallback is preserved for square layouts (back-compat tests confirm 4-SIP and 9-SIP square configs still work). sfr_config reads system.sips.w/h from spec and threads dims through to the topology fn. test_allreduce_multidevice CONFIGS switched from the 2-SIP ring and 4-SIP square configs to 6 SIPs: ring_6sip (ring_1d), torus_6sip_2x3 (torus_2d), mesh_6sip_2x3 (mesh_2d_no_wrap). _write_temp_configs writes system.sips.w/h when both are supplied and removes any inherited w/h otherwise; run_allreduce reads them back from the spec and passes them to _sip_topo_dims, which raises ValueError if w*h != n_sips. Latency sweep loop also moved to 6-SIP layouts. Linear-scale plot variants dropped -- only log-scale *.png + summary.csv emitted. Plots in tests/allreduce_latency_plots regenerated. New tests/test_sip_topology_rectangular.py asserts neighbor correctness for 2x3 layouts and back-compat for square fallback.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 15:13:14 -07:00
parent c1a5cf3a2a
commit e9cc40f74d
9 changed files with 362 additions and 143 deletions
@@ -22,13 +22,23 @@ from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
 from kernbench.policy.placement.dp import DPPolicy


-def _sip_topo_dims(sip_topo: str, n_sips: int) -> tuple[int, int]:
+def _sip_topo_dims(
+    sip_topo: str, n_sips: int,
+    spec_w: int | None = None, spec_h: int | None = None,
+) -> tuple[int, int]:
    if sip_topo == "ring_1d":
        return (0, 0)
+    if spec_w is not None and spec_h is not None:
+        if spec_w * spec_h != n_sips:
+            raise ValueError(
+                f"sip layout {spec_w}x{spec_h} != n_sips ({n_sips})"
+            )
+        return (spec_w, spec_h)
    side = int(round(math.sqrt(n_sips)))
    if side * side != n_sips:
        raise ValueError(
-            f"SIP topology '{sip_topo}' requires square n_sips, got {n_sips}"
+            f"SIP topology '{sip_topo}' requires square n_sips or "
+            f"explicit w/h in spec, got {n_sips}"
        )
    return (side, side)

@@ -54,10 +64,13 @@ def run_allreduce(
    topo_name_to_kind = algo_module.TOPO_NAME_TO_KIND

    n_elem = int(cfg.get("n_elem", 8))
-    n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
-    sip_topo = str(
-        spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
-    )
+    sips_cfg = spec.get("system", {}).get("sips", {})
+    n_sips = int(sips_cfg.get("count", 1))
+    sip_topo = str(sips_cfg.get("topology", "ring_1d"))
+    spec_sip_w = sips_cfg.get("w")
+    spec_sip_h = sips_cfg.get("h")
+    spec_sip_w = int(spec_sip_w) if spec_sip_w is not None else None
+    spec_sip_h = int(spec_sip_h) if spec_sip_h is not None else None

    cm = spec["sip"]["cube_mesh"]
    cube_w = int(cm["w"])
@@ -65,7 +78,9 @@ def run_allreduce(
    n_cubes = cube_w * cube_h

    sip_topo_kind = topo_name_to_kind.get(sip_topo, 0)
-    sip_topo_w, sip_topo_h = _sip_topo_dims(sip_topo, n_sips)
+    sip_topo_w, sip_topo_h = _sip_topo_dims(
+        sip_topo, n_sips, spec_w=spec_sip_w, spec_h=spec_sip_h,
+    )

    algo_name = cfg.get("algorithm", "allreduce")
    print(f"\n{'=' * 60}")
@@ -173,20 +188,36 @@ from kernbench.topology.builder import resolve_topology
 TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"

 CONFIGS = [
-    pytest.param("intercube_allreduce", "ring_1d", 2, id="ring_2sip"),
-    pytest.param("intercube_allreduce", "torus_2d", 4, id="torus_4sip"),
-    pytest.param("intercube_allreduce", "mesh_2d_no_wrap", 4, id="mesh_4sip"),
+    pytest.param(
+        "intercube_allreduce", "ring_1d", 6, None, None,
+        id="ring_6sip",
+    ),
+    pytest.param(
+        "intercube_allreduce", "torus_2d", 6, 2, 3,
+        id="torus_6sip_2x3",
+    ),
+    pytest.param(
+        "intercube_allreduce", "mesh_2d_no_wrap", 6, 2, 3,
+        id="mesh_6sip_2x3",
+    ),
 ]


 def _write_temp_configs(
    tmp_path, sip_topology, n_sips, algorithm, n_elem_override=None,
+    sip_w=None, sip_h=None,
 ):
    """Write temp topology.yaml and ccl.yaml with the given overrides."""
    with open(TOPOLOGY_PATH) as f:
        topo_cfg = yaml.safe_load(f)
    topo_cfg["system"]["sips"]["count"] = n_sips
    topo_cfg["system"]["sips"]["topology"] = sip_topology
+    if sip_w is not None and sip_h is not None:
+        topo_cfg["system"]["sips"]["w"] = int(sip_w)
+        topo_cfg["system"]["sips"]["h"] = int(sip_h)
+    else:
+        topo_cfg["system"]["sips"].pop("w", None)
+        topo_cfg["system"]["sips"].pop("h", None)
    topo_path = tmp_path / "topology.yaml"
    with open(topo_path, "w") as f:
        yaml.dump(topo_cfg, f, default_flow_style=False)
@@ -211,10 +242,15 @@ def _write_temp_configs(
    return str(topo_path), str(tmp_ccl)


-@pytest.mark.parametrize("algorithm,sip_topology,n_sips", CONFIGS)
-def test_allreduce(tmp_path, algorithm, sip_topology, n_sips):
+@pytest.mark.parametrize(
+    "algorithm,sip_topology,n_sips,sip_w,sip_h", CONFIGS,
+)
+def test_allreduce(
+    tmp_path, algorithm, sip_topology, n_sips, sip_w, sip_h,
+):
    topo_path, ccl_path = _write_temp_configs(
        tmp_path, sip_topology, n_sips, algorithm,
+        sip_w=sip_w, sip_h=sip_h,
    )
    topo = resolve_topology(topo_path)
    engine = GraphEngine(topo.topology_obj, enable_data=True)
@@ -271,16 +307,17 @@ def test_allreduce_latency_sweep(tmp_path):
    records: list[dict] = []

    # Apples-to-apples: same n_sips across all three topologies.
-    for algorithm, sip_topology, n_sips in [
-        ("intercube_allreduce", "ring_1d", 4),
-        ("intercube_allreduce", "torus_2d", 4),
-        ("intercube_allreduce", "mesh_2d_no_wrap", 4),
+    for algorithm, sip_topology, n_sips, sip_w, sip_h in [
+        ("intercube_allreduce", "ring_1d", 6, None, None),
+        ("intercube_allreduce", "torus_2d", 6, 2, 3),
+        ("intercube_allreduce", "mesh_2d_no_wrap", 6, 2, 3),
    ]:
        for n_elem in _SWEEP_N_ELEM:
            sub = tmp_path / f"{sip_topology}_{n_elem}"
            sub.mkdir()
            topo_path, ccl_path = _write_temp_configs(
                sub, sip_topology, n_sips, algorithm,
+                sip_w=sip_w, sip_h=sip_h,
                n_elem_override=n_elem,
            )
            topo = resolve_topology(topo_path)
@@ -339,8 +376,7 @@ def test_allreduce_latency_sweep(tmp_path):
            w.writerow(r)

    topologies = sorted({r["sip_topology"] for r in records})
-    # Per-topology plots: log-scale + linear-scale side-by-side.
-    # X-axis = bytes per PE (per-message payload size).
+    # Per-topology plots, log-scale x-axis = bytes per PE.
    for topo_name in topologies:
        rs = sorted(
            [r for r in records if r["sip_topology"] == topo_name],
@@ -352,7 +388,6 @@ def test_allreduce_latency_sweep(tmp_path):
            f"Allreduce latency — {topo_name} "
            f"(n_sips={rs[0]['n_sips']})"
        )
-        # Log-scale
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(xs, ys, marker="o", color="tab:blue")
        ax.set_xscale("log", base=2)
@@ -364,58 +399,31 @@ def test_allreduce_latency_sweep(tmp_path):
        fig.tight_layout()
        fig.savefig(out_dir / f"{topo_name}.png", dpi=120)
        plt.close(fig)
-        # Linear-scale companion
-        fig, ax = plt.subplots(figsize=(8, 5))
-        ax.plot(xs, ys, marker="o", color="tab:blue")
-        ax.set_xlabel("Bytes per PE")
-        ax.set_ylabel("max pe_exec_ns (critical path)")
-        ax.set_title(title + " [linear scale]")
-        ax.grid(True, alpha=0.3)
-        ax.xaxis.set_major_formatter(_bytes_fmt)
-        fig.tight_layout()
-        fig.savefig(out_dir / f"{topo_name}_linear.png", dpi=120)
-        plt.close(fig)

-    # Combined overview — two variants: log-scale (overview.png) and
-    # linear-scale (overview_linear.png).
    colors = {"ring_1d": "tab:blue", "torus_2d": "tab:orange",
              "mesh_2d_no_wrap": "tab:green"}
+    fig, ax = plt.subplots(figsize=(9, 6))
+    for topo_name in topologies:
+        rs = sorted(
+            [r for r in records if r["sip_topology"] == topo_name],
+            key=lambda r: r["bytes_per_pe"],
+        )
+        ax.plot(
+            [r["bytes_per_pe"] for r in rs],
+            [r["latency_ns"] for r in rs],
+            marker="o",
+            label=f"{topo_name} (n_sips={rs[0]['n_sips']})",
+            color=colors.get(topo_name),
+        )
+    ax.set_xscale("log", base=2)
+    ax.set_xlabel("Bytes per PE (log scale)")
+    ax.set_ylabel("max pe_exec_ns (critical path)")
+    ax.set_title("Multi-device allreduce latency by topology")
+    ax.grid(True, alpha=0.3)
+    ax.legend()
+    ax.xaxis.set_major_formatter(_bytes_fmt)
+    fig.tight_layout()
+    fig.savefig(out_dir / "overview.png", dpi=120)
+    plt.close(fig)

-    def _draw_overview(log_x: bool, filename: str, title_suffix: str) -> None:
-        fig, ax = plt.subplots(figsize=(9, 6))
-        for topo_name in topologies:
-            rs = sorted(
-                [r for r in records if r["sip_topology"] == topo_name],
-                key=lambda r: r["bytes_per_pe"],
-            )
-            ax.plot(
-                [r["bytes_per_pe"] for r in rs],
-                [r["latency_ns"] for r in rs],
-                marker="o",
-                label=f"{topo_name} (n_sips={rs[0]['n_sips']})",
-                color=colors.get(topo_name),
-            )
-        if log_x:
-            ax.set_xscale("log", base=2)
-            ax.set_xlabel("Bytes per PE (log scale)")
-        else:
-            ax.set_xlabel("Bytes per PE")
-        ax.set_ylabel("max pe_exec_ns (critical path)")
-        ax.set_title("Multi-device allreduce latency by topology" + title_suffix)
-        ax.grid(True, alpha=0.3)
-        ax.legend()
-        ax.xaxis.set_major_formatter(_bytes_fmt)
-        fig.tight_layout()
-        fig.savefig(out_dir / filename, dpi=120)
-        plt.close(fig)
-
-    _draw_overview(log_x=True, filename="overview.png", title_suffix="")
-    _draw_overview(
-        log_x=False, filename="overview_linear.png",
-        title_suffix=" [linear scale]",
-    )
-
-    print(
-        f"\nWrote {out_dir / 'overview.png'} + "
-        f"{out_dir / 'overview_linear.png'}"
-    )
+    print(f"\nWrote {out_dir / 'overview.png'}")