Files
kernbench2/tests/test_ccl_allreduce_matrix.py
T
ywkang 79124daab1 ADR-0024 Phase B (partial): scheduler-level collective drain
Root cause (hang diagnosis):
`kernel_runner.run()` captures `greenlet.getcurrent()` at spawn time as
the kernel greenlet's `_parent`. When a worker greenlet (say g0) calls
`dist.all_reduce` → `ctx.wait(h)` → `env.run(until=h0)`, the SimPy
scheduler steps pe_cpu processes, which in turn spawn kernel greenlets.
Those kernels' `_parent` becomes g0 (current greenlet at spawn). When a
kernel yields via switch_to_simpy, control jumps back up to g0's LAST
switch point — which is the main scheduler's `g.switch()` call — rather
than the kernel_runner's generator frame. Main then re-enters its
`for g in alive: g.switch()` loop mid-wait, producing nested greenlet
re-entry. Scheduler spins: g0 never completes, g1 appears to complete
out of order, infinite loop at 100% CPU.

Fix:
- AhbmCCLBackend.all_reduce: in multi-greenlet mode, submit via
  launch(_defer_wait=True), extend backend._pending_collective_handles,
  and yield to the parent greenlet. Worker does NOT call wait.
- benches/ccl_allreduce.py run(): after each scheduler round, the MAIN
  greenlet drains backend._pending_collective_handles. This keeps
  env.run invocation in the main context, so kernel_runner's spawned
  kernel greenlets have main as their _parent — no nested re-entry.
- Legacy single-driver path (no bench scheduler): all_reduce falls back
  to inline wait when g.parent is None.

Result:
- Multi-greenlet cross-SIP ring no longer hangs (was 100% CPU infinite
  loop in kernel_runner._switch_kernel).
- ring_default_ws still xfail(strict=True): now fails as a data
  correctness issue — DataExecutor reports only 1 math op for a 2-rank
  ring (expected 2). Cross-SIP op_log replay integration is the
  remaining Phase B task.

514 passed, 1 xfailed (strict).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 09:14:03 -07:00

158 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench.
Each parametrized case writes a tmp ``ccl.yaml`` overlay that selects a
specific (algorithm, world_size, buffer_kind, n_elem) combination, then
runs the bench via the CLI and asserts the printed line reports all
ranks OK.
This single test file replaces the per-variant bench tests
(test_ccl_allreduce_e2e, test_ccl_mesh_allreduce, test_ccl_tree_allreduce,
test_ccl_multicube, test_ccl_multisip).
"""
from __future__ import annotations
import os
import textwrap
import pytest
import kernbench.cli.main as cli_main
CCL_YAML_TEMPLATE = textwrap.dedent("""\
defaults:
algorithm: {algorithm}
buffer_kind: {buffer_kind}
backpressure: sleep
n_slots: 4
slot_size: 4096
vc_chunk_size: 256
ipcq_credit_size_bytes: 16
algorithms:
{algorithm}:
module: {module}
topology: {topology}
buffer_kind: {buffer_kind}
{world_size_line}{n_elem_line}
""")
def _write_ccl_yaml(
tmp_path,
*,
algorithm: str,
module: str,
topology: str,
buffer_kind: str = "tcm",
world_size: int | None = None,
n_elem: int | None = None,
) -> str:
"""Write a tmp ccl.yaml in tmp_path and return its directory."""
ws_line = f" world_size: {world_size}\n" if world_size is not None else ""
nel_line = f" n_elem: {n_elem}\n" if n_elem is not None else ""
body = CCL_YAML_TEMPLATE.format(
algorithm=algorithm,
module=module,
topology=topology,
buffer_kind=buffer_kind,
world_size_line=ws_line,
n_elem_line=nel_line,
)
yaml_path = tmp_path / "ccl.yaml"
yaml_path.write_text(body)
return str(tmp_path)
CASES = [
# algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws
#
# Default fallback — no world_size override → ADR-0024 D1 derives
# from topology (SIP count = 2). Exercises the new SIP-level TP
# launcher + cross-SIP ring.
# XFAIL: Phase A fix (scheduler-level wait) resolves the greenlet-
# re-entry hang, but Phase 2 DataExecutor still reports only 1 math
# op for a 2-rank ring (expected 2) — cross-SIP op_log replay
# integration pending ADR-0024 Phase B.
pytest.param(
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", None, 8, 2,
id="ring_default_ws",
marks=pytest.mark.xfail(
reason="ADR-0024 Phase B: cross-SIP op_log replay integration",
strict=True,
),
),
# Buffer variants at 8-rank (fast — same kernel, different slot space).
pytest.param(
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", 8, 32, 8,
id="ring_tcm_8",
),
pytest.param(
"ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "hbm", 8, 32, 8,
id="ring_hbm_8",
),
pytest.param(
"ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "sram", 8, 32, 8,
id="ring_sram_8",
),
# Multi-cube (16-rank, cross-cube within 1 SIP).
pytest.param(
"ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce",
"ring_1d", "tcm", 16, 16, 16,
id="ring_multi_cube",
),
# Mesh + tree algorithms.
pytest.param(
"mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce",
"mesh_2d", "tcm", 4, 16, 4,
id="mesh_2x2",
),
pytest.param(
"tree_allreduce_7", "kernbench.ccl.algorithms.tree_allreduce",
"tree_binary", "tcm", 7, 16, 7,
id="tree_binary_7",
),
]
@pytest.mark.parametrize(
"algorithm,module,topology,buffer_kind,world_size,n_elem,expected_ws",
CASES,
)
def test_ccl_allreduce_matrix(
tmp_path, capsys, monkeypatch,
algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws,
):
"""Each (algorithm × buffer × world_size) combo passes through the
unified bench and yields all ranks OK."""
project_root = os.path.abspath(
os.path.join(os.path.dirname(__file__), "..")
)
yaml_dir = _write_ccl_yaml(
tmp_path,
algorithm=algorithm,
module=module,
topology=topology,
buffer_kind=buffer_kind,
world_size=world_size,
n_elem=n_elem,
)
monkeypatch.chdir(yaml_dir)
rc = cli_main.main([
"run",
"--topology", os.path.join(project_root, "topology.yaml"),
"--bench", "ccl_allreduce",
"--verify-data",
])
assert rc == 0
out = capsys.readouterr().out
assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}"
assert f"{algorithm} (ws={expected_ws}): {expected_ws} OK" in out, (
f"expected '{algorithm} (ws={expected_ws}): {expected_ws} OK' "
f"in output:\n{out}"
)