# kernbench2/tests/test_mp_spawn.py

"""ADR-0027 T4: torch.multiprocessing.spawn semantics.

Phase 1: imports `ctx.multiprocessing.spawn` which doesn't exist yet —
tests fail. Phase 2 (D1) lands the namespace + _MultiprocessingNamespace
+ SpawnException, and these pass.
"""
from __future__ import annotations

import os
import textwrap

import pytest
from greenlet import greenlet


def _write_minimal_ccl_yaml(tmp_path) -> str:
    """Create a minimal ``ccl.yaml`` inside *tmp_path*.

    Returns ``str(tmp_path)`` -- the directory that now contains the file,
    not the path of the YAML file itself -- so the caller can ``chdir`` into
    it.
    """
    config_text = textwrap.dedent("""\
        defaults:
          algorithm: ring_allreduce_tcm
          buffer_kind: tcm
          backpressure: sleep
          n_slots: 4
          slot_size: 4096
          vc_chunk_size: 256
          ipcq_credit_size_bytes: 16

        algorithms:
          ring_allreduce_tcm:
            module: kernbench.ccl.algorithms.ring_allreduce
            topology: ring_1d
            buffer_kind: tcm
            n_elem: 8
    """)
    (tmp_path / "ccl.yaml").write_text(config_text)
    return str(tmp_path)


def _make_ctx(topology):
    """Build a ``RuntimeContext`` backed by a data-enabled ``GraphEngine``.

    The engine is constructed from ``topology.topology_obj`` with
    ``enable_data=True``; the context targets ``DeviceSelector("all")``,
    uses the correlation id ``"test_t4"`` and takes its spec from
    ``topology.topology_obj.spec``. Tests in this file use the returned
    object as a context manager.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    # Collected in the same order the keyword arguments are evaluated.
    ctx_kwargs = {
        "engine": GraphEngine(topology.topology_obj, enable_data=True),
        "target_device": DeviceSelector("all"),
        "correlation_id": "test_t4",
        "spec": topology.topology_obj.spec,
    }
    return RuntimeContext(**ctx_kwargs)

# ── D1.3 namespace attach ────────────────────────────────────────────


def test_multiprocessing_namespace_attached(topology):
    """A fresh context exposes ``ctx.multiprocessing`` with a ``spawn`` attribute (D1.3)."""
    with _make_ctx(topology) as ctx:
        has_namespace = hasattr(ctx, "multiprocessing")
        assert has_namespace, "ADR-0027 D1.3: ctx.multiprocessing must exist"

        has_spawn = hasattr(ctx.multiprocessing, "spawn")
        assert has_spawn, (
            "ctx.multiprocessing must expose a spawn(fn, args, nprocs) method"
        )


# ── D1.1 / D1.2: spawn shape + rank binding ──────────────────────────


def test_spawn_invokes_fn_once_per_rank(topology):
    """``spawn(fn, args, nprocs)`` runs ``fn(rank, *args)`` exactly once per rank.

    With ``nprocs=3`` and ``args=(3,)`` the worker must be seen for ranks
    0, 1 and 2, each time receiving ``3`` as its extra argument.
    """
    with _make_ctx(topology) as ctx:
        recorded: list[tuple[int, tuple]] = []

        def _worker(rank: int, world_size: int) -> None:
            recorded.append((rank, (world_size,)))

        ctx.multiprocessing.spawn(_worker, args=(3,), nprocs=3)

        # Order of execution is not asserted, only the set of ranks.
        seen_ranks = [entry[0] for entry in recorded]
        seen_ranks.sort()
        assert seen_ranks == [0, 1, 2]

        for _, extra_args in recorded:
            (world_size,) = extra_args
            assert world_size == 3


def test_spawn_binds_greenlet_local_rank(topology):
    """Each worker's greenlet is registered under its own rank (ADR-0024 D9 + D1.2).

    From inside the worker, the current greenlet is looked up in
    ``ctx.distributed._rank_by_greenlet`` and the result must equal the
    ``rank`` argument the worker was called with. No collective is run;
    only the rank lookup is checked.
    """
    with _make_ctx(topology) as ctx:
        # (rank passed to the worker, rank registered for its greenlet)
        observed: list[tuple[int, int | None]] = []

        def _worker(rank: int) -> None:
            current = greenlet.getcurrent()
            bound = ctx.distributed._rank_by_greenlet.get(current)
            observed.append((rank, bound))

        ctx.multiprocessing.spawn(_worker, args=(), nprocs=2)

        # Guard against a vacuous pass: without this, a spawn() that never
        # runs any worker would leave `observed` empty and the loop below
        # would assert nothing.
        assert sorted(rank for rank, _ in observed) == [0, 1], (
            f"expected workers for ranks [0, 1]; observed {observed}"
        )

        for rank, bound in observed:
            assert rank == bound, (
                f"rank {rank} must be bound to greenlet-local rank {rank}; "
                f"got {bound}"
            )


# ── D1.2 exception cleanup ───────────────────────────────────────────


def test_spawn_exception_raises_spawn_exception_with_root_cause(topology):
    """D0.4-(4): a raising worker surfaces as ``SpawnException``.

    Rank 1 raises ``ValueError``; the resulting ``SpawnException.errors``
    mapping must contain that rank with the original exception instance.
    """
    with _make_ctx(topology) as ctx:
        from kernbench.runtime_api.multiprocessing import SpawnException

        failing_rank = 1

        def _worker(rank: int):
            if rank != failing_rank:
                return
            raise ValueError(f"rank {rank} boom")

        with pytest.raises(SpawnException) as caught:
            ctx.multiprocessing.spawn(_worker, args=(), nprocs=3)

        # The root-cause rank and its exception type are both recorded.
        errors = caught.value.errors
        assert failing_rank in errors
        assert isinstance(errors[failing_rank], ValueError)


def test_spawn_exception_clears_pending_queues(topology):
    """D0.4-(4): after a failed spawn, ``ctx._pending_worker_waits`` is empty.

    Every worker raises ``RuntimeError``. Only ``_pending_worker_waits`` is
    asserted here; no other queue is inspected by this test.
    """
    with _make_ctx(topology) as ctx:
        from kernbench.runtime_api.multiprocessing import SpawnException

        def _worker(rank: int):
            raise RuntimeError("fail")

        spawn = ctx.multiprocessing.spawn
        with pytest.raises(SpawnException):
            spawn(_worker, args=(), nprocs=2)

        leftover = ctx._pending_worker_waits
        assert leftover == []


# ── D1.4 migration compat: ccl_allreduce runs via mp.spawn ───────────


def test_ccl_allreduce_hand_rolled_loop_replaced_by_mp_spawn(
    topology, tmp_path, monkeypatch, spec,
):
    """D1.4 smoke test: ``benches.ccl_allreduce.run(ctx)`` drives one worker per rank.

    The bench's ``worker`` is replaced with a recording stub, the working
    directory is switched to a temp dir holding a minimal ``ccl.yaml``, and
    ``bench.run(ctx)`` is executed. Afterwards the stub must have been
    called for every rank in ``range(spec["system"]["sips"]["count"])`` and
    ``ctx._pending_worker_waits`` must be empty.
    """
    yaml_dir = _write_minimal_ccl_yaml(tmp_path)
    monkeypatch.chdir(yaml_dir)
    import benches.ccl_allreduce as bench

    invocations: list[tuple[int, int]] = []

    def _fake_worker(rank, world_size, torch):
        # Record the call only; no collective is executed.
        invocations.append((rank, world_size))

    monkeypatch.setattr(bench, "worker", _fake_worker)

    with _make_ctx(topology) as ctx:
        bench.run(ctx)

    sip_count = int(spec["system"]["sips"]["count"])
    observed_ranks = sorted(rank for rank, _ in invocations)
    assert observed_ranks == list(range(sip_count))
    assert ctx._pending_worker_waits == []


# ── _drain_pending function is exported ──────────────────────────────


def test_drain_pending_exported():
    """D0.4: ``kernbench.runtime_api.multiprocessing`` exports a callable ``_drain_pending``."""
    from kernbench.runtime_api.multiprocessing import _drain_pending as drain

    assert callable(drain)