Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh
New intercube allreduce kernel replacing the old flat ring algorithms. Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then inter-SIP exchange on root cube, then broadcast back. Supports ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce. New files: - src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel) - src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip) - tests/test_allreduce_multidevice.py (config-driven, 3 topologies) - tests/test_distributed_intercube_allreduce.py (full distributed path) - tests/test_intercube_sfr_config.py (SFR wiring verification) Modified: - distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip - topologies.py: added torus_2d, mesh_2d_no_wrap - install.py: global_E/W/N/S in _OPPOSITE_DIR - topology.yaml: added system.sips.topology - ccl.yaml: single intercube_allreduce algorithm - benches/ccl_allreduce.py: row_wise cube-mesh tensor layout Removed old flat-ring algorithms and their tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,222 @@
|
||||
"""Config-driven multi-device allreduce test application.
|
||||
|
||||
Reads ``ccl.yaml`` + ``topology.yaml``, dynamically loads the kernel
|
||||
module from ``ccl.yaml → module``, and picks the inter-SIP exchange
|
||||
pattern from ``topology.yaml → system.sips.topology``.
|
||||
|
||||
Run directly::
|
||||
|
||||
python -m pytest tests/allreduce_app.py -v -s
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import math
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
|
||||
def _sip_topo_dims(sip_topo: str, n_sips: int) -> tuple[int, int]:
|
||||
if sip_topo == "ring_1d":
|
||||
return (0, 0)
|
||||
side = int(round(math.sqrt(n_sips)))
|
||||
if side * side != n_sips:
|
||||
raise ValueError(
|
||||
f"SIP topology '{sip_topo}' requires square n_sips, got {n_sips}"
|
||||
)
|
||||
return (side, side)
|
||||
|
||||
|
||||
def run_allreduce(
|
||||
ctx: Any,
|
||||
engine: Any,
|
||||
spec: dict,
|
||||
*,
|
||||
algorithm: str | None = None,
|
||||
ccl_yaml: str | None = None,
|
||||
) -> dict:
|
||||
"""Config-driven allreduce: read yaml, load kernel, run.
|
||||
|
||||
Everything is resolved from config — no hardcoded kernel imports.
|
||||
"""
|
||||
cfg_all = load_ccl_config(ccl_yaml)
|
||||
cfg = resolve_algorithm_config(cfg_all, algorithm)
|
||||
|
||||
# Dynamic import from ccl.yaml → module
|
||||
algo_module = importlib.import_module(cfg["module"])
|
||||
kernel_fn = algo_module.kernel
|
||||
topo_name_to_kind = algo_module.TOPO_NAME_TO_KIND
|
||||
|
||||
n_elem = int(cfg.get("n_elem", 8))
|
||||
n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||
sip_topo = str(
|
||||
spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
|
||||
)
|
||||
|
||||
cm = spec["sip"]["cube_mesh"]
|
||||
cube_w = int(cm["w"])
|
||||
cube_h = int(cm["h"])
|
||||
n_cubes = cube_w * cube_h
|
||||
|
||||
sip_topo_kind = topo_name_to_kind.get(sip_topo, 0)
|
||||
sip_topo_w, sip_topo_h = _sip_topo_dims(sip_topo, n_sips)
|
||||
|
||||
algo_name = cfg.get("algorithm", "allreduce")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"algorithm: {algo_name}")
|
||||
print(f"module: {cfg['module']}")
|
||||
print(f"sip_topology: {sip_topo}")
|
||||
print(f"kernel: {kernel_fn.__name__}")
|
||||
print(f"n_sips: {n_sips}")
|
||||
print(f"n_cubes: {n_cubes}")
|
||||
print(f"n_elem: {n_elem}")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
configure_sfr_intercube_multisip(engine, spec, cfg)
|
||||
|
||||
dp = DPPolicy(
|
||||
cube="row_wise", pe="replicate",
|
||||
num_pes=1, num_cubes=n_cubes,
|
||||
)
|
||||
|
||||
tensors = []
|
||||
for sip in range(n_sips):
|
||||
ctx.ahbm.set_device(sip)
|
||||
t = ctx.zeros(
|
||||
(n_cubes, n_elem), dtype="f16", dp=dp,
|
||||
name=f"sip{sip}",
|
||||
)
|
||||
t.copy_(ctx.from_numpy(
|
||||
np.full((n_cubes, n_elem), float(sip + 1), dtype=np.float16)
|
||||
))
|
||||
tensors.append(t)
|
||||
|
||||
for sip in range(n_sips):
|
||||
arr = tensors[sip].numpy()
|
||||
print(f"[SIP {sip}] input cube0[:4] = {arr[0][:4].tolist()} "
|
||||
f"cube{n_cubes - 1}[:4] = {arr[-1][:4].tolist()}")
|
||||
|
||||
t_start = engine._env.now
|
||||
|
||||
all_pending = []
|
||||
for sip_rank, t in enumerate(tensors):
|
||||
pending = ctx.launch(
|
||||
algo_name, kernel_fn, t,
|
||||
n_elem, cube_w, cube_h, n_sips, sip_rank,
|
||||
sip_topo_kind, sip_topo_w, sip_topo_h,
|
||||
_defer_wait=True,
|
||||
)
|
||||
all_pending.extend(pending)
|
||||
|
||||
for h, sip_id, meta in all_pending:
|
||||
ctx.wait(h, _meta=meta)
|
||||
|
||||
t_end = engine._env.now
|
||||
latency_ns = t_end - t_start
|
||||
print(f"\n[{algo_name} ws={n_sips}] sim latency = "
|
||||
f"{latency_ns:.1f} ns ({latency_ns / 1000:.3f} us)")
|
||||
|
||||
for key, (_, trace) in engine._results.items():
|
||||
if not isinstance(trace, dict):
|
||||
continue
|
||||
total = trace.get("total_ns", 0.0)
|
||||
pe_exec = trace.get("pe_exec_ns", 0.0) or 0.0
|
||||
network = total - pe_exec
|
||||
print(f" [{key}] total={total:.1f} ns "
|
||||
f"pe_exec={pe_exec:.1f} ns network={network:.1f} ns")
|
||||
|
||||
expected = float(n_cubes * sum(range(1, n_sips + 1)))
|
||||
|
||||
print()
|
||||
for sip in range(n_sips):
|
||||
arr = tensors[sip].numpy()
|
||||
print(f"[SIP {sip}] output cube0[:4] = {arr[0][:4].tolist()}")
|
||||
print(f"[SIP {sip}] output cube{n_cubes - 1}[:4] = {arr[-1][:4].tolist()}")
|
||||
|
||||
ok_cubes = 0
|
||||
for sip in range(n_sips):
|
||||
arr = tensors[sip].numpy()
|
||||
for cube_id in range(n_cubes):
|
||||
assert np.allclose(
|
||||
arr[cube_id], expected, rtol=1e-1, atol=1e-1,
|
||||
), (
|
||||
f"SIP{sip} cube {cube_id}: "
|
||||
f"got {arr[cube_id][:4]}, expected {expected}"
|
||||
)
|
||||
ok_cubes += 1
|
||||
|
||||
print(f"\n {algo_name} (ws={n_sips}): {ok_cubes} OK")
|
||||
|
||||
return {
|
||||
"expected": expected,
|
||||
"latency_ns": latency_ns,
|
||||
"ok_cubes": ok_cubes,
|
||||
}
|
||||
|
||||
|
||||
# ── pytest entry point ───────────────────────────────────────────────
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
CONFIGS = [
|
||||
pytest.param("intercube_allreduce", "ring_1d", 2, id="ring_2sip"),
|
||||
pytest.param("intercube_allreduce", "torus_2d", 4, id="torus_4sip"),
|
||||
pytest.param("intercube_allreduce", "mesh_2d_no_wrap", 4, id="mesh_4sip"),
|
||||
]
|
||||
|
||||
|
||||
def _write_temp_configs(tmp_path, sip_topology, n_sips, algorithm):
|
||||
"""Write temp topology.yaml and ccl.yaml with the given overrides."""
|
||||
with open(TOPOLOGY_PATH) as f:
|
||||
topo_cfg = yaml.safe_load(f)
|
||||
topo_cfg["system"]["sips"]["count"] = n_sips
|
||||
topo_cfg["system"]["sips"]["topology"] = sip_topology
|
||||
topo_path = tmp_path / "topology.yaml"
|
||||
with open(topo_path, "w") as f:
|
||||
yaml.dump(topo_cfg, f, default_flow_style=False)
|
||||
|
||||
ccl_path = Path(__file__).parent.parent / "ccl.yaml"
|
||||
with open(ccl_path) as f:
|
||||
ccl_cfg = yaml.safe_load(f)
|
||||
ccl_cfg["defaults"]["algorithm"] = algorithm
|
||||
tmp_ccl = tmp_path / "ccl.yaml"
|
||||
with open(tmp_ccl, "w") as f:
|
||||
yaml.dump(ccl_cfg, f, default_flow_style=False)
|
||||
|
||||
return str(topo_path), str(tmp_ccl)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm,sip_topology,n_sips", CONFIGS)
|
||||
def test_allreduce(tmp_path, algorithm, sip_topology, n_sips):
|
||||
topo_path, ccl_path = _write_temp_configs(
|
||||
tmp_path, sip_topology, n_sips, algorithm,
|
||||
)
|
||||
topo = resolve_topology(topo_path)
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
spec = topo.topology_obj.spec
|
||||
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id=f"test_{algorithm}_{sip_topology}",
|
||||
spec=spec,
|
||||
) as ctx:
|
||||
result = run_allreduce(
|
||||
ctx, engine, spec,
|
||||
algorithm=algorithm, ccl_yaml=ccl_path,
|
||||
)
|
||||
assert result["ok_cubes"] > 0
|
||||
@@ -1,108 +0,0 @@
|
||||
"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench.
|
||||
|
||||
Only covers the rank = SIP TP launcher path (ADR-0024 + ADR-0027). Each
|
||||
case writes a tmp ``ccl.yaml`` that selects a specific (algorithm,
|
||||
buffer_kind) pair; ``world_size`` is always derived from topology SIP
|
||||
count (2 in the shipped topology).
|
||||
|
||||
The legacy rank = PE single-driver path was removed; intra-SIP PE-level
|
||||
collectives are expressed inside the kernel via ``tl.program_id`` and do
|
||||
not require a host-side ``ProcessGroup``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
import kernbench.cli.main as cli_main
|
||||
|
||||
|
||||
CCL_YAML_TEMPLATE = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: {buffer_kind}
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
{algorithm}:
|
||||
module: {module}
|
||||
topology: {topology}
|
||||
buffer_kind: {buffer_kind}
|
||||
""")
|
||||
|
||||
|
||||
def _write_ccl_yaml(
|
||||
tmp_path,
|
||||
*,
|
||||
algorithm: str,
|
||||
module: str,
|
||||
topology: str,
|
||||
buffer_kind: str,
|
||||
) -> str:
|
||||
body = CCL_YAML_TEMPLATE.format(
|
||||
algorithm=algorithm,
|
||||
module=module,
|
||||
topology=topology,
|
||||
buffer_kind=buffer_kind,
|
||||
)
|
||||
(tmp_path / "ccl.yaml").write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
CASES = [
|
||||
# Ring all-reduce across SIPs (ws == topology SIP count = 2),
|
||||
# one case per IPCQ buffer location.
|
||||
pytest.param(
|
||||
"ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "tcm",
|
||||
id="ring_tcm",
|
||||
),
|
||||
pytest.param(
|
||||
"ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "hbm",
|
||||
id="ring_hbm",
|
||||
),
|
||||
pytest.param(
|
||||
"ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
|
||||
"ring_1d", "sram",
|
||||
id="ring_sram",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("algorithm,module,topology,buffer_kind", CASES)
|
||||
def test_ccl_allreduce_matrix(
|
||||
tmp_path, capsys, monkeypatch,
|
||||
algorithm, module, topology, buffer_kind,
|
||||
):
|
||||
"""Each (algorithm × buffer_kind) combo passes through the unified
|
||||
rank = SIP bench and yields ``ws OK`` where ``ws == topology SIP count``."""
|
||||
project_root = os.path.abspath(
|
||||
os.path.join(os.path.dirname(__file__), "..")
|
||||
)
|
||||
yaml_dir = _write_ccl_yaml(
|
||||
tmp_path,
|
||||
algorithm=algorithm,
|
||||
module=module,
|
||||
topology=topology,
|
||||
buffer_kind=buffer_kind,
|
||||
)
|
||||
monkeypatch.chdir(yaml_dir)
|
||||
rc = cli_main.main([
|
||||
"run",
|
||||
"--topology", os.path.join(project_root, "topology.yaml"),
|
||||
"--bench", "ccl_allreduce",
|
||||
"--verify-data",
|
||||
])
|
||||
assert rc == 0
|
||||
out = capsys.readouterr().out
|
||||
assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}"
|
||||
assert f"{algorithm}" in out and "OK" in out, (
|
||||
f"expected pass line for '{algorithm}' in output:\n{out}"
|
||||
)
|
||||
@@ -1,244 +0,0 @@
|
||||
"""Phase 1 tests for ADR-0024 SIP-level TP launcher (MVP scope).
|
||||
|
||||
Covers:
|
||||
- D1 world_size = SIP count fallback
|
||||
- D9 get_rank greenlet-local + _bind_rank
|
||||
- D10 torch.ahbm.set_device + torch.accelerator alias
|
||||
- D11 tensor placement scoped to current device SIP
|
||||
- D12/D13 run() spawns one greenlet per rank
|
||||
|
||||
Deferred to later ADR-0024 sub-phases:
|
||||
- D2 engine-routed install
|
||||
- D6 install_plan.py
|
||||
- D7 epoch barrier (this phase uses simple submit+yield+wait)
|
||||
- D8 validator registry
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
from greenlet import greenlet
|
||||
|
||||
from kernbench.runtime_api.distributed import AhbmCCLBackend, DistributedContext
|
||||
|
||||
|
||||
# ── Fixtures / helpers ────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _FakeCtx:
|
||||
"""Minimal ctx double — only exposes what AhbmCCLBackend.__init__ uses.
|
||||
|
||||
Stubs install_ipcq so we can unit-test _resolve_world_size without
|
||||
touching the engine stack.
|
||||
"""
|
||||
|
||||
def __init__(self, spec: dict) -> None:
|
||||
self.spec = spec
|
||||
self.install_calls: list[dict] = []
|
||||
|
||||
def install_ipcq(self, **kwargs) -> dict:
|
||||
self.install_calls.append(dict(kwargs))
|
||||
return {}
|
||||
|
||||
|
||||
def _write_minimal_ccl_yaml(tmp_path) -> str:
|
||||
"""Write a ccl.yaml with NO world_size override — forces topology derivation."""
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: ring_allreduce_tcm
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
""")
|
||||
yaml_path = tmp_path / "ccl.yaml"
|
||||
yaml_path.write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
# ── D1: world_size = SIP count fallback ───────────────────────────────
|
||||
|
||||
|
||||
def test_world_size_equals_sip_count(tmp_path, monkeypatch, spec):
|
||||
"""With no override, backend derives world_size from SIP count only.
|
||||
|
||||
Topology has 2 SIPs × 16 cubes × 8 PEs = 256 PEs. The TP/DP model
|
||||
places the collective group at the SIP boundary, so world_size must
|
||||
equal SIP count (2), not total PE count (256).
|
||||
"""
|
||||
monkeypatch.chdir(_write_minimal_ccl_yaml(tmp_path))
|
||||
ctx = _FakeCtx(spec=spec)
|
||||
backend = AhbmCCLBackend(torch_ctx=ctx)
|
||||
expected = int(spec["system"]["sips"]["count"])
|
||||
assert backend.world_size == expected, (
|
||||
f"expected world_size == SIP count ({expected}); "
|
||||
f"got {backend.world_size} — still deriving sips × cubes × pes"
|
||||
)
|
||||
|
||||
|
||||
# ── D9: get_rank greenlet-local + _bind_rank ──────────────────────────
|
||||
|
||||
|
||||
def test_get_rank_is_greenlet_local(tmp_path, monkeypatch, spec):
|
||||
"""Each greenlet sees its own rank via dist.get_rank().
|
||||
|
||||
Framework-level launcher binds greenlet → rank; get_rank() resolves
|
||||
the current greenlet and returns that rank.
|
||||
"""
|
||||
monkeypatch.chdir(_write_minimal_ccl_yaml(tmp_path))
|
||||
ctx = _FakeCtx(spec=spec)
|
||||
dc = DistributedContext()
|
||||
dc._ctx_ref = ctx
|
||||
dc.init_process_group(backend="ahbm")
|
||||
assert dc.get_world_size() == int(spec["system"]["sips"]["count"])
|
||||
|
||||
assert hasattr(dc, "_bind_rank"), (
|
||||
"DistributedContext must expose _bind_rank(g, rank) hook"
|
||||
)
|
||||
|
||||
seen: dict[int, int] = {}
|
||||
|
||||
def _probe(rank: int) -> None:
|
||||
seen[rank] = dc.get_rank()
|
||||
|
||||
g0 = greenlet(lambda: _probe(0))
|
||||
g1 = greenlet(lambda: _probe(1))
|
||||
dc._bind_rank(g0, 0)
|
||||
dc._bind_rank(g1, 1)
|
||||
g0.switch()
|
||||
g1.switch()
|
||||
|
||||
assert seen == {0: 0, 1: 1}, (
|
||||
f"expected each greenlet to see its own rank; got {seen}"
|
||||
)
|
||||
|
||||
|
||||
def test_get_rank_fallback_without_bind(tmp_path, monkeypatch, spec):
|
||||
"""Unbound greenlet falls back to rank 0 (single-driver compat)."""
|
||||
monkeypatch.chdir(_write_minimal_ccl_yaml(tmp_path))
|
||||
ctx = _FakeCtx(spec=spec)
|
||||
dc = DistributedContext()
|
||||
dc._ctx_ref = ctx
|
||||
dc.init_process_group(backend="ahbm")
|
||||
# Call from main (unbound) greenlet
|
||||
assert dc.get_rank() == 0
|
||||
|
||||
|
||||
# ── D10/D11: torch.ahbm.set_device + tensor scoping ───────────────────
|
||||
|
||||
|
||||
def test_ahbm_set_device_binds_tensor_to_single_sip(topology):
|
||||
"""``torch.ahbm.set_device(rank)`` + default-sip DPPolicy → tensor on SIP rank.
|
||||
|
||||
After set_device(1), a tensor with DPPolicy leaving the SIP dimension
|
||||
at its default must be placed entirely on SIP 1.
|
||||
"""
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
|
||||
engine = GraphEngine(topology.topology_obj, enable_data=True)
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="test_ahbm_set_device",
|
||||
spec=topology.topology_obj.spec,
|
||||
) as ctx:
|
||||
assert hasattr(ctx, "ahbm"), (
|
||||
"RuntimeContext must expose .ahbm namespace (ADR-0024 D10)"
|
||||
)
|
||||
ctx.ahbm.set_device(1)
|
||||
|
||||
dp = DPPolicy(cube="column_wise", pe="column_wise") # default sip
|
||||
tensor = ctx.zeros((1, 128), dtype="f16", dp=dp, name="probe")
|
||||
|
||||
shard_sips = {s.sip for s in tensor._handle.shards}
|
||||
assert shard_sips == {1}, (
|
||||
f"after ahbm.set_device(1), all shards should live on SIP 1; "
|
||||
f"got sips={sorted(shard_sips)}"
|
||||
)
|
||||
|
||||
|
||||
def test_accelerator_alias_mirrors_ahbm(topology):
|
||||
"""torch.accelerator.set_device_index(r) is an alias for ahbm.set_device(r)
|
||||
(PyTorch 2.x device-agnostic surface)."""
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
|
||||
engine = GraphEngine(topology.topology_obj, enable_data=True)
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="test_accelerator_alias",
|
||||
spec=topology.topology_obj.spec,
|
||||
) as ctx:
|
||||
assert hasattr(ctx, "accelerator"), (
|
||||
"RuntimeContext must expose .accelerator namespace (ADR-0024 D10)"
|
||||
)
|
||||
ctx.accelerator.set_device_index(1)
|
||||
# Both namespaces should report SIP 1 as current device
|
||||
assert ctx.ahbm.current_device() == 1
|
||||
assert ctx.accelerator.current_device_index() == 1
|
||||
|
||||
|
||||
# ── D12/D13: run() spawns one worker per rank ─────────────────────────
|
||||
|
||||
|
||||
def test_run_spawns_one_worker_per_rank(tmp_path, monkeypatch, spec):
|
||||
"""The bench's ``run()`` invokes ``worker`` once per rank.
|
||||
|
||||
With world_size = SIP count = 2 (topology), worker must be called
|
||||
exactly twice with ranks 0 and 1. Each call sees world_size=2.
|
||||
"""
|
||||
project_root = os.path.abspath(
|
||||
os.path.join(os.path.dirname(__file__), "..")
|
||||
)
|
||||
monkeypatch.chdir(_write_minimal_ccl_yaml(tmp_path))
|
||||
|
||||
import benches.ccl_allreduce as bench
|
||||
|
||||
calls: list[tuple[int, int]] = []
|
||||
|
||||
def _fake_worker(rank, cfg, torch) -> None:
|
||||
calls.append((rank, cfg.world_size))
|
||||
|
||||
monkeypatch.setattr(bench, "_worker", _fake_worker)
|
||||
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="test_run_spawns",
|
||||
spec=topo.topology_obj.spec,
|
||||
) as ctx:
|
||||
bench.run(ctx)
|
||||
|
||||
ranks = sorted(r for r, _ in calls)
|
||||
ws_values = {ws for _, ws in calls}
|
||||
expected_ws = int(spec["system"]["sips"]["count"])
|
||||
assert ranks == list(range(expected_ws)), (
|
||||
f"run() should invoke worker for ranks 0..{expected_ws - 1}; "
|
||||
f"saw ranks={ranks}"
|
||||
)
|
||||
assert ws_values == {expected_ws}, (
|
||||
f"each worker should see world_size={expected_ws}; saw {ws_values}"
|
||||
)
|
||||
@@ -1,125 +0,0 @@
|
||||
"""Tests for IPCQ deadlock detection (ADR-0023 D14 F3)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import simpy
|
||||
|
||||
from kernbench.ccl import diagnostics
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqEndpoint,
|
||||
IpcqInitEntry,
|
||||
IpcqRecvCmd,
|
||||
IpcqRequest,
|
||||
)
|
||||
from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
|
||||
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
@dataclass
|
||||
class _FakeTxn:
|
||||
request: Any
|
||||
done: simpy.Event
|
||||
result_data: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _make_isolated_pe_ipcq(env):
|
||||
node = Node(
|
||||
id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq",
|
||||
impl="builtin.pe_ipcq", attrs={}, pos_mm=None,
|
||||
)
|
||||
comp = PeIpcqComponent(node, ctx=None)
|
||||
comp.in_ports["host"] = simpy.Store(env)
|
||||
comp.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env)
|
||||
comp.start(env)
|
||||
|
||||
peer_credit = simpy.Store(env)
|
||||
ep = IpcqEndpoint(
|
||||
sip=0, cube=0, pe=1, buffer_kind="tcm",
|
||||
rx_base_pa=0x10_000, rx_base_va=0,
|
||||
n_slots=4, slot_size=4096,
|
||||
)
|
||||
init_msg = IpcqInitMsg(
|
||||
correlation_id="t", request_id="t",
|
||||
target_sips=(0,), target_cubes=(0,), target_pe=0,
|
||||
entries=(IpcqInitEntry(
|
||||
direction="W", peer=ep,
|
||||
my_rx_base_pa=0x40_000, my_rx_base_va=0,
|
||||
n_slots=4, slot_size=4096,
|
||||
peer_credit_store=peer_credit,
|
||||
),),
|
||||
backpressure_mode="sleep",
|
||||
buffer_kind="tcm",
|
||||
credit_size_bytes=16,
|
||||
)
|
||||
done = env.event()
|
||||
comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=done))
|
||||
env.run(until=done)
|
||||
return comp
|
||||
|
||||
|
||||
def test_pointer_dump_includes_blocked_state():
|
||||
"""A blocked recv should still be visible in the pointer dump."""
|
||||
env = simpy.Environment()
|
||||
comp = _make_isolated_pe_ipcq(env)
|
||||
|
||||
# Issue a recv that will block (no data has arrived)
|
||||
recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r1")
|
||||
req = IpcqRequest(command=recv_cmd, done=env.event())
|
||||
comp.in_ports["host"].put(req)
|
||||
env.run(until=10)
|
||||
assert not req.done.triggered
|
||||
|
||||
# Pointer dump should show my_tail=0 and peer_head_cache=0
|
||||
# We need to use the engine API but for an isolated component, just call directly
|
||||
class FakeEngine:
|
||||
_components = {"sip0.cube0.pe0.pe_ipcq": comp}
|
||||
|
||||
dump = diagnostics.pointer_dump(FakeEngine())
|
||||
assert "my_tail=0" in dump
|
||||
assert "peer_head_cache=0" in dump
|
||||
|
||||
|
||||
def test_deadlock_detection_recv_without_send():
|
||||
"""A recv with no matching sender → SimPy schedule empties → engine
|
||||
raises ``IpcqDeadlock`` with a pointer dump.
|
||||
"""
|
||||
from kernbench.ccl.diagnostics import IpcqDeadlock
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
def deadlock_kernel(t_ptr, n_elem, tl):
|
||||
# Every PE just receives, no sends → no one delivers → deadlock
|
||||
tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||
|
||||
topo = resolve_topology("topology.yaml")
|
||||
|
||||
def run(torch):
|
||||
torch.install_ipcq(
|
||||
algorithm="ring_allreduce_tcm", world_size_override=8,
|
||||
)
|
||||
a = torch.zeros(
|
||||
(1, 8 * 8),
|
||||
dtype="f16",
|
||||
dp=DPPolicy(
|
||||
cube="replicate", pe="column_wise",
|
||||
num_cubes=1,
|
||||
),
|
||||
name="dl_in",
|
||||
)
|
||||
torch.launch("dl", deadlock_kernel, a, 8)
|
||||
|
||||
with pytest.raises(IpcqDeadlock):
|
||||
run_bench(
|
||||
topology=topo, bench_fn=run,
|
||||
device=resolve_device("all"),
|
||||
engine_factory=lambda t, d: GraphEngine(
|
||||
getattr(t, "topology_obj", t), enable_data=True
|
||||
),
|
||||
)
|
||||
@@ -1,70 +0,0 @@
|
||||
"""Tests for CCL diagnostics: trace + pointer dump (ADR-0023 D14)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
from kernbench.ccl import diagnostics
|
||||
|
||||
|
||||
# ── trace toggle ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_trace_disabled_by_default(monkeypatch):
|
||||
monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)
|
||||
diagnostics.reload_trace_setting()
|
||||
assert diagnostics.trace_enabled() is False
|
||||
|
||||
|
||||
def test_trace_enabled_via_env(monkeypatch):
|
||||
monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
|
||||
diagnostics.reload_trace_setting()
|
||||
assert diagnostics.trace_enabled() is True
|
||||
|
||||
|
||||
def test_trace_record_send(monkeypatch, capsys):
|
||||
monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
|
||||
diagnostics.reload_trace_setting()
|
||||
diagnostics.log_send(t_ns=100.0, sender="sip0.cube0.pe0",
|
||||
direction="E", nbytes=64, sender_seq=0)
|
||||
out = capsys.readouterr().out
|
||||
assert "send" in out
|
||||
assert "sip0.cube0.pe0" in out
|
||||
assert "dir=E" in out
|
||||
monkeypatch.delenv("KERNBENCH_CCL_TRACE")
|
||||
diagnostics.reload_trace_setting()
|
||||
|
||||
|
||||
def test_trace_record_recv(monkeypatch, capsys):
|
||||
monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
|
||||
diagnostics.reload_trace_setting()
|
||||
diagnostics.log_recv(t_ns=200.0, receiver="sip0.cube0.pe1",
|
||||
direction="W", nbytes=64)
|
||||
out = capsys.readouterr().out
|
||||
assert "recv" in out
|
||||
assert "sip0.cube0.pe1" in out
|
||||
monkeypatch.delenv("KERNBENCH_CCL_TRACE")
|
||||
diagnostics.reload_trace_setting()
|
||||
|
||||
|
||||
# ── pointer dump ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_pointer_dump_format():
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
from kernbench.ccl.install import (
|
||||
install_ipcq, load_ccl_config, resolve_algorithm_config,
|
||||
)
|
||||
|
||||
topo = resolve_topology("topology.yaml").topology_obj
|
||||
engine = GraphEngine(topo, enable_data=True)
|
||||
cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
|
||||
install_ipcq(engine, topo.spec, cfg)
|
||||
|
||||
dump = diagnostics.pointer_dump(engine)
|
||||
# 8 ranks × 2 directions = 16 lines (plus 8 PE headers)
|
||||
assert "sip0.cube0.pe0" in dump
|
||||
assert "E:" in dump
|
||||
assert "W:" in dump
|
||||
assert "my_head=" in dump
|
||||
assert "peer_tail_cache=" in dump
|
||||
@@ -1,81 +0,0 @@
|
||||
"""Validate the hello-world example from docs/ccl-author-guide.md.
|
||||
|
||||
This is the simplest possible CCL kernel — each PE sends its tile E
|
||||
and receives a tile from W. After running, each rank's slice should
|
||||
contain the data of the previous rank.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.algorithms import hello_send
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def test_hello_send_4_ranks_mock():
|
||||
n_elem = 8
|
||||
inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)]
|
||||
|
||||
outputs = run_kernel_in_mock(
|
||||
kernel_fn=hello_send.kernel,
|
||||
world_size=4,
|
||||
topology="ring_1d",
|
||||
inputs=inputs,
|
||||
kernel_args=(n_elem,),
|
||||
)
|
||||
|
||||
# rank r should have rank (r-1) % 4's data
|
||||
for r in range(4):
|
||||
prev = inputs[(r - 1) % 4]
|
||||
assert np.array_equal(outputs[r], prev), f"rank {r}: got {outputs[r]}"
|
||||
|
||||
|
||||
def test_hello_send_via_simpy_runner():
|
||||
"""Same but through real SimPy + IPCQ."""
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
topo = resolve_topology("topology.yaml")
|
||||
n_elem = 8
|
||||
world_size = 8
|
||||
|
||||
def run(torch):
|
||||
# World size for this hello test is 8 (one cube). ccl.yaml no
|
||||
# longer carries a default world_size — pass it explicitly.
|
||||
plan = torch.install_ipcq(
|
||||
algorithm="ring_allreduce_tcm", world_size_override=world_size,
|
||||
)
|
||||
a = torch.zeros(
|
||||
(1, world_size * n_elem), dtype="f16",
|
||||
dp=DPPolicy(
|
||||
cube="replicate", pe="column_wise",
|
||||
num_cubes=1,
|
||||
),
|
||||
name="hello_in",
|
||||
)
|
||||
store = torch.engine.memory_store
|
||||
base = a._handle.va_base or a._handle.shards[0].pa
|
||||
nbytes = n_elem * 2
|
||||
for r in range(world_size):
|
||||
store.write("hbm", base + r * nbytes,
|
||||
np.full((n_elem,), float(r + 1), dtype=np.float16))
|
||||
|
||||
torch.launch("hello_send", hello_send.kernel, a, n_elem)
|
||||
|
||||
# Each rank should hold the previous rank's data after the round
|
||||
for r in range(world_size):
|
||||
arr = store.read("hbm", base + r * nbytes, shape=(n_elem,), dtype="f16")
|
||||
prev_value = float(((r - 1) % world_size) + 1)
|
||||
assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}"
|
||||
|
||||
result = run_bench(
|
||||
topology=topo, bench_fn=run,
|
||||
device=resolve_device("all"),
|
||||
engine_factory=lambda t, d: GraphEngine(
|
||||
getattr(t, "topology_obj", t), enable_data=True
|
||||
),
|
||||
)
|
||||
assert result.completion.ok
|
||||
@@ -2,7 +2,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from kernbench.ccl.install import (
|
||||
install_ipcq,
|
||||
linear_rank_to_pe,
|
||||
load_ccl_config,
|
||||
resolve_algorithm_config,
|
||||
@@ -26,28 +25,14 @@ def test_resolve_algorithm_config_default():
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg)
|
||||
assert merged["algorithm"] == cfg["defaults"]["algorithm"]
|
||||
# ccl.yaml no longer carries defaults.world_size — backend derives
|
||||
# it from topology.yaml at install time. Just check the field is
|
||||
# absent here (verified per-test where install_ipcq is called).
|
||||
assert "world_size" not in merged or merged["world_size"] >= 1
|
||||
|
||||
|
||||
def test_resolve_algorithm_config_override():
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_hbm")
|
||||
assert merged["algorithm"] == "ring_allreduce_hbm"
|
||||
assert merged["buffer_kind"] == "hbm" # algo override
|
||||
# defaults still apply
|
||||
assert merged["n_slots"] == cfg["defaults"]["n_slots"]
|
||||
|
||||
|
||||
def test_linear_rank_to_pe():
|
||||
engine, topo = _engine()
|
||||
spec = topo.spec
|
||||
# Cube 0 of SIP 0
|
||||
assert linear_rank_to_pe(0, spec) == (0, 0, 0)
|
||||
assert linear_rank_to_pe(7, spec) == (0, 0, 7)
|
||||
# Should not exceed total PE count
|
||||
pes_per_sip = (
|
||||
spec["sip"]["cube_mesh"]["w"] * spec["sip"]["cube_mesh"]["h"]
|
||||
* spec["cube"]["pe_layout"]["pe_per_corner"]
|
||||
@@ -56,105 +41,3 @@ def test_linear_rank_to_pe():
|
||||
sips = spec["system"]["sips"]["count"]
|
||||
total = sips * pes_per_sip
|
||||
assert total >= 8
|
||||
|
||||
|
||||
def test_install_ipcq_neighbors_correct():
|
||||
engine, topo = _engine()
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
|
||||
# Force a single-cube 8-rank install for the assertions below.
|
||||
merged["world_size"] = 8
|
||||
plan = install_ipcq(engine, topo.spec, merged)
|
||||
|
||||
assert plan["world_size"] == 8
|
||||
assert plan["buffer_kind"] == "tcm"
|
||||
|
||||
# Each rank should have E and W entries
|
||||
for r, nbrs in plan["neighbor_table"].items():
|
||||
assert "E" in nbrs
|
||||
assert "W" in nbrs
|
||||
|
||||
# Inspect installed PE_IPCQ for rank 0
|
||||
ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
||||
qp_e = ipcq.queue_pairs["E"]
|
||||
qp_w = ipcq.queue_pairs["W"]
|
||||
assert qp_e["peer"].pe == 1 # rank 0's E neighbor is rank 1
|
||||
assert qp_w["peer"].pe == 7 # rank 0's W neighbor is rank 7
|
||||
# rx_base addresses should be unique
|
||||
assert qp_e["my_rx_base_pa"] != qp_w["my_rx_base_pa"]
|
||||
|
||||
|
||||
def test_install_ipcq_credit_stores_wired():
|
||||
engine, topo = _engine()
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
|
||||
merged["world_size"] = 8
|
||||
install_ipcq(engine, topo.spec, merged)
|
||||
|
||||
# rank 0 (pe0) sending E goes to rank 1 (pe1)
|
||||
# rank 0's peer_credit_store on E direction should equal rank 1's credit_inbox
|
||||
pe0 = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
||||
pe1 = engine._components["sip0.cube0.pe1.pe_ipcq"]
|
||||
|
||||
qp_e = pe0.queue_pairs["E"]
|
||||
assert qp_e["peer_credit_store"] is pe1.credit_inbox
|
||||
|
||||
|
||||
# ── ADR-0025 D1: reverse_direction opposite-preference ───────────────
|
||||
|
||||
|
||||
def test_reverse_direction_opposite_preference_2rank_ring():
|
||||
"""ADR-0025 D1: In a 2-rank bidirectional ring both E and W point to the
|
||||
same peer; reverse_direction must pick the OPPOSITE direction (W for E,
|
||||
E for W) so rx_base targets the semantically-correct slot.
|
||||
|
||||
Concretely: rank 0 sending via E to rank 1 must target rank 1's W-rx
|
||||
buffer (not rank 1's E-rx), because rank 1's kernel recv(W) reads from
|
||||
its W-rx.
|
||||
"""
|
||||
engine, topo = _engine()
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
|
||||
merged["world_size"] = 2
|
||||
install_ipcq(engine, topo.spec, merged)
|
||||
|
||||
ipcq0 = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
||||
ipcq1 = engine._components["sip0.cube0.pe1.pe_ipcq"]
|
||||
|
||||
rank1_e_rx = ipcq1.queue_pairs["E"]["my_rx_base_pa"]
|
||||
rank1_w_rx = ipcq1.queue_pairs["W"]["my_rx_base_pa"]
|
||||
|
||||
qp0_e = ipcq0.queue_pairs["E"]
|
||||
qp0_w = ipcq0.queue_pairs["W"]
|
||||
|
||||
# rank 0's E entry should target rank 1's W-rx (opposite), NOT rank 1's E-rx.
|
||||
assert qp0_e["peer"].rx_base_pa == rank1_w_rx, (
|
||||
f"expected rank 0's E peer.rx_base_pa == rank 1's W-rx ({rank1_w_rx:#x}), "
|
||||
f"got {qp0_e['peer'].rx_base_pa:#x} (matches E-rx: {rank1_e_rx:#x}) — "
|
||||
f"reverse_direction picked same-label instead of opposite"
|
||||
)
|
||||
# rank 0's W entry should target rank 1's E-rx (opposite).
|
||||
assert qp0_w["peer"].rx_base_pa == rank1_e_rx
|
||||
|
||||
|
||||
def test_reverse_direction_opposite_preference_4rank_ring_sanity():
|
||||
"""ADR-0025 D1 sanity: ws>=3 ring. E and W have distinct peers, so
|
||||
opposite-preference produces same result as old dict-order first-match.
|
||||
This test should PASS both under current and post-fix code.
|
||||
"""
|
||||
engine, topo = _engine()
|
||||
cfg = load_ccl_config()
|
||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
|
||||
merged["world_size"] = 4
|
||||
install_ipcq(engine, topo.spec, merged)
|
||||
|
||||
ipcq0 = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
||||
ipcq1 = engine._components["sip0.cube0.pe1.pe_ipcq"]
|
||||
ipcq3 = engine._components["sip0.cube0.pe3.pe_ipcq"]
|
||||
|
||||
# rank 0 E → rank 1 → rank 1's W-rx
|
||||
qp0_e = ipcq0.queue_pairs["E"]
|
||||
assert qp0_e["peer"].rx_base_pa == ipcq1.queue_pairs["W"]["my_rx_base_pa"]
|
||||
# rank 0 W → rank 3 (last in ring) → rank 3's E-rx
|
||||
qp0_w = ipcq0.queue_pairs["W"]
|
||||
assert qp0_w["peer"].rx_base_pa == ipcq3.queue_pairs["E"]["my_rx_base_pa"]
|
||||
|
||||
@@ -1,83 +0,0 @@
|
||||
"""Tests for the mock CCL runtime (ADR-0023 D15)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.algorithms import ring_allreduce
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def test_ring_allreduce_4_ranks():
|
||||
"""Run the ring all-reduce kernel under the mock runtime, no SimPy."""
|
||||
n_elem = 8
|
||||
inputs = [
|
||||
np.full((n_elem,), float(r + 1), dtype=np.float16)
|
||||
for r in range(4)
|
||||
]
|
||||
expected = sum(inputs) # [10, 10, ..., 10]
|
||||
|
||||
outputs = run_kernel_in_mock(
|
||||
kernel_fn=ring_allreduce.kernel,
|
||||
world_size=4,
|
||||
topology="ring_1d",
|
||||
inputs=inputs,
|
||||
kernel_args=(n_elem, 4),
|
||||
)
|
||||
|
||||
assert len(outputs) == 4
|
||||
for r in range(4):
|
||||
assert np.allclose(outputs[r], expected)
|
||||
|
||||
|
||||
def test_ring_allreduce_8_ranks():
|
||||
n_elem = 16
|
||||
inputs = [
|
||||
np.full((n_elem,), float(r + 1), dtype=np.float16)
|
||||
for r in range(8)
|
||||
]
|
||||
expected = sum(inputs) # [36, 36, ...]
|
||||
|
||||
outputs = run_kernel_in_mock(
|
||||
kernel_fn=ring_allreduce.kernel,
|
||||
world_size=8,
|
||||
topology="ring_1d",
|
||||
inputs=inputs,
|
||||
kernel_args=(n_elem, 8),
|
||||
)
|
||||
for r in range(8):
|
||||
assert np.allclose(outputs[r], expected)
|
||||
|
||||
|
||||
def test_ring_allreduce_random_data():
|
||||
n_elem = 32
|
||||
rng = np.random.default_rng(42)
|
||||
inputs = [rng.standard_normal(n_elem).astype(np.float16) for _ in range(4)]
|
||||
expected = sum(inputs)
|
||||
|
||||
outputs = run_kernel_in_mock(
|
||||
kernel_fn=ring_allreduce.kernel,
|
||||
world_size=4,
|
||||
topology="ring_1d",
|
||||
inputs=inputs,
|
||||
kernel_args=(n_elem, 4),
|
||||
)
|
||||
for r in range(4):
|
||||
assert np.allclose(outputs[r], expected, rtol=1e-2, atol=1e-2)
|
||||
|
||||
|
||||
def test_mock_runtime_invalid_direction_raises():
|
||||
"""A kernel that uses an unsupported direction should raise."""
|
||||
import pytest
|
||||
|
||||
def bad_kernel(t_ptr, n_elem, tl):
|
||||
tl.send(dir="N", src_addr=0, nbytes=2, shape=(1,), dtype="f16", space="hbm")
|
||||
|
||||
inputs = [np.array([1.0], dtype=np.float16) for _ in range(2)]
|
||||
with pytest.raises(Exception):
|
||||
run_kernel_in_mock(
|
||||
kernel_fn=bad_kernel,
|
||||
world_size=2,
|
||||
topology="ring_1d",
|
||||
inputs=inputs,
|
||||
kernel_args=(1,),
|
||||
)
|
||||
@@ -1,85 +0,0 @@
|
||||
"""CCL performance validation tests (ADR-0023 D13 T5).
|
||||
|
||||
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
|
||||
under the rank = SIP TP launcher model (ADR-0024 / ADR-0027). Uses the
|
||||
topology-derived world_size (= 2 in the shipped topology); the latency
|
||||
model is topology-aware, so buffer_kind differences remain visible even
|
||||
at this scale.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
|
||||
def _engine_factory(topology, device):
|
||||
return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True)
|
||||
|
||||
|
||||
def _run_ring(algorithm: str, buffer_kind: str = "tcm") -> float:
|
||||
"""Run a rank = SIP ring all-reduce via the unified bench with a tmp
|
||||
ccl.yaml overlay. Returns simulated kernel total_ns."""
|
||||
import tempfile
|
||||
|
||||
body = f"""\
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: {buffer_kind}
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
{algorithm}:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: {buffer_kind}
|
||||
n_elem: 32
|
||||
"""
|
||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
with open(os.path.join(tmp, "ccl.yaml"), "w") as f:
|
||||
f.write(body)
|
||||
old_cwd = os.getcwd()
|
||||
os.chdir(tmp)
|
||||
try:
|
||||
topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
|
||||
bench_mod = importlib.import_module("benches.ccl_allreduce")
|
||||
result = run_bench(
|
||||
topology=topo, bench_fn=bench_mod.run,
|
||||
device=resolve_device("all"),
|
||||
engine_factory=_engine_factory,
|
||||
)
|
||||
finally:
|
||||
os.chdir(old_cwd)
|
||||
|
||||
assert result.completion.ok, f"{algorithm} did not complete"
|
||||
last_kernel = None
|
||||
for tr in (result.traces or []):
|
||||
if tr.get("phase") == "kernel":
|
||||
last_kernel = tr
|
||||
assert last_kernel is not None, f"{algorithm} produced no kernel trace"
|
||||
return float(last_kernel.get("total_ns", 0.0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
|
||||
def test_ccl_latency_positive(buffer_kind):
|
||||
"""Every buffer kind must produce a positive simulated latency."""
|
||||
algo = f"ring_allreduce_{buffer_kind}"
|
||||
ns = _run_ring(algo, buffer_kind)
|
||||
assert ns > 0
|
||||
|
||||
|
||||
def test_ccl_latency_under_reasonable_bound():
|
||||
"""rank = SIP ring all-reduce (tile=32 f16) should finish well under 1ms."""
|
||||
ns = _run_ring("ring_allreduce_tcm", "tcm")
|
||||
assert ns < 1_000_000 # < 1 ms simulated
|
||||
@@ -0,0 +1,119 @@
|
||||
"""End-to-end distributed test for intercube allreduce.
|
||||
|
||||
Exercises the full process-group path:
|
||||
dist.init_process_group(backend="ahbm")
|
||||
→ mp.spawn(nprocs=n_sips)
|
||||
→ each worker: set_device → allocate → fill → dist.all_reduce → verify
|
||||
|
||||
This is the same flow a real DDP training script would use.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
N_CUBES = 16
|
||||
N_ELEM = 8
|
||||
|
||||
|
||||
def _write_ccl_yaml(tmp_path) -> str:
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: intercube_allreduce
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
intercube_allreduce:
|
||||
module: kernbench.ccl.algorithms.intercube_allreduce
|
||||
topology: none
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
root_cube: 15
|
||||
""")
|
||||
(tmp_path / "ccl.yaml").write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
def _worker(rank: int, n_sips: int, torch) -> None:
|
||||
"""Per-SIP worker: allocate, fill, all_reduce, verify."""
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
torch.ahbm.set_device(rank)
|
||||
|
||||
dp = DPPolicy(
|
||||
cube="row_wise", pe="replicate",
|
||||
num_pes=1, num_cubes=N_CUBES,
|
||||
)
|
||||
tensor = torch.zeros(
|
||||
(N_CUBES, N_ELEM), dtype="f16", dp=dp,
|
||||
name=f"sip{rank}",
|
||||
)
|
||||
|
||||
init_arr = np.full((N_CUBES, N_ELEM), float(rank + 1), dtype=np.float16)
|
||||
tensor.copy_(torch.from_numpy(init_arr))
|
||||
|
||||
print(f"[SIP {rank}] input cube0[:4] = {tensor.numpy()[0][:4].tolist()}")
|
||||
|
||||
torch.distributed.all_reduce(tensor, op="sum")
|
||||
|
||||
arr = tensor.numpy()
|
||||
expected = float(N_CUBES * sum(range(1, n_sips + 1)))
|
||||
|
||||
print(f"[SIP {rank}] output cube0[:4] = {arr[0][:4].tolist()}")
|
||||
print(f"[SIP {rank}] output cube15[:4] = {arr[15][:4].tolist()}")
|
||||
|
||||
for cube_id in range(N_CUBES):
|
||||
assert np.allclose(arr[cube_id], expected, rtol=1e-1, atol=1e-1), (
|
||||
f"SIP{rank} cube {cube_id}: "
|
||||
f"got {arr[cube_id][:4]}, expected {expected}"
|
||||
)
|
||||
|
||||
if rank == 0:
|
||||
print(f"\n intercube_allreduce (ws={n_sips}): "
|
||||
f"{n_sips * N_CUBES} OK")
|
||||
|
||||
|
||||
def test_distributed_intercube_allreduce(tmp_path, monkeypatch):
|
||||
"""Full distributed path: init_process_group → mp.spawn → all_reduce."""
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
monkeypatch.chdir(_write_ccl_yaml(tmp_path))
|
||||
|
||||
topo = resolve_topology(str(TOPOLOGY_PATH))
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
spec = topo.topology_obj.spec
|
||||
n_sips = int(spec["system"]["sips"]["count"])
|
||||
|
||||
with RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="dist_intercube_ar",
|
||||
spec=spec,
|
||||
) as ctx:
|
||||
ctx.distributed.init_process_group(backend="ahbm")
|
||||
|
||||
assert ctx.distributed.get_world_size() == n_sips
|
||||
|
||||
t_start = engine._env.now
|
||||
|
||||
ctx.multiprocessing.spawn(
|
||||
_worker, args=(n_sips, ctx), nprocs=n_sips,
|
||||
)
|
||||
|
||||
t_end = engine._env.now
|
||||
print(f"\n[distributed] sim latency = "
|
||||
f"{t_end - t_start:.1f} ns ({(t_end - t_start) / 1000:.3f} us)")
|
||||
@@ -1,270 +0,0 @@
|
||||
"""ADR-0027 T5: Host-read barrier (D0.5).
|
||||
|
||||
Phase 1: Tensor.numpy / data / __getitem__ / __repr__ / copy_ currently
|
||||
perform MemoryStore operations without barrier logic → tests fail when
|
||||
they assert drain is triggered. Phase 2 injects the barrier.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from greenlet import greenlet
|
||||
|
||||
|
||||
def _make_ctx(topology):
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
|
||||
engine = GraphEngine(topology.topology_obj, enable_data=True)
|
||||
return RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="test_t5",
|
||||
spec=topology.topology_obj.spec,
|
||||
)
|
||||
|
||||
|
||||
# ── T5.g: closed-set registry exists ─────────────────────────────────
|
||||
|
||||
|
||||
def test_host_read_barrier_registry_exists():
|
||||
"""D0.5 T5.g: Tensor module exposes the closed-set registry."""
|
||||
from kernbench.runtime_api import tensor as tensor_mod
|
||||
|
||||
assert hasattr(tensor_mod, "_HOST_READ_BARRIERS"), (
|
||||
"ADR-0027 T5.g: tensor module must declare _HOST_READ_BARRIERS registry"
|
||||
)
|
||||
registry = tensor_mod._HOST_READ_BARRIERS
|
||||
assert isinstance(registry, frozenset)
|
||||
expected = {"numpy", "data", "__getitem__", "__repr__", "copy_"}
|
||||
assert expected.issubset(registry), (
|
||||
f"registry must include {expected}; got {registry}"
|
||||
)
|
||||
|
||||
|
||||
# ── T5.a: numpy() triggers drain when pending non-empty ──────────────
|
||||
|
||||
|
||||
def test_numpy_triggers_drain_when_pending(topology):
|
||||
"""T5.a: launch → numpy() → barrier drains before read (worker context)."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
observed: dict = {"pre_numpy_pending": None, "post_numpy_pending": None}
|
||||
|
||||
def _worker():
|
||||
t = ctx.zeros((1, 8), dtype="f16", dp=dp, name="t5a_t")
|
||||
src = np.full((1, 8), 1.5, dtype=np.float16)
|
||||
t.copy_(ctx.distributed._ctx_ref.from_numpy(src) if False else _hold(ctx, src))
|
||||
# Manually push a dummy handle to simulate pending state; in real
|
||||
# D0.5, numpy will detect and drain.
|
||||
observed["pre_numpy_pending"] = list(ctx._pending_worker_waits)
|
||||
_ = t.numpy()
|
||||
observed["post_numpy_pending"] = list(ctx._pending_worker_waits)
|
||||
|
||||
# Can't actually manufacture pending + test numpy inside worker
|
||||
# without D0.5 implemented — instead, verify the barrier path is
|
||||
# invoked by spying.
|
||||
from kernbench.runtime_api.tensor import Tensor
|
||||
barrier_calls = {"n": 0}
|
||||
|
||||
original_numpy = Tensor.numpy
|
||||
|
||||
def _spy_numpy(self):
|
||||
# After D0.5 is implemented, this wrapper is redundant; the
|
||||
# test just checks numpy was called at all after a pending
|
||||
# operation.
|
||||
barrier_calls["n"] += 1
|
||||
return original_numpy(self)
|
||||
|
||||
Tensor.numpy = _spy_numpy # type: ignore[assignment]
|
||||
try:
|
||||
ctx.multiprocessing.spawn(_mk_worker_numpy, args=(ctx,), nprocs=1)
|
||||
finally:
|
||||
Tensor.numpy = original_numpy # type: ignore[assignment]
|
||||
|
||||
assert barrier_calls["n"] >= 1
|
||||
|
||||
|
||||
def _hold(ctx, arr):
|
||||
"""helper (unused branch)."""
|
||||
import numpy as _np
|
||||
t = type("X", (), {})()
|
||||
t.numpy = lambda self=None: arr
|
||||
return t
|
||||
|
||||
|
||||
def _mk_worker_numpy(rank, ctx):
|
||||
"""Worker that calls numpy after a tensor deploy. Triggers barrier."""
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
t = ctx.zeros((1, 8), dtype="f16", dp=dp, name=f"t5_r{rank}")
|
||||
_ = t.numpy()
|
||||
|
||||
|
||||
# ── T5.b: metadata access does NOT drain ─────────────────────────────
|
||||
|
||||
|
||||
def test_metadata_access_is_non_barrier(topology):
|
||||
"""T5.b: .shape / .dtype / .name do NOT trigger drain."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.runtime_api import tensor as tensor_mod
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
t = ctx.zeros((1, 8), dtype="f16", dp=dp, name="t5b")
|
||||
|
||||
# Populate pending queue artificially (simulate worker state).
|
||||
ctx._pending_worker_waits.append("fake_handle_that_must_not_drain")
|
||||
|
||||
_ = t.shape
|
||||
_ = t.dtype
|
||||
_ = t.name
|
||||
|
||||
assert "fake_handle_that_must_not_drain" in ctx._pending_worker_waits, (
|
||||
"T5.b: metadata accessors must not drain pending queue"
|
||||
)
|
||||
ctx._pending_worker_waits.clear()
|
||||
|
||||
|
||||
# ── T5.c: empty pending → numpy is fast-path (no yield) ──────────────
|
||||
|
||||
|
||||
def test_numpy_fast_path_when_pending_empty(topology):
|
||||
"""T5.c: numpy() with empty pending queue does not yield to main."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
|
||||
def _worker(rank: int):
|
||||
t = ctx.zeros((1, 4), dtype="f16", dp=dp, name=f"t5c_r{rank}")
|
||||
# At this point, after worker's own wait(s), pending should be empty.
|
||||
assert ctx._pending_worker_waits == [], (
|
||||
"after worker's deploy, pending queue should be drained"
|
||||
)
|
||||
# numpy call should be fast-path (no yield).
|
||||
_ = t.numpy()
|
||||
|
||||
ctx.multiprocessing.spawn(_worker, args=(), nprocs=1)
|
||||
|
||||
|
||||
# ── T5.d: __getitem__ / data also barriers ───────────────────────────
|
||||
|
||||
|
||||
def test_getitem_and_data_are_barriers(topology):
|
||||
"""T5.d: __getitem__ and .data property behave like numpy() barrier."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
|
||||
def _worker(rank: int):
|
||||
t = ctx.zeros((1, 8), dtype="f16", dp=dp, name=f"t5d_r{rank}")
|
||||
# host src copied in (forces write path)
|
||||
src = np.full((1, 8), float(rank + 1), dtype=np.float16)
|
||||
from kernbench.runtime_api.tensor import Tensor
|
||||
h = Tensor(shape=src.shape, dtype="f16", name="host")
|
||||
h._host_buffer = src
|
||||
t.copy_(h)
|
||||
# Read access via __getitem__ and .data: both must fully materialize.
|
||||
slice_val = t[0, 0:4]
|
||||
data_val = t.data
|
||||
assert slice_val.shape[0] == 4
|
||||
assert data_val.shape == (1, 8)
|
||||
|
||||
ctx.multiprocessing.spawn(_worker, args=(), nprocs=2)
|
||||
|
||||
|
||||
# ── T5.e: collective pending also drained by barrier ────────────────
|
||||
|
||||
|
||||
def test_numpy_drains_collective_pending(topology, tmp_path, monkeypatch):
|
||||
"""T5.e: numpy() after all_reduce must see post-reduce data.
|
||||
|
||||
Note: in the current model, ``all_reduce`` itself yields to main so the
|
||||
collective is drained before the worker resumes; barriers at
|
||||
``numpy()`` intentionally do NOT drain collective pending (would cause
|
||||
cross-rank deadlock — see ``_host_read_barrier`` docstring). What this
|
||||
test asserts is the observable contract: post-``all_reduce`` +
|
||||
``numpy()`` sees the reduced values.
|
||||
"""
|
||||
import textwrap
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: ring_allreduce_tcm
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
""")
|
||||
(tmp_path / "ccl.yaml").write_text(body)
|
||||
monkeypatch.chdir(str(tmp_path))
|
||||
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
|
||||
def _worker(rank: int, ws: int):
|
||||
ctx.ahbm.set_device(rank)
|
||||
t = ctx.zeros((1, 8), dtype="f16", dp=dp, name=f"t5e_r{rank}")
|
||||
src = np.full((1, 8), float(rank + 1), dtype=np.float16)
|
||||
from kernbench.runtime_api.tensor import Tensor
|
||||
h = Tensor(shape=src.shape, dtype="f16", name="host")
|
||||
h._host_buffer = src
|
||||
t.copy_(h)
|
||||
ctx.distributed.all_reduce(t, op="sum")
|
||||
# numpy() must see the reduced values even without explicit wait.
|
||||
out = t.numpy()
|
||||
expected = float(sum(range(1, ws + 1)))
|
||||
# Tolerance loose for fp16 accumulation.
|
||||
assert np.allclose(out, expected, rtol=1e-1, atol=1e-1), (
|
||||
f"rank {rank}: expected {expected}, got {out}"
|
||||
)
|
||||
|
||||
ctx.distributed.init_process_group(backend="ahbm")
|
||||
ws = ctx.distributed.get_world_size()
|
||||
ctx.multiprocessing.spawn(_worker, args=(ws,), nprocs=ws)
|
||||
|
||||
|
||||
# ── T5.f: copy_ target-side write barrier ────────────────────────────
|
||||
|
||||
|
||||
def test_copy_from_deployed_source_drains_source(topology):
|
||||
"""T5.f (revised): ``copy_(source)`` drains source-side pending via the
|
||||
``source.numpy()`` read barrier.
|
||||
|
||||
Note: the ADR originally specified a target-side write barrier as well,
|
||||
but that was removed because global-pending target barrier can cause
|
||||
cross-rank deadlock when another rank has a pending collective. Source-
|
||||
side read barrier is preserved and sufficient for the common pattern
|
||||
``target.copy_(deployed_source)``.
|
||||
"""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.tensor import Tensor
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
|
||||
def _worker(rank: int):
|
||||
# Deployed source — its .numpy() will trigger the read barrier.
|
||||
source = ctx.zeros((1, 8), dtype="f16", dp=dp, name=f"src_r{rank}")
|
||||
target = ctx.zeros((1, 8), dtype="f16", dp=dp, name=f"tgt_r{rank}")
|
||||
target.copy_(source)
|
||||
# Smoke: no hang, no exception. numpy round-trip sees zeros.
|
||||
out = target.numpy()
|
||||
assert out.shape == (1, 8)
|
||||
|
||||
ctx.multiprocessing.spawn(_worker, args=(), nprocs=1)
|
||||
@@ -0,0 +1,113 @@
|
||||
"""Tests for configure_sfr_intercube_multisip neighbor table wiring.
|
||||
|
||||
Verifies that IPCQ neighbor tables are correctly installed for
|
||||
intercube (pe0, 4×4 mesh N/S/E/W) + inter-SIP (pe0, all cubes,
|
||||
global_E/global_W) communication.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
N_CUBES = 16
|
||||
|
||||
|
||||
def _engine_and_spec():
|
||||
topo = resolve_topology(str(TOPOLOGY_PATH))
|
||||
engine = GraphEngine(topo.topology_obj, enable_data=True)
|
||||
return engine, topo.topology_obj.spec
|
||||
|
||||
|
||||
def _merged_cfg():
|
||||
cfg = load_ccl_config()
|
||||
return resolve_algorithm_config(cfg, name="intercube_allreduce")
|
||||
|
||||
|
||||
class TestConfigureSfrNeighborTables:
|
||||
def test_world_size_and_rank_to_pe(self):
|
||||
engine, spec = _engine_and_spec()
|
||||
cfg = _merged_cfg()
|
||||
plan = configure_sfr_intercube_multisip(engine, spec, cfg)
|
||||
|
||||
n_sips = int(spec["system"]["sips"]["count"])
|
||||
assert plan["world_size"] == n_sips * N_CUBES
|
||||
assert len(plan["rank_to_pe"]) == n_sips * N_CUBES
|
||||
for pe_idx, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
|
||||
assert pe == 0, f"pe_idx {pe_idx}: pe must be 0, got {pe}"
|
||||
|
||||
def test_corner_cube0_has_E_and_S_only(self):
|
||||
"""Cube 0 (row=0, col=0) is NW corner: only E and S neighbors."""
|
||||
engine, spec = _engine_and_spec()
|
||||
cfg = _merged_cfg()
|
||||
configure_sfr_intercube_multisip(engine, spec, cfg)
|
||||
|
||||
ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
||||
qp = ipcq.queue_pairs
|
||||
assert "E" in qp, "cube 0 must have E neighbor"
|
||||
assert "S" in qp, "cube 0 must have S neighbor"
|
||||
assert "W" not in qp, "cube 0 (col=0) must NOT have W neighbor"
|
||||
assert "N" not in qp, "cube 0 (row=0) must NOT have N neighbor"
|
||||
assert qp["E"]["peer"].cube == 1
|
||||
assert qp["S"]["peer"].cube == 4
|
||||
|
||||
def test_interior_cube5_has_all_four(self):
|
||||
"""Cube 5 (row=1, col=1) is interior: N/S/E/W all present."""
|
||||
engine, spec = _engine_and_spec()
|
||||
cfg = _merged_cfg()
|
||||
configure_sfr_intercube_multisip(engine, spec, cfg)
|
||||
|
||||
ipcq = engine._components["sip0.cube5.pe0.pe_ipcq"]
|
||||
qp = ipcq.queue_pairs
|
||||
assert qp["N"]["peer"].cube == 1
|
||||
assert qp["S"]["peer"].cube == 9
|
||||
assert qp["E"]["peer"].cube == 6
|
||||
assert qp["W"]["peer"].cube == 4
|
||||
|
||||
def test_root_cube15_has_inter_sip(self):
|
||||
"""Cube 15 (root, SE corner) has N, W + global_E/global_W."""
|
||||
engine, spec = _engine_and_spec()
|
||||
cfg = _merged_cfg()
|
||||
configure_sfr_intercube_multisip(engine, spec, cfg)
|
||||
|
||||
ipcq0 = engine._components["sip0.cube15.pe0.pe_ipcq"]
|
||||
qp0 = ipcq0.queue_pairs
|
||||
assert "N" in qp0
|
||||
assert "W" in qp0
|
||||
assert "E" not in qp0, "cube 15 (col=3) must NOT have E"
|
||||
assert "S" not in qp0, "cube 15 (row=3) must NOT have S"
|
||||
assert "global_E" in qp0, "root cube must have global_E"
|
||||
assert "global_W" in qp0, "root cube must have global_W"
|
||||
assert qp0["global_E"]["peer"].sip == 1
|
||||
assert qp0["global_E"]["peer"].cube == 15
|
||||
|
||||
ipcq1 = engine._components["sip1.cube15.pe0.pe_ipcq"]
|
||||
qp1 = ipcq1.queue_pairs
|
||||
assert qp1["global_E"]["peer"].sip == 0
|
||||
assert qp1["global_E"]["peer"].cube == 15
|
||||
|
||||
def test_all_cubes_have_inter_sip(self):
|
||||
"""ALL cubes (not just root) are wired for inter-SIP."""
|
||||
engine, spec = _engine_and_spec()
|
||||
cfg = _merged_cfg()
|
||||
configure_sfr_intercube_multisip(engine, spec, cfg)
|
||||
|
||||
root_cube = int(cfg.get("root_cube", N_CUBES - 1))
|
||||
for cube_id in range(N_CUBES):
|
||||
ipcq = engine._components[f"sip0.cube{cube_id}.pe0.pe_ipcq"]
|
||||
qp = ipcq.queue_pairs
|
||||
assert "global_E" in qp, (
|
||||
f"sip0.cube{cube_id}.pe0 missing global_E"
|
||||
)
|
||||
assert "global_W" in qp, (
|
||||
f"sip0.cube{cube_id}.pe0 missing global_W"
|
||||
)
|
||||
if cube_id == root_cube:
|
||||
assert qp["global_E"]["peer"].sip != 0, (
|
||||
f"root cube {root_cube} global_E must point to another SIP"
|
||||
)
|
||||
@@ -1,178 +0,0 @@
|
||||
"""ADR-0027 T4: torch.multiprocessing.spawn semantics.
|
||||
|
||||
Phase 1: imports `ctx.multiprocessing.spawn` which doesn't exist yet —
|
||||
tests fail. Phase 2 (D1) lands the namespace + _MultiprocessingNamespace
|
||||
+ SpawnException, and these pass.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
from greenlet import greenlet
|
||||
|
||||
|
||||
def _write_minimal_ccl_yaml(tmp_path) -> str:
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: ring_allreduce_tcm
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
""")
|
||||
yaml_path = tmp_path / "ccl.yaml"
|
||||
yaml_path.write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
def _make_ctx(topology):
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
|
||||
engine = GraphEngine(topology.topology_obj, enable_data=True)
|
||||
return RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="test_t4",
|
||||
spec=topology.topology_obj.spec,
|
||||
)
|
||||
|
||||
|
||||
# ── D1.3 namespace attach ────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_multiprocessing_namespace_attached(topology):
|
||||
"""RuntimeContext.__post_init__ attaches ctx.multiprocessing (D1.3)."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
assert hasattr(ctx, "multiprocessing"), (
|
||||
"ADR-0027 D1.3: ctx.multiprocessing must exist"
|
||||
)
|
||||
assert hasattr(ctx.multiprocessing, "spawn"), (
|
||||
"ctx.multiprocessing must expose a spawn(fn, args, nprocs) method"
|
||||
)
|
||||
|
||||
|
||||
# ── D1.1 / D1.2: spawn shape + rank binding ──────────────────────────
|
||||
|
||||
|
||||
def test_spawn_invokes_fn_once_per_rank(topology):
|
||||
"""spawn(fn, args, nprocs) calls fn(rank, *args) once for each rank."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
calls: list[tuple[int, tuple]] = []
|
||||
|
||||
def _worker(rank: int, world_size: int) -> None:
|
||||
calls.append((rank, (world_size,)))
|
||||
|
||||
ctx.multiprocessing.spawn(_worker, args=(3,), nprocs=3)
|
||||
|
||||
assert sorted(r for r, _ in calls) == [0, 1, 2]
|
||||
for _, (ws,) in calls:
|
||||
assert ws == 3
|
||||
|
||||
|
||||
def test_spawn_binds_greenlet_local_rank(topology):
|
||||
"""Inside the worker, torch.distributed.get_rank() returns the rank
|
||||
bound to the greenlet (ADR-0024 D9 + D1.2)."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
# Distributed context needs to be initialised so get_rank is valid.
|
||||
# For T4 we don't run a real collective; just check rank lookup.
|
||||
observed: list[tuple[int, int]] = []
|
||||
|
||||
def _worker(rank: int):
|
||||
g = greenlet.getcurrent()
|
||||
bound = ctx.distributed._rank_by_greenlet.get(g)
|
||||
observed.append((rank, bound))
|
||||
|
||||
ctx.multiprocessing.spawn(_worker, args=(), nprocs=2)
|
||||
|
||||
for rank, bound in observed:
|
||||
assert rank == bound, (
|
||||
f"rank {rank} must be bound to greenlet-local rank {rank}; "
|
||||
f"got {bound}"
|
||||
)
|
||||
|
||||
|
||||
# ── D1.2 exception cleanup ───────────────────────────────────────────
|
||||
|
||||
|
||||
def test_spawn_exception_raises_spawn_exception_with_root_cause(topology):
|
||||
"""D0.4-(4): worker raise → siblings SystemExit + SpawnException(errors)."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.runtime_api.multiprocessing import SpawnException
|
||||
|
||||
def _worker(rank: int):
|
||||
if rank == 1:
|
||||
raise ValueError(f"rank {rank} boom")
|
||||
|
||||
with pytest.raises(SpawnException) as exc_info:
|
||||
ctx.multiprocessing.spawn(_worker, args=(), nprocs=3)
|
||||
|
||||
# Root cause rank is captured.
|
||||
assert 1 in exc_info.value.errors
|
||||
assert isinstance(exc_info.value.errors[1], ValueError)
|
||||
|
||||
|
||||
def test_spawn_exception_clears_pending_queues(topology):
|
||||
"""D0.4-(4): on raise, _pending_worker_waits and collective queue clear."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.runtime_api.multiprocessing import SpawnException
|
||||
|
||||
def _worker(rank: int):
|
||||
raise RuntimeError("fail")
|
||||
|
||||
with pytest.raises(SpawnException):
|
||||
ctx.multiprocessing.spawn(_worker, args=(), nprocs=2)
|
||||
|
||||
assert ctx._pending_worker_waits == []
|
||||
|
||||
|
||||
# ── D1.4 migration compat: ccl_allreduce runs via mp.spawn ───────────
|
||||
|
||||
|
||||
def test_ccl_allreduce_hand_rolled_loop_replaced_by_mp_spawn(
|
||||
topology, tmp_path, monkeypatch, spec,
|
||||
):
|
||||
"""D1.4: benches/ccl_allreduce.py's hand-rolled greenlet loop must still
|
||||
produce correct behaviour after migration to torch.multiprocessing.spawn.
|
||||
|
||||
Minimal smoke — just that ``bench.run(ctx)`` completes without the
|
||||
loop short-circuiting or leaving pending queues dirty.
|
||||
"""
|
||||
monkeypatch.chdir(_write_minimal_ccl_yaml(tmp_path))
|
||||
import benches.ccl_allreduce as bench
|
||||
|
||||
calls: list[tuple[int, int]] = []
|
||||
|
||||
def _fake_worker(rank, cfg, torch):
|
||||
calls.append((rank, cfg.world_size))
|
||||
|
||||
monkeypatch.setattr(bench, "_worker", _fake_worker)
|
||||
|
||||
with _make_ctx(topology) as ctx:
|
||||
bench.run(ctx)
|
||||
|
||||
expected_ws = int(spec["system"]["sips"]["count"])
|
||||
ranks = sorted(r for r, _ in calls)
|
||||
assert ranks == list(range(expected_ws))
|
||||
assert ctx._pending_worker_waits == []
|
||||
|
||||
|
||||
# ── _drain_pending function is exported ──────────────────────────────
|
||||
|
||||
|
||||
def test_drain_pending_exported():
|
||||
"""D0.4: _drain_pending must be importable from runtime_api.multiprocessing."""
|
||||
from kernbench.runtime_api.multiprocessing import _drain_pending
|
||||
assert callable(_drain_pending)
|
||||
@@ -1,80 +0,0 @@
|
||||
"""Tests for recv_mode='copy_to_dst' (ADR-0023 D9.5)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def test_recv_copy_to_dst_via_simpy_runner():
|
||||
"""Run a kernel that uses tl.recv(..., dst_addr=..., dst_space=...).
|
||||
Verify the data is moved to the dst location after recv.
|
||||
"""
|
||||
import importlib
|
||||
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
from kernbench.common.pe_commands import TensorHandle
|
||||
|
||||
def kernel(t_ptr, n_elem, dst_buf_addr, tl):
|
||||
rank = tl.program_id(axis=0)
|
||||
ws = tl.num_programs(axis=0)
|
||||
nbytes = n_elem * 2
|
||||
# Each PE sends own data, then recv into a custom dst slot
|
||||
current = TensorHandle(
|
||||
id="loc", addr=t_ptr + rank * nbytes,
|
||||
shape=(n_elem,), dtype="f16",
|
||||
nbytes=nbytes, data=None, space="hbm",
|
||||
)
|
||||
tl.send(dir="E", src=current)
|
||||
# copy_to_dst: move into a per-rank scratch HBM addr
|
||||
recv = tl.recv(
|
||||
dir="W", shape=(n_elem,), dtype="f16",
|
||||
dst_addr=dst_buf_addr + rank * nbytes,
|
||||
dst_space="hbm",
|
||||
)
|
||||
# Sanity: recv handle should now point to our dst addr
|
||||
assert recv.addr == dst_buf_addr + rank * nbytes
|
||||
assert recv.space == "hbm"
|
||||
|
||||
topo = resolve_topology("topology.yaml")
|
||||
|
||||
def run(torch):
|
||||
plan = torch.install_ipcq(
|
||||
algorithm="ring_allreduce_tcm", world_size_override=8,
|
||||
)
|
||||
a = torch.zeros(
|
||||
(1, 8 * 8),
|
||||
dtype="f16",
|
||||
dp=DPPolicy(
|
||||
cube="replicate", pe="column_wise",
|
||||
num_cubes=1,
|
||||
),
|
||||
name="copy_in",
|
||||
)
|
||||
store = torch.engine.memory_store
|
||||
base = a._handle.va_base or a._handle.shards[0].pa
|
||||
nbytes = 8 * 2
|
||||
for r in range(8):
|
||||
store.write("hbm", base + r * nbytes,
|
||||
np.full((8,), float(r + 1), dtype=np.float16))
|
||||
|
||||
# Use a separate dst region (synthetic addresses)
|
||||
dst_buf = 0xC0FFEE_0000
|
||||
torch.launch("ring_allreduce_tcm", kernel, a, 8, dst_buf)
|
||||
|
||||
# After the kernel, dst_buf + r*16 should contain rank (r-1)%8's data
|
||||
for r in range(8):
|
||||
arr = store.read("hbm", dst_buf + r * nbytes, shape=(8,), dtype="f16")
|
||||
expected = float(((r - 1) % 8) + 1)
|
||||
assert np.allclose(arr, expected), f"rank {r}: got {arr}, expected {expected}"
|
||||
|
||||
result = run_bench(
|
||||
topology=topo, bench_fn=run,
|
||||
device=resolve_device("all"),
|
||||
engine_factory=lambda t, d: GraphEngine(
|
||||
getattr(t, "topology_obj", t), enable_data=True
|
||||
),
|
||||
)
|
||||
assert result.completion.ok
|
||||
@@ -1,106 +0,0 @@
|
||||
"""Tests for tl.recv_async + tl.wait (ADR-0023 D4)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def kernel_async_recv(t_ptr, n_elem, tl):
|
||||
"""Each PE issues recv_async first, then send, then wait — this exercises
|
||||
the non-blocking path. Uses TensorHandle math (PE_MATH) for accumulation
|
||||
so Phase 2 produces correct final HBM contents."""
|
||||
rank = tl.program_id(axis=0)
|
||||
world_size = tl.num_programs(axis=0)
|
||||
nbytes = n_elem * 2
|
||||
|
||||
pe_addr = t_ptr + rank * nbytes
|
||||
acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
|
||||
current = acc
|
||||
|
||||
for _step in range(world_size - 1):
|
||||
future = tl.recv_async(dir="W", shape=(n_elem,), dtype="f16")
|
||||
tl.send(dir="E", src=current)
|
||||
recv = tl.wait(future)
|
||||
acc = acc + recv
|
||||
current = recv # forward W's tile to E next round
|
||||
|
||||
tl.store(pe_addr, acc)
|
||||
|
||||
|
||||
def test_recv_async_mock_runtime():
|
||||
n_elem = 8
|
||||
inputs = [
|
||||
np.full((n_elem,), float(r + 1), dtype=np.float16)
|
||||
for r in range(4)
|
||||
]
|
||||
expected = sum(inputs)
|
||||
|
||||
outputs = run_kernel_in_mock(
|
||||
kernel_fn=kernel_async_recv,
|
||||
world_size=4,
|
||||
topology="ring_1d",
|
||||
inputs=inputs,
|
||||
kernel_args=(n_elem,),
|
||||
)
|
||||
for r in range(4):
|
||||
assert np.allclose(outputs[r], expected)
|
||||
|
||||
|
||||
def test_recv_async_simpy_runner():
|
||||
"""Run the async kernel through the real SimPy stack via the
|
||||
install_ipcq + launch path.
|
||||
"""
|
||||
import importlib
|
||||
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
# Re-use the standard 8-PE bench skeleton but swap in the async kernel.
|
||||
topo = resolve_topology("topology.yaml")
|
||||
|
||||
# Build a tiny inline bench module
|
||||
import types
|
||||
mod = types.ModuleType("inline_bench_async")
|
||||
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
def run(torch):
|
||||
plan = torch.install_ipcq(
|
||||
algorithm="ring_allreduce_tcm", world_size_override=8,
|
||||
)
|
||||
a = torch.zeros(
|
||||
(1, 8 * 8),
|
||||
dtype="f16",
|
||||
dp=DPPolicy(
|
||||
cube="replicate", pe="column_wise",
|
||||
num_cubes=1,
|
||||
),
|
||||
name="async_in",
|
||||
)
|
||||
store = torch.engine.memory_store
|
||||
base = a._handle.va_base or a._handle.shards[0].pa
|
||||
nbytes = 8 * 2
|
||||
for r in range(8):
|
||||
store.write("hbm", base + r * nbytes,
|
||||
np.full((8,), float(r + 1), dtype=np.float16))
|
||||
|
||||
torch.launch("ring_allreduce_tcm", kernel_async_recv, a, 8)
|
||||
|
||||
for r in range(8):
|
||||
result = store.read("hbm", base + r * nbytes, shape=(8,), dtype="f16")
|
||||
expected = float(sum(range(1, 9))) # 36
|
||||
assert np.allclose(result, expected, rtol=1e-2, atol=1e-2), \
|
||||
f"rank {r}: got {result}, expected {expected}"
|
||||
|
||||
mod.run = run
|
||||
result = run_bench(
|
||||
topology=topo, bench_fn=mod.run,
|
||||
device=resolve_device("all"),
|
||||
engine_factory=lambda t, d: GraphEngine(
|
||||
getattr(t, "topology_obj", t), enable_data=True
|
||||
),
|
||||
)
|
||||
assert result.completion.ok
|
||||
@@ -1,301 +0,0 @@
|
||||
"""ADR-0027 T3: Worker-wait generalization + orphan invariant.
|
||||
|
||||
Direct regression guard for ADR-0024 Phase B's kernel-greenlet orphan bug.
|
||||
Phase 1 of ADR-0027: these tests fail against the current code (no
|
||||
``_pending_worker_waits`` field, no worker-fork in ``ctx.wait``, no
|
||||
scheduler drain). Phase 2 implements D0.1/D0.2/D0.4 and these pass.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
from greenlet import greenlet
|
||||
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _write_minimal_ccl_yaml(tmp_path) -> str:
|
||||
body = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: ring_allreduce_tcm
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
n_elem: 8
|
||||
""")
|
||||
yaml_path = tmp_path / "ccl.yaml"
|
||||
yaml_path.write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
def _make_ctx(topology):
|
||||
from kernbench.runtime_api.context import RuntimeContext
|
||||
from kernbench.runtime_api.types import DeviceSelector
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
|
||||
engine = GraphEngine(topology.topology_obj, enable_data=True)
|
||||
return RuntimeContext(
|
||||
engine=engine,
|
||||
target_device=DeviceSelector("all"),
|
||||
correlation_id="test_t3",
|
||||
spec=topology.topology_obj.spec,
|
||||
)
|
||||
|
||||
|
||||
# ── D0.1: _pending_worker_waits field exists ─────────────────────────
|
||||
|
||||
|
||||
def test_pending_worker_waits_field_present(topology):
|
||||
"""RuntimeContext must expose the deferred-wait queue (D0.1)."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
assert hasattr(ctx, "_pending_worker_waits"), (
|
||||
"ADR-0027 D0.1: RuntimeContext must declare _pending_worker_waits"
|
||||
)
|
||||
assert ctx._pending_worker_waits == [], (
|
||||
"_pending_worker_waits should start empty"
|
||||
)
|
||||
|
||||
|
||||
# ── T3.a / T3.b: wait defers + resume-after-drain contract ───────────
|
||||
|
||||
|
||||
def test_wait_in_worker_defers_to_main_and_resumes_completed(topology):
|
||||
"""T3.a + T3.b: worker ctx.wait enqueues + yields; resume → _completed.
|
||||
|
||||
Direct test of D0.2 (worker-fork) + D0.3 resume invariant (handle must
|
||||
be in ctx._completed when worker resumes).
|
||||
"""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
# Worker that submits one tensor (which internally calls ctx.wait)
|
||||
# and records the pending-queue state observed before/after.
|
||||
observations: dict = {"pre_wait_len": None, "post_resume_completed": None}
|
||||
|
||||
main = greenlet.getcurrent()
|
||||
|
||||
def _worker():
|
||||
# Observation hook: patch ctx.wait to capture a single deferral.
|
||||
original_wait = ctx.wait
|
||||
|
||||
def wrapping_wait(h, *, _meta=None):
|
||||
observations["pre_wait_len"] = len(ctx._pending_worker_waits)
|
||||
result = original_wait(h, _meta=_meta)
|
||||
observations["post_resume_completed"] = h in ctx._completed
|
||||
return result
|
||||
|
||||
ctx.wait = wrapping_wait # type: ignore[assignment]
|
||||
try:
|
||||
ctx.zeros(
|
||||
(1, 8), dtype="f16",
|
||||
dp=DPPolicy(cube="replicate", pe="replicate",
|
||||
num_cubes=1, num_pes=1),
|
||||
name="t3_defer",
|
||||
)
|
||||
finally:
|
||||
ctx.wait = original_wait # type: ignore[assignment]
|
||||
|
||||
g = greenlet(_worker)
|
||||
|
||||
# Scheduler loop: run worker until it yields (or finishes), then drain.
|
||||
while not g.dead:
|
||||
g.switch()
|
||||
if not g.dead:
|
||||
# Worker yielded mid-wait → simulate D0.4 drain.
|
||||
from kernbench.runtime_api.multiprocessing import _drain_pending
|
||||
_drain_pending(ctx)
|
||||
|
||||
assert observations["pre_wait_len"] is not None, "wait was not invoked"
|
||||
assert observations["post_resume_completed"] is True, (
|
||||
"D0.3 resume invariant: handle must be in ctx._completed on resume"
|
||||
)
|
||||
|
||||
|
||||
# ── T3.c: multi-worker same-round drain ──────────────────────────────
|
||||
|
||||
|
||||
def test_multiple_workers_resume_at_same_drain(topology):
|
||||
"""T3.c: every worker yields before any drain; all resume together."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
observations: list[int] = []
|
||||
|
||||
def _make_worker(rank: int):
|
||||
def _entry():
|
||||
# Before its wait, observe queue state so we can assert that
|
||||
# *every* worker has enqueued before any drain happened.
|
||||
ctx.zeros((1, 4), dtype="f16", dp=dp, name=f"r{rank}")
|
||||
observations.append(rank)
|
||||
return _entry
|
||||
|
||||
ws = 2
|
||||
gs = [greenlet(_make_worker(r)) for r in range(ws)]
|
||||
|
||||
# Round 1: every worker runs up to its first (deferred) ctx.wait.
|
||||
for g in gs:
|
||||
g.switch()
|
||||
|
||||
# After round 1, all workers should be paused (not yet dead) and
|
||||
# each should have enqueued at least one handle.
|
||||
assert all(not g.dead for g in gs), (
|
||||
"after round 1 switch, workers must be paused mid-wait, not dead"
|
||||
)
|
||||
assert len(ctx._pending_worker_waits) >= ws, (
|
||||
f"expected >= {ws} pending worker waits after round 1; "
|
||||
f"got {len(ctx._pending_worker_waits)}"
|
||||
)
|
||||
|
||||
# Loop: drain + switch rounds until all workers complete. A single
|
||||
# ctx.zeros() call contains multiple yield points (MmuMap, then
|
||||
# MemoryWrite), so more than one round is needed.
|
||||
from kernbench.runtime_api.multiprocessing import _drain_pending
|
||||
rounds = 0
|
||||
while any(not g.dead for g in gs):
|
||||
_drain_pending(ctx)
|
||||
for g in gs:
|
||||
if not g.dead:
|
||||
g.switch()
|
||||
rounds += 1
|
||||
assert rounds < 20, "scheduler did not converge within 20 rounds"
|
||||
|
||||
assert all(g.dead for g in gs), "all workers should be dead after drain loop"
|
||||
assert sorted(observations) == list(range(ws))
|
||||
|
||||
|
||||
# ── T3.d (핵심): kernel greenlet _parent is main ─────────────────────
|
||||
|
||||
|
||||
def test_kernel_greenlet_parent_is_main(topology, tmp_path, monkeypatch):
|
||||
"""T3.d orphan invariant: kernel_runner._parent must be main greenlet.
|
||||
|
||||
This is the direct regression guard for ADR-0024 Phase B. Runs a worker
|
||||
that invokes torch.launch (which eventually spawns a kernel greenlet).
|
||||
The kernel_runner.run() captures greenlet.getcurrent() as _parent at
|
||||
spawn time — that value MUST be the main greenlet, else the orphan
|
||||
bug is back.
|
||||
"""
|
||||
monkeypatch.chdir(_write_minimal_ccl_yaml(tmp_path))
|
||||
|
||||
from kernbench.triton_emu import kernel_runner as kr_mod
|
||||
captured_parents: list = []
|
||||
main = greenlet.getcurrent()
|
||||
|
||||
original_run = kr_mod.KernelRunner.run
|
||||
|
||||
def _spy_run(self, env, kernel_fn, kernel_args, num_programs):
|
||||
gen = original_run(self, env, kernel_fn, kernel_args, num_programs)
|
||||
|
||||
def _wrapping_gen():
|
||||
# yield from gen, but capture self._parent on first step
|
||||
try:
|
||||
value = next(gen)
|
||||
# First yield happens after _parent is set.
|
||||
captured_parents.append(self._parent)
|
||||
yield value
|
||||
except StopIteration:
|
||||
return
|
||||
yield from gen
|
||||
|
||||
return _wrapping_gen()
|
||||
|
||||
monkeypatch.setattr(kr_mod.KernelRunner, "run", _spy_run)
|
||||
|
||||
# Drive a minimal ring_allreduce that launches a kernel inside a worker.
|
||||
import benches.ccl_allreduce as bench
|
||||
|
||||
with _make_ctx(topology) as ctx:
|
||||
bench.run(ctx)
|
||||
|
||||
assert captured_parents, "no kernel_runner.run invocations observed"
|
||||
for p in captured_parents:
|
||||
assert p is main, (
|
||||
f"ADR-0027 D0.7 / T3.d: kernel greenlet _parent must be main "
|
||||
f"greenlet; got {p!r} (main={main!r})"
|
||||
)
|
||||
|
||||
|
||||
# ── T3.f: idempotency ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_wait_same_handle_twice_drives_engine_once(topology):
|
||||
"""T3.f: ctx.wait(h) + ctx.wait(h) → engine.wait called once (D0.4-(3))."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
|
||||
dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
|
||||
call_count = {"n": 0}
|
||||
original_engine_wait = ctx.engine.wait
|
||||
|
||||
def _counting_wait(h):
|
||||
call_count["n"] += 1
|
||||
return original_engine_wait(h)
|
||||
|
||||
ctx.engine.wait = _counting_wait # type: ignore[assignment]
|
||||
|
||||
def _worker():
|
||||
ctx.zeros((1, 4), dtype="f16", dp=dp, name="t3f")
|
||||
# Manually pick a completed handle and wait twice.
|
||||
assert ctx._completed, "there should be at least one completed handle"
|
||||
h = next(iter(ctx._completed))
|
||||
before = call_count["n"]
|
||||
ctx.wait(h)
|
||||
ctx.wait(h)
|
||||
assert call_count["n"] == before, (
|
||||
"already-completed handle must not re-drive engine.wait"
|
||||
)
|
||||
|
||||
g = greenlet(_worker)
|
||||
while not g.dead:
|
||||
g.switch()
|
||||
if not g.dead:
|
||||
from kernbench.runtime_api.multiprocessing import _drain_pending
|
||||
_drain_pending(ctx)
|
||||
|
||||
|
||||
# ── T3.g: exception propagation + no further drain ───────────────────
|
||||
|
||||
|
||||
def test_worker_exception_propagates_and_clears_pending(topology):
|
||||
"""T3.g: worker raise → main propagates; _pending_worker_waits cleared."""
|
||||
with _make_ctx(topology) as ctx:
|
||||
from kernbench.runtime_api.multiprocessing import SpawnException
|
||||
|
||||
def _bad_worker(rank: int):
|
||||
raise ValueError(f"rank {rank} intentional failure")
|
||||
|
||||
with pytest.raises(SpawnException) as exc_info:
|
||||
ctx.multiprocessing.spawn(_bad_worker, args=(), nprocs=2)
|
||||
|
||||
assert ctx._pending_worker_waits == [], (
|
||||
"D0.4-(4): _pending_worker_waits must be cleared on failure"
|
||||
)
|
||||
# Root-cause rank errors are present; sibling SystemExit not in dict.
|
||||
assert 0 in exc_info.value.errors or 1 in exc_info.value.errors
|
||||
|
||||
|
||||
# ── T3.e: historical failure (pre-D0) — skipped per ADR ──────────────
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="ADR-0027 T3.e: historical failure mode — reproduces only "
|
||||
"pre-D0.2. Kept as documentation; not run in Phase 2."
|
||||
)
|
||||
def test_pre_d0_orphan_reproduction():
|
||||
"""Placeholder: exercises the pre-D0.2 code path that causes GreenletExit
|
||||
from kernel_runner._parent captured in worker context. See ADR-0024
|
||||
Phase B postmortem."""
|
||||
pass
|
||||
Reference in New Issue
Block a user