Add PE-level IPCQ collective infra + unified ccl_allreduce bench (ADR-0023)
Major changes:
PE-level IPCQ infrastructure:
- New PE_IPCQ component: ring-buffer control plane with 4-direction
neighbor mapping, head/tail pointers, backpressure (poll/sleep).
- PE_DMA extended with vc_comm channel for IPCQ outbound/inbound DMA,
including in-flight data snapshot (D9) and op_log recording at
outbound time for Phase 2 replay correctness.
- IpcqDmaToken piggyback model: data + metadata travel together,
atomic visibility at receiver (invariant I6).
- Credit return fast path: bottleneck-BW latency, no fabric vc_comm.
Phase 2 data execution (ADR-0020 integration):
- op_log extended: DmaWriteCmd now captures src_space/src_addr for
Phase 2 dma_write copy; ipcq_copy ops recorded at outbound time.
- DataExecutor replays dma_write + ipcq_copy in t_start order.
- Engine._flush_data_phase: incremental cursor-based replay after
each engine.wait() so host reads see post-Phase-2 data.
- KernelRunner Phase 1 writes disabled when op_log is active to
prevent stale data from corrupting the MemoryStore snapshot.
TLContext / kernel API:
- tl.send(dir, src=TensorHandle), tl.recv(dir, shape, dtype),
tl.recv_async, tl.wait(RecvFuture), copy_to_dst mode.
- TensorHandle operator overloading (add/sub/mul/div) via thread-local
active TLContext → MathCmd dispatch through PE_MATH.
- PE-local scratch allocator for math output handles.
- tl.load returns space="hbm" handles for correct Phase 2 addressing.
- Additional math functions: maximum, minimum, fma, clamp, softmax, cdiv.
Unified ccl_allreduce bench (PyTorch-compat host code):
- Single benches/ccl_allreduce.py with run() + worker(rank, ws, torch)
split matching real PyTorch DDP worker pattern.
- torch.distributed facade: init_process_group, get_world_size,
get_rank, get_backend, all_reduce, barrier — only real PyTorch names.
- AhbmCCLBackend: eager install_ipcq at init, all_reduce dispatches
kernel via tensor shard metadata (n_elem from shards[0].nbytes).
- world_size derived from topology spec (sips × cubes × pes_per_cube)
with optional algorithm-level override in ccl.yaml.
Tensor API (PyTorch-compat surface):
- Tensor.numpy(): gather-aware (all shards via VA-based addressing).
- Tensor.copy_(source): scatter from host tensor into sharded target.
- RuntimeContext.from_numpy(arr): host-side staging tensor.
- Tensor.data property fixed to use numpy() (was shards[0]-only).
Algorithm modules moved to src/kernbench/ccl/algorithms/:
- ring_allreduce, mesh_allreduce, tree_allreduce, hello_send.
- Each module exports kernel_args(world_size, n_elem) helper.
- ccl.yaml module paths updated to kernbench.ccl.algorithms.*.
Dead code removed:
- 7 per-variant bench files (ccl_allreduce_{tcm,hbm,sram}, etc.).
- _run_ccl_bench greenlet-per-SIP scheduler.
- benches.loader.is_ccl_bench + run_rank detection.
- benches/ccl/ directory.
Tests:
- New test_ccl_allreduce_matrix.py: 7 parametrized cases
(ring×3 buffers, ring 8/16, mesh 4, tree 7).
- New test_runtime_api_tensor.py: copy_/numpy/from_numpy unit tests.
- Existing tests updated for new import paths + world_size_override.
Docs:
- Korean ccl-author-guide.md and ADR-0023 paths updated.
- New English versions: ccl-author-guide.en.md, ADR-0023.en.md.
502 tests pass.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,142 @@
|
||||
"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench.
|
||||
|
||||
Each parametrized case writes a tmp ``ccl.yaml`` overlay that selects a
|
||||
specific (algorithm, world_size, buffer_kind, n_elem) combination, then
|
||||
runs the bench via the CLI and asserts the printed line reports all
|
||||
ranks OK.
|
||||
|
||||
This single test file replaces the per-variant bench tests
|
||||
(test_ccl_allreduce_e2e, test_ccl_mesh_allreduce, test_ccl_tree_allreduce,
|
||||
test_ccl_multicube, test_ccl_multisip).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
import pytest
|
||||
|
||||
import kernbench.cli.main as cli_main
|
||||
|
||||
|
||||
CCL_YAML_TEMPLATE = textwrap.dedent("""\
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: {buffer_kind}
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
{algorithm}:
|
||||
module: {module}
|
||||
topology: {topology}
|
||||
buffer_kind: {buffer_kind}
|
||||
{world_size_line}{n_elem_line}
|
||||
""")
|
||||
|
||||
|
||||
def _write_ccl_yaml(
|
||||
tmp_path,
|
||||
*,
|
||||
algorithm: str,
|
||||
module: str,
|
||||
topology: str,
|
||||
buffer_kind: str = "tcm",
|
||||
world_size: int | None = None,
|
||||
n_elem: int | None = None,
|
||||
) -> str:
|
||||
"""Write a tmp ccl.yaml in tmp_path and return its directory."""
|
||||
ws_line = f" world_size: {world_size}\n" if world_size is not None else ""
|
||||
nel_line = f" n_elem: {n_elem}\n" if n_elem is not None else ""
|
||||
body = CCL_YAML_TEMPLATE.format(
|
||||
algorithm=algorithm,
|
||||
module=module,
|
||||
topology=topology,
|
||||
buffer_kind=buffer_kind,
|
||||
world_size_line=ws_line,
|
||||
n_elem_line=nel_line,
|
||||
)
|
||||
yaml_path = tmp_path / "ccl.yaml"
|
||||
yaml_path.write_text(body)
|
||||
return str(tmp_path)
|
||||
|
||||
|
||||
# One row per parametrized case, keyed by the pytest id.  Column order:
# algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws.
# ``world_size=None`` means no ``world_size`` key is written to the overlay.
_CASE_ROWS = {
    "ring_full_system_tcm": (
        "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "tcm", None, 8, 256,
    ),
    "ring_full_system_hbm": (
        "ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "hbm", None, 8, 256,
    ),
    "ring_full_system_sram": (
        "ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "sram", None, 8, 256,
    ),
    "ring_single_cube": (
        "ring_allreduce_8", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "tcm", 8, 32, 8,
    ),
    "ring_multi_cube": (
        "ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce",
        "ring_1d", "tcm", 16, 16, 16,
    ),
    "mesh_2x2": (
        "mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce",
        "mesh_2d", "tcm", 4, 16, 4,
    ),
    "tree_binary_7": (
        "tree_allreduce_7", "kernbench.ccl.algorithms.tree_allreduce",
        "tree_binary", "tcm", 7, 16, 7,
    ),
}

CASES = [
    pytest.param(*row, id=case_id) for case_id, row in _CASE_ROWS.items()
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "algorithm,module,topology,buffer_kind,world_size,n_elem,expected_ws",
    CASES,
)
def test_ccl_allreduce_matrix(
    tmp_path, capsys, monkeypatch,
    algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws,
):
    """Run the unified bench for one matrix case and expect every rank OK.

    A per-case ``ccl.yaml`` is written to ``tmp_path`` and the test changes
    into that directory before invoking the CLI (presumably the bench picks
    the overlay up from the working directory — confirm against the loader).
    """
    # topology.yaml is taken from the directory above this test file.
    here = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(here, ".."))
    topology_file = os.path.join(project_root, "topology.yaml")

    overlay_dir = _write_ccl_yaml(
        tmp_path,
        algorithm=algorithm,
        module=module,
        topology=topology,
        buffer_kind=buffer_kind,
        world_size=world_size,
        n_elem=n_elem,
    )
    monkeypatch.chdir(overlay_dir)

    argv = [
        "run",
        "--topology", topology_file,
        "--bench", "ccl_allreduce",
        "--verify-data",
    ]
    rc = cli_main.main(argv)
    assert rc == 0

    out = capsys.readouterr().out
    assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}"

    # The bench is expected to print one summary line in this exact shape.
    summary = f"{algorithm} (ws={expected_ws}): {expected_ws} OK"
    assert summary in out, (
        f"expected '{summary}' "
        f"in output:\n{out}"
    )
|
||||
@@ -0,0 +1,125 @@
|
||||
"""Tests for IPCQ deadlock detection (ADR-0023 D14 F3)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import simpy
|
||||
|
||||
from kernbench.ccl import diagnostics
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqEndpoint,
|
||||
IpcqInitEntry,
|
||||
IpcqRecvCmd,
|
||||
IpcqRequest,
|
||||
)
|
||||
from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
|
||||
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
@dataclass
|
||||
class _FakeTxn:
|
||||
request: Any
|
||||
done: simpy.Event
|
||||
result_data: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _make_isolated_pe_ipcq(env):
    """Build a stand-alone ``PeIpcqComponent`` with one ``"W"`` queue pair.

    The component is created for node ``sip0.cube0.pe0.pe_ipcq``, given a
    ``host`` in-port and a ``pe_dma`` out-port backed by ``simpy.Store``s,
    started, and then initialised by delivering an ``IpcqInitMsg`` through the
    ``host`` port.  The environment is run until that message's ``done`` event
    fires.

    Args:
        env: The ``simpy.Environment`` that owns all stores and events.

    Returns:
        The initialised component.
    """
    ipcq_node = Node(
        id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq",
        impl="builtin.pe_ipcq", attrs={}, pos_mm=None,
    )
    comp = PeIpcqComponent(ipcq_node, ctx=None)
    host_port = simpy.Store(env)
    comp.in_ports["host"] = host_port
    comp.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env)
    comp.start(env)

    # Credit store standing in for the peer; nothing in this helper reads it.
    peer_credit = simpy.Store(env)
    peer_endpoint = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x10_000, rx_base_va=0,
        n_slots=4, slot_size=4096,
    )
    west_entry = IpcqInitEntry(
        direction="W", peer=peer_endpoint,
        my_rx_base_pa=0x40_000, my_rx_base_va=0,
        n_slots=4, slot_size=4096,
        peer_credit_store=peer_credit,
    )
    init_msg = IpcqInitMsg(
        correlation_id="t", request_id="t",
        target_sips=(0,), target_cubes=(0,), target_pe=0,
        entries=(west_entry,),
        backpressure_mode="sleep",
        buffer_kind="tcm",
        credit_size_bytes=16,
    )

    init_done = env.event()
    host_port.put(_FakeTxn(request=init_msg, done=init_done))
    env.run(until=init_done)
    return comp
|
||||
|
||||
|
||||
def test_pointer_dump_includes_blocked_state():
    """The pointer dump still reports the queue while a recv is blocked."""
    env = simpy.Environment()
    comp = _make_isolated_pe_ipcq(env)

    # No data ever arrives on "W", so this recv is expected to stay pending.
    blocked = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r1",
        ),
        done=env.event(),
    )
    comp.in_ports["host"].put(blocked)
    env.run(until=10)
    assert not blocked.done.triggered

    # pointer_dump() takes an engine; for an isolated component a stub that
    # only exposes ``_components`` is passed instead.
    class _EngineStub:
        _components = {"sip0.cube0.pe0.pe_ipcq": comp}

    dump = diagnostics.pointer_dump(_EngineStub())
    for fragment in ("my_tail=0", "peer_head_cache=0"):
        assert fragment in dump
|
||||
|
||||
|
||||
def test_deadlock_detection_recv_without_send():
    """A kernel that only receives must surface as ``IpcqDeadlock``.

    Every PE issues ``tl.recv`` from ``"W"`` and nobody sends, so the run is
    expected to end with ``IpcqDeadlock`` raised out of ``run_bench``.
    """
    from kernbench.ccl.diagnostics import IpcqDeadlock
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    def deadlock_kernel(t_ptr, n_elem, tl):
        # Receive-only kernel: there is no matching send anywhere.
        tl.recv(dir="W", shape=(n_elem,), dtype="f16")

    # NOTE(review): relative path — this relies on the working directory
    # containing topology.yaml.
    topo = resolve_topology("topology.yaml")

    def run(torch):
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=8,
        )
        placement = DPPolicy(
            sip="replicate", cube="replicate", pe="column_wise",
            num_sips=1, num_cubes=1,
        )
        a = torch.zeros(
            (1, 8 * 8),
            dtype="f16",
            dp=placement,
            name="dl_in",
        )
        torch.launch("dl", deadlock_kernel, a, 8)

    def make_engine(t, d):
        return GraphEngine(getattr(t, "topology_obj", t), enable_data=True)

    with pytest.raises(IpcqDeadlock):
        run_bench(
            topology=topo, bench_fn=run,
            device=resolve_device("all"),
            engine_factory=make_engine,
        )
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Tests for CCL diagnostics: trace + pointer dump (ADR-0023 D14)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
from kernbench.ccl import diagnostics
|
||||
|
||||
|
||||
# ── trace toggle ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_trace_disabled_by_default(monkeypatch):
    """With ``KERNBENCH_CCL_TRACE`` unset, tracing reports as disabled."""
    monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)
    diagnostics.reload_trace_setting()
    enabled = diagnostics.trace_enabled()
    assert enabled is False
|
||||
|
||||
|
||||
def test_trace_enabled_via_env(monkeypatch):
    """Setting ``KERNBENCH_CCL_TRACE=1`` turns tracing on after a reload.

    The setting is reloaded again once the variable is removed, mirroring the
    cleanup the other trace tests perform, so the enabled state does not carry
    over into tests that run afterwards.
    """
    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
    diagnostics.reload_trace_setting()
    try:
        assert diagnostics.trace_enabled() is True
    finally:
        # Runs even if the assertion fails.
        monkeypatch.delenv("KERNBENCH_CCL_TRACE")
        diagnostics.reload_trace_setting()
|
||||
|
||||
|
||||
def test_trace_record_send(monkeypatch, capsys):
    """``log_send`` prints a line naming the op, the sender and the direction.

    Cleanup (unset the variable, reload the setting) lives in ``finally`` so a
    failing assertion cannot leave tracing switched on for later tests.
    """
    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
    diagnostics.reload_trace_setting()
    try:
        diagnostics.log_send(t_ns=100.0, sender="sip0.cube0.pe0",
                             direction="E", nbytes=64, sender_seq=0)
        out = capsys.readouterr().out
        assert "send" in out
        assert "sip0.cube0.pe0" in out
        assert "dir=E" in out
    finally:
        monkeypatch.delenv("KERNBENCH_CCL_TRACE")
        diagnostics.reload_trace_setting()
|
||||
|
||||
|
||||
def test_trace_record_recv(monkeypatch, capsys):
    """``log_recv`` prints a line naming the op and the receiver.

    Cleanup (unset the variable, reload the setting) lives in ``finally`` so a
    failing assertion cannot leave tracing switched on for later tests.
    """
    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
    diagnostics.reload_trace_setting()
    try:
        diagnostics.log_recv(t_ns=200.0, receiver="sip0.cube0.pe1",
                             direction="W", nbytes=64)
        out = capsys.readouterr().out
        assert "recv" in out
        assert "sip0.cube0.pe1" in out
    finally:
        monkeypatch.delenv("KERNBENCH_CCL_TRACE")
        diagnostics.reload_trace_setting()
|
||||
|
||||
|
||||
# ── pointer dump ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_pointer_dump_format():
    """After ``install_ipcq`` the pointer dump lists PEs, directions, pointers."""
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology
    from kernbench.ccl.install import (
        install_ipcq, load_ccl_config, resolve_algorithm_config,
    )

    topo = resolve_topology("topology.yaml").topology_obj
    engine = GraphEngine(topo, enable_data=True)
    ccl_cfg = load_ccl_config()
    cfg = resolve_algorithm_config(ccl_cfg, name="ring_allreduce_tcm")
    install_ipcq(engine, topo.spec, cfg)

    dump = diagnostics.pointer_dump(engine)
    # NOTE(review): the original comment expected "8 ranks × 2 directions =
    # 16 lines (plus 8 PE headers)"; no line count is asserted here, only that
    # these fragments appear somewhere in the dump.
    expected_fragments = (
        "sip0.cube0.pe0",
        "E:",
        "W:",
        "my_head=",
        "peer_tail_cache=",
    )
    for fragment in expected_fragments:
        assert fragment in dump
|
||||
@@ -0,0 +1,62 @@
|
||||
"""Tests for the torch.distributed-compat facade (ADR-0023 D11).
|
||||
|
||||
These tests verify the public API surface of ``DistributedContext`` +
|
||||
``AhbmCCLBackend``. End-to-end correctness of the allreduce itself is
|
||||
covered by tests/test_ccl_allreduce_matrix.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from kernbench.runtime_api.distributed import AhbmCCLBackend, DistributedContext
|
||||
|
||||
|
||||
def test_init_process_group_requires_ctx_ref():
    """``init_process_group`` on an unbound ``DistributedContext`` must raise.

    Uses ``pytest.raises`` rather than ``assert False`` so the check still
    fails when Python runs with assertions disabled (``-O``).
    """
    import pytest

    dist = DistributedContext()
    # Not bound to a RuntimeContext → init should raise.
    with pytest.raises(RuntimeError):
        dist.init_process_group(backend="ahbm")
|
||||
|
||||
|
||||
def test_init_process_group_rejects_unknown_backend():
    """An unknown backend name raises ``ValueError``.

    Uses ``pytest.raises`` rather than ``assert False`` so the check still
    fails when Python runs with assertions disabled (``-O``).
    """
    import pytest

    dist = DistributedContext()
    # Dummy binding; the backend check is expected to fire before it is used.
    dist._ctx_ref = object()
    with pytest.raises(ValueError):
        dist.init_process_group(backend="nccl")
|
||||
|
||||
|
||||
def test_distributed_pytorch_compat_surface():
    """Every public attribute of ``DistributedContext`` is on the allow-list.

    Names starting with an underscore are treated as private and ignored.
    """
    allowed = {
        "init_process_group",
        "is_initialized",
        "get_world_size",
        "get_rank",
        "get_backend",
        "all_reduce",
        "barrier",
    }
    dc = DistributedContext()
    public_attrs = [name for name in dir(dc) if not name.startswith("_")]
    for attr in public_attrs:
        assert attr in allowed, (
            f"DistributedContext exposes non-pytorch API: {attr!r}"
        )
|
||||
|
||||
|
||||
def test_backend_class_surface():
    """``AhbmCCLBackend`` exposes at least all_reduce, barrier and world_size.

    This is a lower bound only: additional public names do not fail the test.
    """
    public = set()
    for member in dir(AhbmCCLBackend):
        if not member.startswith("_"):
            public.add(member)
    for required in ("all_reduce", "barrier", "world_size"):
        assert required in public
|
||||
@@ -0,0 +1,81 @@
|
||||
"""Validate the hello-world example from docs/ccl-author-guide.md.
|
||||
|
||||
This is the simplest possible CCL kernel — each PE sends its tile E
|
||||
and receives a tile from W. After running, each rank's slice should
|
||||
contain the data of the previous rank.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.algorithms import hello_send
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def test_hello_send_4_ranks_mock():
    """Under the mock runtime each rank ends up with its predecessor's tile."""
    world = 4
    n_elem = 8
    # Rank r starts with a tile filled with the value r + 1.
    inputs = []
    for r in range(world):
        inputs.append(np.full((n_elem,), float(r + 1), dtype=np.float16))

    outputs = run_kernel_in_mock(
        kernel_fn=hello_send.kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem,),
    )

    # rank r should have rank (r-1) % 4's data
    for r, got in enumerate(outputs[r] for r in range(world)):
        prev = inputs[(r - 1) % 4]
        assert np.array_equal(got, prev), f"rank {r}: got {outputs[r]}"
|
||||
|
||||
|
||||
def test_hello_send_via_simpy_runner():
    """Run ``hello_send`` through ``run_bench`` with a real ``GraphEngine``.

    Each of the 8 ranks gets a tile filled with ``rank + 1``; after the launch
    every rank's slice is expected to hold the previous rank's value.
    """
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    topo = resolve_topology("topology.yaml")
    n_elem = 8
    world_size = 8

    def run(torch):
        # World size for this hello test is 8 (one cube). ccl.yaml no
        # longer carries a default world_size — pass it explicitly.
        # The returned plan is not needed here, so it is not bound.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * n_elem), dtype="f16",
            dp=DPPolicy(
                sip="replicate", cube="replicate", pe="column_wise",
                num_sips=1, num_cubes=1,
            ),
            name="hello_in",
        )
        store = torch.engine.memory_store
        # NOTE(review): a va_base of 0 is falsy and would fall through to the
        # first shard's pa — confirm that is intended.
        base = a._handle.va_base or a._handle.shards[0].pa
        nbytes = n_elem * 2  # f16 → 2 bytes per element
        for r in range(world_size):
            store.write("hbm", base + r * nbytes,
                        np.full((n_elem,), float(r + 1), dtype=np.float16))

        torch.launch("hello_send", hello_send.kernel, a, n_elem)

        # Each rank should hold the previous rank's data after the round
        for r in range(world_size):
            arr = store.read("hbm", base + r * nbytes, shape=(n_elem,), dtype="f16")
            prev_value = float(((r - 1) % world_size) + 1)
            assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}"

    result = run_bench(
        topology=topo, bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok
|
||||
@@ -0,0 +1,68 @@
|
||||
"""Tests for CCL algorithm-author helpers (ADR-0023 D15)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from kernbench.ccl.helpers import (
|
||||
Chunk,
|
||||
chunked,
|
||||
ring_step,
|
||||
tree_step,
|
||||
)
|
||||
|
||||
|
||||
# ── chunked ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_chunked_basic():
    """64 f16 elements in 4 chunks → 16 elements / 32 bytes each, contiguous."""
    base = 0x1000
    chunks = chunked(base_addr=base, n_chunks=4, n_elem=64, dtype="f16")
    assert len(chunks) == 4
    # Chunk i starts 32 bytes (0x20) after chunk i - 1.
    for i in range(4):
        assert chunks[i] == Chunk(addr=base + i * 0x20, n_elem=16, nbytes=32)
|
||||
|
||||
|
||||
def test_chunked_f32():
    """8 f32 elements in 2 chunks → 16 bytes per chunk."""
    first, second = chunked(base_addr=0x100, n_chunks=2, n_elem=8, dtype="f32")[:2]
    assert first.nbytes == 16  # 4 elem × 4 bytes
    assert second.addr == 0x100 + 16
|
||||
|
||||
|
||||
def test_chunked_uneven_raises():
    """10 elements cannot be split evenly into 3 chunks → ``ValueError``."""
    uneven = dict(base_addr=0x100, n_chunks=3, n_elem=10, dtype="f16")
    with pytest.raises(ValueError):
        chunked(**uneven)
|
||||
|
||||
|
||||
# ── ring_step ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ring_step_4_ranks():
    """``ring_step`` returns (send_chunk, recv_chunk) for a 4-rank ring.

    The expected pairs follow: at step s, rank r sends chunk (r - s) and
    receives chunk (r - s - 1), both modulo the world size.
    """
    # (rank, step) → (send_chunk, recv_chunk)
    expected = {
        (0, 0): (0, 3),
        (0, 1): (3, 2),
        (1, 0): (1, 0),
        (2, 0): (2, 1),
    }
    for (rank, step), pair in expected.items():
        assert ring_step(rank=rank, step=step, world_size=4) == pair
|
||||
|
||||
|
||||
# ── tree_step ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_tree_step_root():
    """Rank 0 of a 7-rank tree has no parent and children 1 and 2."""
    root = tree_step(rank=0, world_size=7)
    parent, children = root["parent"], root["children"]
    assert parent is None
    assert children == [1, 2]
|
||||
|
||||
|
||||
def test_tree_step_internal():
    """Rank 1 of a 7-rank tree hangs off rank 0 and has children 3 and 4."""
    node = tree_step(rank=1, world_size=7)
    parent, children = node["parent"], node["children"]
    assert parent == 0
    assert children == [3, 4]
|
||||
|
||||
|
||||
def test_tree_step_leaf():
    """Rank 4 of a 7-rank tree hangs off rank 1 and has no children."""
    leaf = tree_step(rank=4, world_size=7)
    parent, children = leaf["parent"], leaf["children"]
    assert parent == 1
    assert children == []
|
||||
@@ -0,0 +1,100 @@
|
||||
"""Tests for CCL backend install (ADR-0023 D10/D11)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from kernbench.ccl.install import (
|
||||
install_ipcq,
|
||||
linear_rank_to_pe,
|
||||
load_ccl_config,
|
||||
resolve_algorithm_config,
|
||||
)
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
|
||||
def _engine():
    """Return ``(engine, topology)`` built from ``topology.yaml``.

    The engine is a ``GraphEngine`` created with ``enable_data=True``.
    """
    resolved = resolve_topology("topology.yaml")
    topo = resolved.topology_obj
    engine = GraphEngine(topo, enable_data=True)
    return engine, topo
|
||||
|
||||
|
||||
def test_load_ccl_config():
    """The loaded CCL config has ``defaults`` and ``algorithms`` sections."""
    cfg = load_ccl_config()
    for section in ("defaults", "algorithms"):
        assert section in cfg
|
||||
|
||||
|
||||
def test_resolve_algorithm_config_default():
    """Without a name, the resolved config uses ``defaults.algorithm``."""
    cfg = load_ccl_config()
    merged = resolve_algorithm_config(cfg)
    default_algorithm = cfg["defaults"]["algorithm"]
    assert merged["algorithm"] == default_algorithm
    # ccl.yaml no longer carries defaults.world_size — the backend derives it
    # from topology.yaml at install time.  The key may therefore be absent;
    # if an entry does supply one, it must be at least 1.
    assert "world_size" not in merged or merged["world_size"] >= 1
|
||||
|
||||
|
||||
def test_resolve_algorithm_config_override():
    """A named algorithm overrides ``buffer_kind`` but inherits other defaults."""
    cfg = load_ccl_config()
    defaults = cfg["defaults"]
    merged = resolve_algorithm_config(cfg, name="ring_allreduce_hbm")
    assert merged["algorithm"] == "ring_allreduce_hbm"
    # Value taken from the algorithm entry, not from ``defaults``.
    assert merged["buffer_kind"] == "hbm"
    # Keys the entry does not set still come from ``defaults``.
    assert merged["n_slots"] == defaults["n_slots"]
|
||||
|
||||
|
||||
def test_linear_rank_to_pe():
    """Ranks 0 and 7 map to PEs 0 and 7 of cube 0 on SIP 0.

    Also checks that the topology spec describes at least 8 PEs in total.
    """
    _, topo = _engine()
    spec = topo.spec

    # Cube 0 of SIP 0
    assert linear_rank_to_pe(0, spec) == (0, 0, 0)
    assert linear_rank_to_pe(7, spec) == (0, 0, 7)

    # Total PE count = sips × cubes per SIP × PEs per cube.
    mesh = spec["sip"]["cube_mesh"]
    layout = spec["cube"]["pe_layout"]
    cubes_per_sip = mesh["w"] * mesh["h"]
    pes_per_sip = (
        cubes_per_sip
        * layout["pe_per_corner"]
        * len(layout["corners"])
    )
    sips = spec["system"]["sips"]["count"]
    total = sips * pes_per_sip
    assert total >= 8
|
||||
|
||||
|
||||
def test_install_ipcq_neighbors_correct():
    """An 8-rank ring install wires rank 0 to rank 1 (E) and rank 7 (W)."""
    engine, topo = _engine()
    merged = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm",
    )
    # Force a single-cube 8-rank install for the assertions below.
    merged["world_size"] = 8
    plan = install_ipcq(engine, topo.spec, merged)

    assert plan["world_size"] == 8
    assert plan["buffer_kind"] == "tcm"

    # Each rank should have E and W entries
    for nbrs in plan["neighbor_table"].values():
        assert "E" in nbrs
        assert "W" in nbrs

    # Inspect installed PE_IPCQ for rank 0
    rank0_pairs = engine._components["sip0.cube0.pe0.pe_ipcq"].queue_pairs
    east, west = rank0_pairs["E"], rank0_pairs["W"]
    assert east["peer"].pe == 1  # rank 0's E neighbor is rank 1
    assert west["peer"].pe == 7  # rank 0's W neighbor is rank 7
    # The two directions must not share an rx base address.
    assert east["my_rx_base_pa"] != west["my_rx_base_pa"]
|
||||
|
||||
|
||||
def test_install_ipcq_credit_stores_wired():
    """Rank 0's E-direction credit store is rank 1's ``credit_inbox`` object."""
    engine, topo = _engine()
    merged = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm",
    )
    merged["world_size"] = 8
    install_ipcq(engine, topo.spec, merged)

    # rank 0 (pe0) sending E goes to rank 1 (pe1), so the store pe0 returns
    # credits into must be the very same object pe1 reads credits from.
    components = engine._components
    pe0 = components["sip0.cube0.pe0.pe_ipcq"]
    pe1 = components["sip0.cube0.pe1.pe_ipcq"]

    east = pe0.queue_pairs["E"]
    assert east["peer_credit_store"] is pe1.credit_inbox
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Tests for the mock CCL runtime (ADR-0023 D15)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.algorithms import ring_allreduce
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def test_ring_allreduce_4_ranks():
    """Ring all-reduce over 4 mock ranks yields the element-wise sum on each."""
    world = 4
    n_elem = 8
    # Rank r contributes a tile filled with r + 1, so every element sums to 10.
    inputs = []
    for r in range(world):
        inputs.append(np.full((n_elem,), float(r + 1), dtype=np.float16))
    expected = sum(inputs)

    outputs = run_kernel_in_mock(
        kernel_fn=ring_allreduce.kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, 4),
    )

    assert len(outputs) == 4
    for r in range(world):
        assert np.allclose(outputs[r], expected)
|
||||
|
||||
|
||||
def test_ring_allreduce_8_ranks():
    """Ring all-reduce over 8 mock ranks yields the element-wise sum on each."""
    world = 8
    n_elem = 16
    # Rank r contributes a tile filled with r + 1, so every element sums to 36.
    inputs = []
    for r in range(world):
        inputs.append(np.full((n_elem,), float(r + 1), dtype=np.float16))
    expected = sum(inputs)

    outputs = run_kernel_in_mock(
        kernel_fn=ring_allreduce.kernel,
        world_size=8,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, 8),
    )
    for r in range(world):
        assert np.allclose(outputs[r], expected)
|
||||
|
||||
|
||||
def test_ring_allreduce_random_data():
    """Ring all-reduce of seeded random f16 tiles matches their sum.

    Compared with ``rtol``/``atol`` of 1e-2 rather than the ``allclose``
    defaults used by the constant-tile tests.
    """
    world = 4
    n_elem = 32
    rng = np.random.default_rng(42)
    inputs = []
    for _ in range(world):
        inputs.append(rng.standard_normal(n_elem).astype(np.float16))
    expected = sum(inputs)

    outputs = run_kernel_in_mock(
        kernel_fn=ring_allreduce.kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, 4),
    )
    for r in range(world):
        assert np.allclose(outputs[r], expected, rtol=1e-2, atol=1e-2)
|
||||
|
||||
|
||||
def test_mock_runtime_invalid_direction_raises():
    """Sending in direction ``"N"`` on a ``ring_1d`` mock run must raise."""
    import pytest

    def bad_kernel(t_ptr, n_elem, tl):
        tl.send(dir="N", src_addr=0, nbytes=2, shape=(1,), dtype="f16", space="hbm")

    one_elem = np.array([1.0], dtype=np.float16)
    inputs = [one_elem.copy() for _ in range(2)]
    mock_kwargs = dict(
        kernel_fn=bad_kernel,
        world_size=2,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(1,),
    )
    # NOTE(review): any Exception subclass satisfies this check; narrow it
    # once the mock runtime's error type for bad directions is settled.
    with pytest.raises(Exception):
        run_kernel_in_mock(**mock_kwargs)
|
||||
@@ -0,0 +1,134 @@
|
||||
"""CCL performance validation tests (ADR-0023 D13 T5).
|
||||
|
||||
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
|
||||
under different ``ccl.yaml`` algorithm choices:
|
||||
|
||||
- All buffer kinds finish in non-zero simulated time.
|
||||
- Latency for small tiles stays under a deliberately loose 100 ms bound (it is expected to be well under 1 ms).
|
||||
|
||||
These are sanity checks on the model itself, not on absolute numbers.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import pytest
|
||||
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
|
||||
def _engine_factory(topology, device):
    """Build a data-enabled ``GraphEngine`` for ``run_bench``.

    ``topology`` may be a wrapper exposing ``topology_obj`` or the topology
    itself; ``device`` is accepted but not used.
    """
    topo = getattr(topology, "topology_obj", topology)
    return GraphEngine(topo, enable_data=True)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _ccl_yaml_override(algorithm: str, world_size: int | None = None):
|
||||
"""Write a tmp ccl.yaml that forces a specific algorithm + world_size."""
|
||||
import tempfile
|
||||
entry_extra = f"\n world_size: {world_size}" if world_size is not None else ""
|
||||
body = f"""
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
ring_allreduce_hbm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: hbm
|
||||
ring_allreduce_sram:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""}
|
||||
{algorithm}:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm{entry_extra}
|
||||
""" if world_size is not None else f"""
|
||||
defaults:
|
||||
algorithm: {algorithm}
|
||||
buffer_kind: tcm
|
||||
backpressure: sleep
|
||||
n_slots: 4
|
||||
slot_size: 4096
|
||||
vc_chunk_size: 256
|
||||
ipcq_credit_size_bytes: 16
|
||||
|
||||
algorithms:
|
||||
ring_allreduce_tcm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: tcm
|
||||
ring_allreduce_hbm:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: hbm
|
||||
ring_allreduce_sram:
|
||||
module: kernbench.ccl.algorithms.ring_allreduce
|
||||
topology: ring_1d
|
||||
buffer_kind: sram
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
path = os.path.join(tmp, "ccl.yaml")
|
||||
with open(path, "w") as f:
|
||||
f.write(body)
|
||||
old_cwd = os.getcwd()
|
||||
os.chdir(tmp)
|
||||
try:
|
||||
yield path
|
||||
finally:
|
||||
os.chdir(old_cwd)
|
||||
|
||||
|
||||
def _run_unified(algorithm: str, world_size: int | None = None) -> float:
    """Run ``benches.ccl_allreduce`` under a ``ccl.yaml`` override.

    Returns the ``total_ns`` of the last trace whose ``phase`` is
    ``"kernel"`` (0.0 if that trace carries no ``total_ns``).  Asserts
    that the bench completed and that at least one kernel trace exists.
    """
    with _ccl_yaml_override(algorithm, world_size):
        topology_yaml = os.path.join(
            os.path.dirname(__file__), "..", "topology.yaml"
        )
        bench = importlib.import_module("benches.ccl_allreduce")
        result = run_bench(
            topology=resolve_topology(topology_yaml),
            bench_fn=bench.run,
            device=resolve_device("all"),
            engine_factory=_engine_factory,
        )
    assert result.completion.ok, f"{algorithm} did not complete"

    kernel_traces = [
        trace for trace in (result.traces or [])
        if trace.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    return float(kernel_traces[-1].get("total_ns", 0.0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "algorithm",
    [f"ring_allreduce_{kind}" for kind in ("tcm", "hbm", "sram")],
)
def test_ccl_latency_positive(algorithm):
    """The tcm, hbm and sram variants each report a latency above zero."""
    assert _run_unified(algorithm) > 0
|
||||
|
||||
|
||||
def test_ccl_latency_under_reasonable_bound():
    """Sanity bound: the tcm ring all-reduce finishes in < 100 ms simulated.

    The bound is deliberately very loose; it only guards against a
    runaway simulated latency.
    """
    upper_bound_ns = 100_000_000  # 100 ms simulated
    assert _run_unified("ring_allreduce_tcm") < upper_bound_ns
|
||||
@@ -0,0 +1,48 @@
|
||||
"""Test that tl.recv() (no direction) works under the mock runtime
|
||||
and the SimPy PE_IPCQ component (ADR-0023 D4 weak fairness)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def kernel_round_robin(t_ptr, n_elem, tl):
    """Accumulate every rank's tile using direction-less ``tl.recv``.

    Loads this rank's f16 tile, then for ``world_size - 1`` steps sends the
    current tile east, receives a tile without naming a direction, adds it
    to the running sum and forwards the received tile on the next step.
    The sum is stored back over this rank's tile.
    """
    my_rank = tl.program_id(axis=0)
    n_ranks = tl.num_programs(axis=0)
    tile_bytes = n_elem * 2  # two bytes per f16 element
    tile_addr = t_ptr + my_rank * tile_bytes

    total = tl.load(tile_addr, shape=(n_elem,), dtype="f16")
    outgoing = total
    for _ in range(n_ranks - 1):
        tl.send(dir="E", src=outgoing)
        # No direction given: the runtime picks the source queue.
        incoming = tl.recv(shape=(n_elem,), dtype="f16")
        total = total + incoming
        outgoing = incoming  # relay what just arrived on the next step

    tl.store(tile_addr, total)
|
||||
|
||||
|
||||
def test_round_robin_recv_mock_runtime():
    """Four ranks on ``ring_1d`` each end up with the element-wise sum."""
    world = 4
    n_elem = 8
    per_rank = [
        np.full((n_elem,), float(rank + 1), dtype=np.float16)
        for rank in range(world)
    ]
    expected = sum(per_rank)  # 1 + 2 + 3 + 4 = 10 in every element

    outputs = run_kernel_in_mock(
        kernel_fn=kernel_round_robin,
        world_size=world,
        topology="ring_1d",
        inputs=per_rank,
        kernel_args=(n_elem,),
    )
    for rank in range(world):
        assert np.allclose(outputs[rank], expected)
|
||||
@@ -0,0 +1,140 @@
|
||||
"""Tests for IPCQ strict shape/dtype validation (ADR-0023 D14 F2)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import simpy
|
||||
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqDmaToken,
|
||||
IpcqEndpoint,
|
||||
IpcqInitEntry,
|
||||
IpcqInvalidDirection,
|
||||
IpcqMetaArrival,
|
||||
IpcqRecvCmd,
|
||||
IpcqRequest,
|
||||
IpcqSendCmd,
|
||||
)
|
||||
from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
|
||||
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
# ── helpers (smaller copy of test_pe_ipcq fixtures) ────────────────
|
||||
|
||||
|
||||
@dataclass
class _FakeTxn:
    """Minimal transaction stand-in used to post a message to a port.

    Only carries the three attributes populated by these tests.
    """

    # Payload handed to the component (``_make`` passes an IpcqInitMsg).
    request: Any
    # Completion event; ``_make`` runs the environment until it fires.
    done: simpy.Event
    # Fresh dict per instance; presumably filled in by the component —
    # TODO confirm against PeIpcqComponent.
    result_data: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _make(env, strict: bool = True):
    """Build and start a PeIpcqComponent with one installed "W" neighbor.

    The node's ``strict_validation`` attribute is set from *strict*.  The
    component gets a ``host`` in-port and a PE_DMA out-port backed by plain
    SimPy stores, and an IpcqInitMsg is posted and run to completion
    before the component is returned.
    """
    ring = {"n_slots": 4, "slot_size": 4096}

    ipcq_node = Node(
        id="sip0.cube0.pe0.pe_ipcq",
        kind="pe_ipcq",
        impl="builtin.pe_ipcq",
        attrs={"strict_validation": strict},
        pos_mm=None,
    )
    comp = PeIpcqComponent(ipcq_node, ctx=None)
    comp.in_ports["host"] = simpy.Store(env)
    comp.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env)
    comp.start(env)

    west_peer = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x10_000, rx_base_va=0,
        **ring,
    )
    west_entry = IpcqInitEntry(
        direction="W", peer=west_peer,
        my_rx_base_pa=0x40_000, my_rx_base_va=0,
        peer_credit_store=simpy.Store(env),
        **ring,
    )
    install = IpcqInitMsg(
        correlation_id="t", request_id="t",
        target_sips=(0,), target_cubes=(0,), target_pe=0,
        entries=(west_entry,),
        backpressure_mode="sleep",
        buffer_kind="tcm",
        credit_size_bytes=16,
    )

    installed = env.event()
    comp.in_ports["host"].put(_FakeTxn(request=install, done=installed))
    env.run(until=installed)
    return comp
|
||||
|
||||
|
||||
# ── F2 tests ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_strict_mode_dtype_mismatch_raises():
    """Strict mode: an f16 recv against f32 metadata raises ValueError."""
    env = simpy.Environment()
    ipcq = _make(env, strict=True)
    host = ipcq.in_ports["host"]

    # Metadata describing an f32 tile arrives before the recv is posted.
    arrived = IpcqDmaToken(
        src_addr=0, src_space="tcm",
        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
        nbytes=64, handle_id="x",
        shape=(8,), dtype="f32",  # differs from the recv below
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=1, src_direction="E",
    )
    host.put(IpcqMetaArrival(token=arrived))
    env.run(until=5)

    pending = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r",
        ),
        done=env.event(),
    )
    host.put(pending)
    with pytest.raises(ValueError, match="dtype"):
        env.run(until=pending.done)
|
||||
|
||||
|
||||
def test_strict_mode_shape_mismatch_raises():
    """Strict mode: an (8,) recv against (16,) metadata raises ValueError."""
    env = simpy.Environment()
    ipcq = _make(env, strict=True)
    host = ipcq.in_ports["host"]

    arrived = IpcqDmaToken(
        src_addr=0, src_space="tcm",
        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
        nbytes=64, handle_id="x",
        shape=(16,), dtype="f16",  # shape differs from the recv below
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=1, src_direction="E",
    )
    host.put(IpcqMetaArrival(token=arrived))
    env.run(until=5)

    pending = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r",
        ),
        done=env.event(),
    )
    host.put(pending)
    with pytest.raises(ValueError, match="shape"):
        env.run(until=pending.done)
|
||||
|
||||
|
||||
def test_non_strict_mode_silently_accepts():
    """Non-strict mode: a recv completes despite shape and dtype mismatch."""
    env = simpy.Environment()
    ipcq = _make(env, strict=False)
    host = ipcq.in_ports["host"]

    arrived = IpcqDmaToken(
        src_addr=0, src_space="tcm",
        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
        nbytes=64, handle_id="x",
        shape=(16,), dtype="f32",  # neither matches the recv below
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=1, src_direction="E",
    )
    host.put(IpcqMetaArrival(token=arrived))
    env.run(until=5)

    pending = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r",
        ),
        done=env.event(),
    )
    host.put(pending)
    env.run(until=pending.done)  # must not raise
    assert pending.done.triggered
|
||||
@@ -0,0 +1,164 @@
|
||||
"""Tests for CCL builtin topology generators (ADR-0023 D11)."""
|
||||
import pytest
|
||||
|
||||
from kernbench.ccl.topologies import (
|
||||
mesh_2d,
|
||||
none,
|
||||
resolve_topology,
|
||||
ring_1d,
|
||||
ring_1d_unidir,
|
||||
tree_binary,
|
||||
)
|
||||
|
||||
|
||||
# ── ring_1d ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ring_1d_4_ranks():
    """Four-rank ring: E is rank + 1 and W is rank - 1, both wrapping."""
    expected_by_rank = {
        0: {"E": 1, "W": 3},
        1: {"E": 2, "W": 0},
        2: {"E": 3, "W": 1},
        3: {"E": 0, "W": 2},
    }
    for rank, neighbors in expected_by_rank.items():
        assert ring_1d(rank, 4) == neighbors
|
||||
|
||||
|
||||
def test_ring_1d_2_ranks():
    """Two-rank ring: both directions point at the single other rank."""
    for rank, other in ((0, 1), (1, 0)):
        assert ring_1d(rank, 2) == {"E": other, "W": other}
|
||||
|
||||
|
||||
# ── ring_1d_unidir ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ring_1d_unidir():
    """Unidirectional ring exposes only the E neighbor, wrapping at the end."""
    for rank, east in ((0, 1), (3, 0)):
        assert ring_1d_unidir(rank, 4) == {"E": east}
|
||||
|
||||
|
||||
# ── mesh_2d ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_mesh_2d_2x2():
    """2x2 mesh: with wrap-around, N == S and E == W for every rank.

    Rank layout::

        0 1
        2 3
    """
    expected_by_rank = {
        0: {"N": 2, "S": 2, "E": 1, "W": 1},
        1: {"N": 3, "S": 3, "E": 0, "W": 0},
        2: {"N": 0, "S": 0, "E": 3, "W": 3},
        3: {"N": 1, "S": 1, "E": 2, "W": 2},
    }
    for rank, neighbors in expected_by_rank.items():
        assert mesh_2d(rank, 4) == neighbors
|
||||
|
||||
|
||||
def test_mesh_2d_4x4():
    """4x4 mesh (rank = r*4 + c): neighbors of rank 5, i.e. r=1, c=1."""
    around_5 = mesh_2d(5, 16)
    expected = {
        "N": 1,  # ((1-1)%4)*4 + 1
        "S": 9,  # ((1+1)%4)*4 + 1
        "W": 4,  # 1*4 + (1-1)%4
        "E": 6,  # 1*4 + (1+1)%4
    }
    for direction, rank in expected.items():
        assert around_5[direction] == rank
|
||||
|
||||
|
||||
def test_mesh_2d_non_square_raises():
    """A world size of 5 is rejected by ``mesh_2d`` with ValueError."""
    non_square_world = 5
    with pytest.raises(ValueError):
        mesh_2d(0, non_square_world)
|
||||
|
||||
|
||||
# ── tree_binary ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_tree_binary_root():
    """Rank 0 of a 7-node tree has children 1 and 2 and no parent."""
    root = tree_binary(0, 7)
    assert "parent" not in root
    assert (root["child_left"], root["child_right"]) == (1, 2)
|
||||
|
||||
|
||||
def test_tree_binary_internal():
    """Rank 1 of a 7-node tree has parent 0 and children 3 and 4."""
    node = tree_binary(1, 7)
    links = (node["parent"], node["child_left"], node["child_right"])
    assert links == (0, 3, 4)
|
||||
|
||||
|
||||
def test_tree_binary_leaf():
    """Rank 6 of a 7-node tree has parent 2 and no child entries."""
    leaf = tree_binary(6, 7)
    assert leaf["parent"] == 2
    for child_key in ("child_left", "child_right"):
        assert child_key not in leaf
|
||||
|
||||
|
||||
# ── none ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_none_returns_empty():
    """The ``none`` topology yields an empty neighbor map."""
    for rank, world_size in ((0, 4), (3, 7)):
        assert none(rank, world_size) == {}
|
||||
|
||||
|
||||
# ── resolve_topology ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_resolve_topology_builtin():
    """Resolving "ring_1d" by name returns a callable with ring_1d results."""
    neighbors_of = resolve_topology("ring_1d")
    assert neighbors_of(0, 4) == {"E": 1, "W": 3}
|
||||
|
||||
|
||||
def test_resolve_topology_unknown_raises():
    """An unrecognized topology name raises ValueError."""
    unknown_name = "nonsense"
    with pytest.raises(ValueError):
        resolve_topology(unknown_name)
|
||||
|
||||
|
||||
def test_resolve_topology_with_neighbors_override_pattern_a():
    """Pattern A: ``neighbors()`` mutates the builtin map and returns it."""

    def _drop_west_on_odd_ranks(rank, world_size, neighbor_map):
        if rank % 2:
            neighbor_map.pop("W", None)
        return neighbor_map

    class FakeModule:
        neighbors = staticmethod(_drop_west_on_odd_ranks)

    resolved = resolve_topology("ring_1d", algo_module=FakeModule)
    assert resolved(0, 4) == {"E": 1, "W": 3}  # even rank: untouched
    assert resolved(1, 4) == {"E": 2}  # odd rank: W removed
|
||||
|
||||
|
||||
def test_resolve_topology_with_neighbors_override_pattern_b():
    """Pattern B: ``neighbors()`` ignores the builtin map, returns a new one."""

    def _skip_one_east(rank, world_size, neighbor_map):
        return {"E": (rank + 2) % world_size}

    class FakeModule:
        neighbors = staticmethod(_skip_one_east)

    resolved = resolve_topology("ring_1d", algo_module=FakeModule)
    for rank, east in ((0, 2), (3, 1)):
        assert resolved(rank, 4) == {"E": east}
|
||||
|
||||
|
||||
def test_resolve_topology_with_neighbors_override_pattern_c_none():
    """Pattern C: ``neighbors()`` returning None keeps the builtin map."""

    def _no_opinion(rank, world_size, neighbor_map):
        return None

    class FakeModule:
        neighbors = staticmethod(_no_opinion)

    resolved = resolve_topology("ring_1d", algo_module=FakeModule)
    assert resolved(0, 4) == {"E": 1, "W": 3}
|
||||
|
||||
|
||||
def test_resolve_topology_none_with_neighbors_override():
    """With topology "none", ``neighbors()`` starts from an empty map."""

    def _build_east_only(rank, world_size, neighbor_map):
        assert neighbor_map == {}  # the builtin contributed nothing
        return {"E": (rank + 1) % world_size}

    class FakeModule:
        neighbors = staticmethod(_build_east_only)

    resolved = resolve_topology("none", algo_module=FakeModule)
    assert resolved(0, 4) == {"E": 1}
|
||||
|
||||
|
||||
def test_resolve_topology_module_without_neighbors():
    """A module lacking ``neighbors`` leaves the builtin map unchanged."""

    class FakeModule:
        """Deliberately defines no ``neighbors`` attribute."""

    resolved = resolve_topology("ring_1d", algo_module=FakeModule)
    assert resolved(0, 4) == {"E": 1, "W": 3}
|
||||
@@ -0,0 +1,73 @@
|
||||
"""Cross-SIP PE_DMA routing tests (ADR-0023, topology v2).
|
||||
|
||||
Verifies that PE_DMA in one SIP can route to PE_DMA in another SIP via
|
||||
the bidirectional pcie_ep ↔ fabric.switch0 path. Required for IPCQ
|
||||
multi-SIP collectives.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from kernbench.policy.routing.router import PathRouter, RoutingError
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
|
||||
def _topo():
    """Resolve ``topology.yaml`` and return its ``topology_obj``."""
    resolved = resolve_topology("topology.yaml")
    return resolved.topology_obj
|
||||
|
||||
|
||||
# ── New edge ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_pcie_ep_to_switch_edge_exists():
    """Both SIPs have a pcie_ep → fabric.switch0 edge for outbound traffic."""
    edge_pairs = {(edge.src, edge.dst) for edge in _topo().edges}
    for sip in ("sip0", "sip1"):
        assert (f"{sip}.io0.pcie_ep", "fabric.switch0") in edge_pairs
|
||||
|
||||
|
||||
def test_existing_switch_to_pcie_ep_still_present():
    """Regression: the fabric.switch0 → pcie_ep edges are still present."""
    edge_pairs = {(edge.src, edge.dst) for edge in _topo().edges}
    for sip in ("sip0", "sip1"):
        assert ("fabric.switch0", f"{sip}.io0.pcie_ep") in edge_pairs
|
||||
|
||||
|
||||
# ── Cross-SIP path ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_router_finds_cross_sip_pe_dma_path():
    """sip0 PE → sip1 PE_DMA: path starts/ends at PE_DMAs, crosses the switch."""
    router = PathRouter(_topo())
    hops = router.find_path("sip0.cube0.pe0", "sip1.cube0.pe0.pe_dma")
    assert len(hops) > 0
    first_hop, last_hop = hops[0], hops[-1]
    assert first_hop == "sip0.cube0.pe0.pe_dma"
    assert last_hop == "sip1.cube0.pe0.pe_dma"
    assert "fabric.switch0" in hops
|
||||
|
||||
|
||||
def test_router_finds_cross_sip_far_pe_path():
    """sip0.cube15.pe7 → sip1.cube0.pe0 PE_DMA also goes via the switch."""
    router = PathRouter(_topo())
    hops = router.find_path("sip0.cube15.pe7", "sip1.cube0.pe0.pe_dma")
    assert "fabric.switch0" in hops
|
||||
|
||||
|
||||
# ── Regression: intra-SIP routing unchanged ─────────────────────────
|
||||
|
||||
|
||||
def test_router_intra_sip_path_unchanged():
    """A PE-to-PE path inside sip0.cube0 must not detour through the switch."""
    router = PathRouter(_topo())
    hops = router.find_path("sip0.cube0.pe0", "sip0.cube0.pe1.pe_dma")
    assert "fabric.switch0" not in hops
|
||||
|
||||
|
||||
def test_router_intra_cube_path_unchanged():
    """A PE → hbm_ctrl path inside one cube avoids the fabric switch."""
    router = PathRouter(_topo())
    hops = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl")
    assert "fabric.switch0" not in hops
|
||||
@@ -58,6 +58,69 @@ def test_math_exp():
|
||||
assert np.allclose(result, np.exp(x))
|
||||
|
||||
|
||||
def test_math_extra_ops():
    """DataExecutor replay of the maximum, minimum, fma and clamp math ops."""
    store = MemoryStore()
    a = np.array([1.0, 5.0, 3.0], dtype=np.float32)
    b = np.array([4.0, 2.0, 6.0], dtype=np.float32)
    c = np.array([0.5, 0.5, 0.5], dtype=np.float32)
    addr_a, addr_b, addr_c = 0x0, 0x100, 0x200
    for addr, values in ((addr_a, a), (addr_b, b), (addr_c, c)):
        store.write("tcm", addr, values)

    def _record(op, dst, *src_addrs):
        # The destination address doubles as the record's start time.
        n_inputs = len(src_addrs)
        return OpRecord(
            t_start=float(dst), t_end=float(dst) + 1.0,
            component_id="pe_math", op_kind="math", op_name=op,
            params={
                "op": op,
                "input_addrs": list(src_addrs),
                "input_shapes": [(3,)] * n_inputs,
                "input_spaces": ["tcm"] * n_inputs,
                "input_dtypes": ["f32"] * n_inputs,
                "dst_addr": dst, "dst_space": "tcm",
                "shape_out": (3,), "dtype": "f32", "axis": None,
            },
        )

    DataExecutor(
        [
            _record("maximum", 0x300, addr_a, addr_b),
            _record("minimum", 0x400, addr_a, addr_b),
            _record("fma", 0x500, addr_a, addr_b, addr_c),
            # clamp inputs are (value, lower, upper)
            _record("clamp", 0x600, addr_a, addr_c, addr_b),
        ],
        store,
    ).run()

    expected_at = (
        (0x300, np.maximum(a, b)),
        (0x400, np.minimum(a, b)),
        (0x500, a * b + c),
        (0x600, np.minimum(np.maximum(a, c), b)),
    )
    for addr, expected in expected_at:
        assert np.array_equal(store.read("tcm", addr), expected)
|
||||
|
||||
|
||||
def test_math_softmax():
    """DataExecutor replay of a softmax op over the last axis of a 2x3 input."""
    store = MemoryStore()
    x = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]], dtype=np.float32)
    store.write("tcm", 0x0, x)

    softmax_params = {
        "op": "softmax",
        "input_addrs": [0x0], "input_shapes": [(2, 3)],
        "input_spaces": ["tcm"], "input_dtypes": ["f32"],
        "dst_addr": 0x100, "dst_space": "tcm",
        "shape_out": (2, 3), "dtype": "f32", "axis": -1,
    }
    record = OpRecord(
        t_start=0.0, t_end=1.0,
        component_id="pe_math", op_kind="math", op_name="softmax",
        params=softmax_params,
    )
    DataExecutor([record], store).run()

    # Reference softmax with the usual max-subtraction.
    shifted = np.exp(x - x.max(axis=-1, keepdims=True))
    expected = shifted / shifted.sum(axis=-1, keepdims=True)
    assert np.allclose(store.read("tcm", 0x100), expected)
|
||||
|
||||
|
||||
def test_math_add():
|
||||
store = MemoryStore()
|
||||
a = np.array([1.0, 2.0], dtype=np.float32)
|
||||
|
||||
@@ -0,0 +1,169 @@
|
||||
"""Tests for IPCQ type schemas (ADR-0023 D2.5, D12, D14 F1)."""
|
||||
import pytest
|
||||
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqCreditMetadata,
|
||||
IpcqDmaToken,
|
||||
IpcqEndpoint,
|
||||
IpcqInitEntry,
|
||||
IpcqInvalidDirection,
|
||||
IpcqMetaArrival,
|
||||
IpcqRecvCmd,
|
||||
IpcqSendCmd,
|
||||
)
|
||||
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||
|
||||
|
||||
# ── IpcqEndpoint ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_endpoint_basic():
    """IpcqEndpoint exposes its constructor arguments as attributes."""
    endpoint = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x1000, rx_base_va=0,
        n_slots=8, slot_size=4096,
    )
    observed = (endpoint.sip, endpoint.buffer_kind, endpoint.n_slots)
    assert observed == (0, "tcm", 8)
|
||||
|
||||
|
||||
def test_ipcq_endpoint_frozen():
    """Assigning to a field of an IpcqEndpoint must be rejected.

    Expects ``AttributeError``, the base class of
    ``dataclasses.FrozenInstanceError``.  A blanket
    ``pytest.raises(Exception)`` would also pass on unrelated failures.
    """
    ep = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
    )
    with pytest.raises(AttributeError):  # FrozenInstanceError
        ep.sip = 99  # type: ignore
|
||||
|
||||
|
||||
# ── IpcqDmaToken ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_dma_token():
    """IpcqDmaToken keeps its fields and reports ``data_op`` as True."""
    destination = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
    )
    token = IpcqDmaToken(
        src_addr=0x500, src_space="tcm",
        dst_addr=0x1000, dst_endpoint=destination,
        nbytes=128, handle_id="h1",
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
    )
    assert token.nbytes == 128
    assert token.dst_endpoint.buffer_kind == "tcm"
    assert token.data_op is True
|
||||
|
||||
|
||||
# ── IpcqCreditMetadata ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_credit_metadata():
    """IpcqCreditMetadata exposes ``consumer_seq`` and ``src_direction``."""
    credit = IpcqCreditMetadata(
        consumer_seq=3, src_sip=0, src_cube=0, src_pe=1, src_direction="W",
    )
    assert (credit.consumer_seq, credit.src_direction) == (3, "W")
|
||||
|
||||
|
||||
def test_ipcq_credit_metadata_frozen():
    """Assigning to a field of an IpcqCreditMetadata must be rejected.

    Expects ``AttributeError`` (base class of
    ``dataclasses.FrozenInstanceError``) rather than a blanket
    ``Exception``, so unrelated errors cannot make the test pass.
    """
    cm = IpcqCreditMetadata(
        consumer_seq=3, src_sip=0, src_cube=0, src_pe=1, src_direction="W",
    )
    with pytest.raises(AttributeError):  # FrozenInstanceError
        cm.consumer_seq = 99  # type: ignore
|
||||
|
||||
|
||||
# ── IpcqMetaArrival ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_meta_arrival():
    """IpcqMetaArrival gives access to the wrapped token's fields."""
    destination = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
    )
    arrival = IpcqMetaArrival(
        token=IpcqDmaToken(
            src_addr=0x500, src_space="tcm",
            dst_addr=0x1000, dst_endpoint=destination,
            nbytes=128, handle_id="h1",
            sender_seq=0,
            src_sip=0, src_cube=0, src_pe=0, src_direction="E",
        ),
    )
    wrapped = arrival.token
    assert wrapped.sender_seq == 0
    assert wrapped.src_direction == "E"
|
||||
|
||||
|
||||
# ── IpcqSendCmd / IpcqRecvCmd ────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_send_cmd():
    """IpcqSendCmd keeps its direction and reports ``data_op`` as True."""
    send = IpcqSendCmd(
        direction="E",
        src_addr=0x100,
        src_space="tcm",
        nbytes=64,
        shape=(8, 8),
        dtype="f16",
        handle_id="s1",
    )
    assert send.direction == "E"
    assert send.data_op is True
|
||||
|
||||
|
||||
def test_ipcq_recv_cmd_default_return_slot():
    """IpcqRecvCmd defaults: ``recv_mode`` "return_slot", ``dst_addr`` 0."""
    recv = IpcqRecvCmd(
        direction="W", shape=(8, 8), dtype="f16", handle_id="r1",
    )
    assert recv.recv_mode == "return_slot"
    assert recv.dst_addr == 0
|
||||
|
||||
|
||||
def test_ipcq_recv_cmd_round_robin():
    """IpcqRecvCmd accepts ``direction=None`` and stores it unchanged."""
    recv = IpcqRecvCmd(
        direction=None, shape=(8, 8), dtype="f16", handle_id="r2",
    )
    assert recv.direction is None
|
||||
|
||||
|
||||
def test_ipcq_recv_cmd_copy_to_dst():
    """IpcqRecvCmd in "copy_to_dst" mode keeps the destination address."""
    recv = IpcqRecvCmd(
        direction="W",
        recv_mode="copy_to_dst",
        dst_addr=0x2000,
        dst_space="hbm",
        shape=(8, 8),
        dtype="f16",
        handle_id="r3",
    )
    assert (recv.recv_mode, recv.dst_addr) == ("copy_to_dst", 0x2000)
|
||||
|
||||
|
||||
# ── IpcqInvalidDirection ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_invalid_direction():
    """IpcqInvalidDirection can be raised with a message and caught by type."""
    error = IpcqInvalidDirection("direction 'X' not installed")
    with pytest.raises(IpcqInvalidDirection):
        raise error
|
||||
|
||||
|
||||
# ── IpcqInitEntry / IpcqInitMsg ──────────────────────────────────────
|
||||
|
||||
|
||||
def test_ipcq_init_entry_and_msg():
    """An IpcqInitMsg built from one IpcqInitEntry keeps entry and credit size."""
    import simpy

    env = simpy.Environment()
    ring = {"n_slots": 8, "slot_size": 4096}

    east_peer = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x1000, rx_base_va=0, **ring,
    )
    east_entry = IpcqInitEntry(
        direction="E", peer=east_peer,
        my_rx_base_pa=0x2000, my_rx_base_va=0,
        peer_credit_store=simpy.Store(env),
        **ring,
    )
    init = IpcqInitMsg(
        correlation_id="c1", request_id="r1",
        target_sips=(0,), target_cubes=(0,), target_pe=0,
        entries=(east_entry,),
        backpressure_mode="sleep",
        buffer_kind="tcm",
        credit_size_bytes=16,
    )

    first = init.entries[0]
    assert first.direction == "E"
    assert first.peer.sip == 0
    assert init.credit_size_bytes == 16
|
||||
@@ -0,0 +1,206 @@
|
||||
"""Tests for PE_DMA IPCQ handling (ADR-0023 D8 + D9 atomic).
|
||||
|
||||
PE_DMA gains two new behaviors:
|
||||
1. Outbound: when it receives an IpcqDmaToken from local PE_IPCQ, it
|
||||
forwards it through the fabric (next-hop port) toward the peer
|
||||
PE_DMA.
|
||||
2. Inbound: when it receives a Transaction wrapping an IpcqDmaToken,
|
||||
it performs MemoryStore.write at dst_endpoint.buffer_kind/dst_addr
|
||||
and forwards IpcqMetaArrival(token) to local PE_IPCQ — both in the
|
||||
SAME SimPy step (I6 MUST).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import simpy
|
||||
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqDmaToken,
|
||||
IpcqEndpoint,
|
||||
IpcqMetaArrival,
|
||||
)
|
||||
from kernbench.components.builtin.pe_dma import PeDmaComponent
|
||||
from kernbench.sim_engine.memory_store import MemoryStore
|
||||
from kernbench.sim_engine.transaction import Transaction
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
# ── Mock context ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
class _MockResolver:
    """Empty placeholder used as the default ``resolver`` of ``_MockCtx``."""
|
||||
|
||||
|
||||
@dataclass
class _MockRouter:
    """Router stub: every route is source → "fake_router" → destination."""

    def find_path(self, src: str, dst: str) -> list[str]:
        """Return the fixed path through the single "fake_router" hop."""
        via = "fake_router"
        return [src, via, dst]
|
||||
|
||||
|
||||
@dataclass
class _MockCtx:
    """Stub context object passed as ``ctx`` to ``PeDmaComponent``.

    Which attributes the component actually reads is not visible here;
    the fields below are the ones this test module provides.
    """

    # Path lookup; defaults to the fixed two-hop _MockRouter.
    router: Any = field(default_factory=_MockRouter)
    # Empty placeholder resolver.
    resolver: Any = field(default_factory=_MockResolver)
    # MemoryStore shared with the test, or None (set by _make_pe_dma).
    memory_store: Any = None
    edge_map: dict = field(default_factory=dict)
    spec: dict = field(default_factory=dict)
    # No op logging in these tests.
    op_logger: Any = None

    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Return a zero drain time regardless of *path* and *nbytes*."""
        return 0.0

    def get_shared_resource(self, env, key, capacity=1):
        """Return a new ``simpy.Resource``; *key* is ignored, so every call
        yields a separate resource."""
        return simpy.Resource(env, capacity=capacity)
|
||||
|
||||
|
||||
def _make_pe_dma(
    env: simpy.Environment, pe_prefix: str, store: MemoryStore | None = None,
) -> PeDmaComponent:
    """Create and start a PeDmaComponent wired to plain SimPy stores.

    The component is named ``<pe_prefix>.pe_dma`` and gets a ``host``
    in-port plus out-ports ``fake_router`` and ``<pe_prefix>.pe_ipcq``.
    *store* becomes the mock context's ``memory_store``.
    """
    dma_node = Node(
        id=f"{pe_prefix}.pe_dma",
        kind="pe_dma",
        impl="builtin.pe_dma",
        attrs={},
        pos_mm=None,
    )
    dma = PeDmaComponent(dma_node, ctx=_MockCtx(memory_store=store))

    dma.in_ports["host"] = simpy.Store(env)
    for port_name in ("fake_router", f"{pe_prefix}.pe_ipcq"):
        dma.out_ports[port_name] = simpy.Store(env)

    dma.start(env)
    return dma
|
||||
|
||||
|
||||
def _make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm") -> IpcqEndpoint:
    """Return an IpcqEndpoint at the given location with fixed ring geometry.

    The receive ring is always at PA 0x10_000 (VA 0) with 4 slots of
    4096 bytes.
    """
    location = {"sip": sip, "cube": cube, "pe": pe}
    rx_ring = {
        "rx_base_pa": 0x10_000,
        "rx_base_va": 0,
        "n_slots": 4,
        "slot_size": 4096,
    }
    return IpcqEndpoint(buffer_kind=buffer_kind, **location, **rx_ring)
|
||||
|
||||
|
||||
# ── Outbound: PE_IPCQ → PE_DMA → fabric ──────────────────────────────
|
||||
|
||||
|
||||
def test_outbound_forwards_token_through_fabric():
    """A token posted to PE_DMA shows up on the next-hop port in a Transaction."""
    env = simpy.Environment()
    mem = MemoryStore()
    mem.write("tcm", 0x500, np.arange(16, dtype=np.float16))

    sender = _make_pe_dma(env, "sip0.cube0.pe0", store=mem)

    outbound = IpcqDmaToken(
        src_addr=0x500, src_space="tcm",
        dst_addr=0x10_000, dst_endpoint=_make_endpoint(pe=1),
        nbytes=32, handle_id="t1",
        shape=(16,), dtype="f16",
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
    )
    sender.in_ports["host"].put(outbound)
    env.run(until=10)

    # Exactly one item reached "fake_router": a Transaction around the token.
    forwarded = sender.out_ports["fake_router"].items
    assert len(forwarded) == 1
    txn = forwarded[0]
    assert isinstance(txn, Transaction)
    assert isinstance(txn.request, IpcqDmaToken)
    assert txn.request.dst_addr == 0x10_000
|
||||
|
||||
|
||||
# ── Inbound: PE_DMA → MemoryStore.write + IpcqMetaArrival forward ───
|
||||
|
||||
|
||||
def test_inbound_writes_memory_and_forwards_metadata_atomically():
    """An inbound token lands in memory and is announced to PE_IPCQ."""
    env = simpy.Environment()
    mem = MemoryStore()

    # Source data as the sender left it in the MemoryStore.
    payload = np.arange(16, dtype=np.float16) + 100
    mem.write("tcm", 0x500, payload)

    receiver = _make_pe_dma(env, "sip0.cube0.pe1", store=mem)

    inbound = IpcqDmaToken(
        src_addr=0x500, src_space="tcm",
        dst_addr=0x10_000,
        dst_endpoint=_make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm"),
        nbytes=32, handle_id="t1",
        shape=(16,), dtype="f16",
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
    )

    # The receiving PE_DMA is the last hop of the transaction's path.
    delivered = env.event()
    receiver.in_ports["host"].put(
        Transaction(
            request=inbound,
            path=["fake_router", "sip0.cube0.pe1.pe_dma"],
            step=1, nbytes=32, done=delivered,
        )
    )
    env.run(until=delivered)

    # 1. The payload is readable at dst_addr.
    landed = mem.read("tcm", 0x10_000, shape=(16,), dtype="f16")
    assert np.array_equal(landed, payload)

    # 2. Exactly one IpcqMetaArrival sits on the PE_IPCQ port.
    announced = receiver.out_ports["sip0.cube0.pe1.pe_ipcq"].items
    assert len(announced) == 1
    arrival = announced[0]
    assert isinstance(arrival, IpcqMetaArrival)
    assert arrival.token.sender_seq == 0
    assert arrival.token.src_pe == 0
|
||||
|
||||
|
||||
def test_inbound_no_yield_between_write_and_metadata_forward():
    """Soft check: three inbound tokens keep their order.

    Both the IpcqMetaArrival forwards and the MemoryStore contents must
    reflect the order in which the tokens were queued.
    """
    env = simpy.Environment()
    mem = MemoryStore()
    n_tokens = 3
    stride = 0x100

    for idx in range(n_tokens):
        mem.write(
            "tcm", 0x500 + idx * stride,
            np.arange(8, dtype=np.float16) + idx * 10,
        )

    receiver = _make_pe_dma(env, "sip0.cube0.pe1", store=mem)
    endpoint = _make_endpoint(sip=0, cube=0, pe=1)

    # Queue all tokens first; the run below waits on the last one queued.
    for idx in range(n_tokens):
        inbound = IpcqDmaToken(
            src_addr=0x500 + idx * stride, src_space="tcm",
            dst_addr=0x10_000 + idx * stride, dst_endpoint=endpoint,
            nbytes=16, handle_id=f"t{idx}",
            shape=(8,), dtype="f16",
            sender_seq=idx,
            src_sip=0, src_cube=0, src_pe=0, src_direction="E",
        )
        done = env.event()
        receiver.in_ports["host"].put(
            Transaction(
                request=inbound,
                path=["fake_router", "sip0.cube0.pe1.pe_dma"],
                step=1, nbytes=16, done=done,
            )
        )
    env.run(until=done)

    # Arrivals are announced in sender_seq order.
    announced = list(receiver.out_ports["sip0.cube0.pe1.pe_ipcq"].items)
    assert [arrival.token.sender_seq for arrival in announced] == [0, 1, 2]

    # Each destination slot holds the matching source tile.
    for idx in range(n_tokens):
        tile = mem.read(
            "tcm", 0x10_000 + idx * stride, shape=(8,), dtype="f16",
        )
        assert tile[0] == idx * 10
|
||||
@@ -0,0 +1,317 @@
|
||||
"""Tests for PE_IPCQ component (ADR-0023 D1, D2, D9, D14).
|
||||
|
||||
These tests use a mock setup: PeIpcqComponent is instantiated directly,
|
||||
its in_ports/out_ports are wired to plain SimPy Stores, and IpcqInitMsg
|
||||
is delivered via a simple dummy transaction wrapper. PE_DMA is mocked
|
||||
as a Store that we drain manually.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import simpy
|
||||
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqCreditMetadata,
|
||||
IpcqDmaToken,
|
||||
IpcqEndpoint,
|
||||
IpcqInitEntry,
|
||||
IpcqInvalidDirection,
|
||||
IpcqMetaArrival,
|
||||
IpcqRecvCmd,
|
||||
IpcqRequest,
|
||||
IpcqSendCmd,
|
||||
)
|
||||
from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
|
||||
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||
from kernbench.topology.types import Node
|
||||
|
||||
|
||||
# ── Fakes / fixtures ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
@dataclass
|
||||
class _FakeTxn:
|
||||
request: Any
|
||||
done: simpy.Event
|
||||
result_data: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _make_pe_ipcq(env: simpy.Environment, pe_prefix: str = "sip0.cube0.pe0") -> PeIpcqComponent:
    """Build and start a PeIpcqComponent whose ports are plain SimPy stores.

    Args:
        env: SimPy environment that owns the stores and runs the component.
        pe_prefix: Dotted PE path; used for the node id and the PE_DMA port key.

    Returns:
        The started component, wired with:
        - ``in_ports["host"]``: store the tests post messages/requests into;
        - ``out_ports[f"{pe_prefix}.pe_dma"]``: store standing in for PE_DMA,
          inspected by the tests through ``.items``.
    """
    ipcq_node = Node(
        id=f"{pe_prefix}.pe_ipcq",
        kind="pe_ipcq",
        impl="builtin.pe_ipcq",
        attrs={},
        pos_mm=None,
    )
    component = PeIpcqComponent(ipcq_node, ctx=None)

    host_inbox = simpy.Store(env)
    component.in_ports["host"] = host_inbox

    pe_dma_key = f"{pe_prefix}.pe_dma"
    component.out_ports[pe_dma_key] = simpy.Store(env)

    component.start(env)
    return component
|
||||
|
||||
|
||||
def _install_two_neighbors(env: simpy.Environment, comp: PeIpcqComponent) -> tuple[simpy.Store, simpy.Store]:
    """Deliver an IpcqInitMsg that installs an E and a W neighbor on ``comp``.

    The E peer is pe=1 (rx base 0x10_000) and the W peer is pe=2 (rx base
    0x20_000); the local rx bases are 0x30_000 (E) and 0x40_000 (W). Both
    rings use 4 slots of 4096 bytes. The function runs the environment until
    the init transaction's ``done`` event fires.

    Returns:
        ``(peer_e_credit_store, peer_w_credit_store)`` -- the stores handed to
        the component as ``peer_credit_store`` for each direction.
        NOTE(review): presumably the component puts credit returns into these
        when it consumes data; confirm against PeIpcqComponent.
    """
    credit_east = simpy.Store(env)
    credit_west = simpy.Store(env)

    # Ring geometry shared by both endpoints and both init entries.
    ring = {"n_slots": 4, "slot_size": 4096}

    east_peer = IpcqEndpoint(
        sip=0, cube=0, pe=1,
        buffer_kind="tcm",
        rx_base_pa=0x10_000, rx_base_va=0,
        **ring,
    )
    west_peer = IpcqEndpoint(
        sip=0, cube=0, pe=2,
        buffer_kind="tcm",
        rx_base_pa=0x20_000, rx_base_va=0,
        **ring,
    )

    east_entry = IpcqInitEntry(
        direction="E", peer=east_peer,
        my_rx_base_pa=0x30_000, my_rx_base_va=0,
        peer_credit_store=credit_east,
        **ring,
    )
    west_entry = IpcqInitEntry(
        direction="W", peer=west_peer,
        my_rx_base_pa=0x40_000, my_rx_base_va=0,
        peer_credit_store=credit_west,
        **ring,
    )

    message = IpcqInitMsg(
        correlation_id="t", request_id="t",
        target_sips=(0,), target_cubes=(0,), target_pe=0,
        entries=(east_entry, west_entry),
        backpressure_mode="sleep",
        buffer_kind="tcm",
        credit_size_bytes=16,
    )

    installed = env.event()
    comp.in_ports["host"].put(_FakeTxn(request=message, done=installed))
    env.run(until=installed)
    return credit_east, credit_west
|
||||
|
||||
|
||||
# ── send: forward token to PE_DMA ────────────────────────────────────
|
||||
|
||||
|
||||
def test_send_forwards_token_to_pe_dma():
    """One send on E puts exactly one IpcqDmaToken into the PE_DMA store."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)
    dma_store = ipcq.out_ports["sip0.cube0.pe0.pe_dma"]

    send = IpcqSendCmd(
        direction="E", src_addr=0x500, src_space="tcm",
        nbytes=128, shape=(8, 8), dtype="f16", handle_id="s1",
    )
    completed = env.event()
    ipcq.in_ports["host"].put(IpcqRequest(command=send, done=completed))
    env.run(until=completed)

    # The mocked PE_DMA store must now hold the forwarded token.
    assert len(dma_store.items) == 1
    forwarded = dma_store.items[0]
    assert isinstance(forwarded, IpcqDmaToken)
    # First slot of the E peer's ring: rx_base_pa + 0.
    assert forwarded.dst_addr == 0x10_000
    assert forwarded.nbytes == 128
    assert forwarded.sender_seq == 0
    assert forwarded.src_direction == "E"
|
||||
|
||||
|
||||
def test_send_advances_my_head_and_slot_addresses():
    """Three consecutive sends on E get sequence 0..2 and successive slots."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)
    dma_store = ipcq.out_ports["sip0.cube0.pe0.pe_dma"]
    host = ipcq.in_ports["host"]

    for idx in range(3):
        send = IpcqSendCmd(
            direction="E", src_addr=0x500 + idx,
            src_space="tcm", nbytes=64,
            shape=(8,), dtype="f16", handle_id=f"s{idx}",
        )
        completed = env.event()
        host.put(IpcqRequest(command=send, done=completed))
        env.run(until=completed)

    forwarded = dma_store.items
    sequence_numbers = [tok.sender_seq for tok in forwarded]
    slot_addresses = [tok.dst_addr for tok in forwarded]

    assert sequence_numbers == [0, 1, 2]
    # Slot i lives at peer.rx_base_pa (0x10_000) + i * slot_size (4096).
    assert slot_addresses == [0x10_000, 0x11_000, 0x12_000]
|
||||
|
||||
|
||||
def test_send_invalid_direction_raises():
    """Sending on a direction that was never installed ("N") raises."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)  # installs only E and W

    bad_send = IpcqSendCmd(
        direction="N", src_addr=0x100, src_space="tcm",
        nbytes=64, shape=(8,), dtype="f16", handle_id="s_bad",
    )
    completed = env.event()
    ipcq.in_ports["host"].put(IpcqRequest(command=bad_send, done=completed))

    # The error surfaces while the environment is being run.
    with pytest.raises(IpcqInvalidDirection):
        env.run(until=completed)
|
||||
|
||||
|
||||
# ── recv: wait for data and return slot address ─────────────────────
|
||||
|
||||
|
||||
def test_recv_waits_until_metadata_arrives():
    """A recv on W stays pending until an IpcqMetaArrival is posted."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)
    host = ipcq.in_ports["host"]

    pending = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r1",
        ),
        done=env.event(),
    )
    host.put(pending)

    # No data has been delivered, so the recv must still be open at t=10.
    env.run(until=10)
    assert not pending.done.triggered

    # Metadata arriving from the W neighbor (sender pe=2).
    arrival = IpcqDmaToken(
        src_addr=0, src_space="tcm",
        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
        nbytes=64, handle_id="x",
        shape=(8,), dtype="f16",
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=2, src_direction="E",
    )
    host.put(IpcqMetaArrival(token=arrival))
    env.run(until=pending.done)

    outcome = pending.result_data
    assert outcome["src_addr"] == 0x40_000  # my_rx_base_pa for W
    assert outcome["direction"] == "W"
|
||||
|
||||
|
||||
def test_recv_returns_immediately_if_data_already_present():
    """A recv issued after the metadata arrived completes with the W slot."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)
    host = ipcq.in_ports["host"]

    # Deliver the metadata first and let the component process it.
    early_token = IpcqDmaToken(
        src_addr=0, src_space="tcm",
        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
        nbytes=64, handle_id="x",
        shape=(8,), dtype="f16",
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=2, src_direction="E",
    )
    host.put(IpcqMetaArrival(token=early_token))
    env.run(until=5)

    request = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r1",
        ),
        done=env.event(),
    )
    host.put(request)
    env.run(until=request.done)

    assert request.result_data["src_addr"] == 0x40_000
|
||||
|
||||
|
||||
def test_recv_round_robin_picks_arrived_direction():
    """A direction-less recv resolves to W when only W has data."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)
    host = ipcq.in_ports["host"]

    # Only the W direction receives metadata before the recv is issued.
    west_token = IpcqDmaToken(
        src_addr=0, src_space="tcm",
        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
        nbytes=64, handle_id="x",
        shape=(8,), dtype="f16",
        sender_seq=0,
        src_sip=0, src_cube=0, src_pe=2, src_direction="E",
    )
    host.put(IpcqMetaArrival(token=west_token))
    env.run(until=5)

    # direction=None asks the component to choose the direction itself.
    request = IpcqRequest(
        command=IpcqRecvCmd(
            direction=None, shape=(8,), dtype="f16", handle_id="r_rr",
        ),
        done=env.event(),
    )
    host.put(request)
    env.run(until=request.done)

    assert request.result_data["direction"] == "W"
|
||||
|
||||
|
||||
# ── backpressure: send blocks when full ──────────────────────────────
|
||||
|
||||
|
||||
def test_send_blocks_when_peer_slot_full():
    """The 5th send on a 4-slot ring blocks until a credit is returned."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)
    host = ipcq.in_ports["host"]

    def post_send(handle_id):
        # Queue one E-direction send and hand back its request.
        request = IpcqRequest(
            command=IpcqSendCmd(
                direction="E", src_addr=0x500, src_space="tcm",
                nbytes=64, shape=(8,), dtype="f16", handle_id=handle_id,
            ),
            done=env.event(),
        )
        host.put(request)
        return request

    # n_slots = 4: the first four sends each run to completion.
    for idx in range(4):
        env.run(until=post_send(f"s{idx}").done)

    # The fifth has no free slot and must still be pending at t=20.
    blocked = post_send("s5")
    env.run(until=20)
    assert not blocked.done.triggered

    # Credit return from the E peer (pe=1): it consumed slot 0.
    credit = IpcqCreditMetadata(
        consumer_seq=1,  # peer consumed up to my_tail=1
        src_sip=0, src_cube=0, src_pe=1, src_direction="W",  # peer's view
    )
    ipcq.credit_inbox.put(credit)
    env.run(until=blocked.done)
    assert blocked.done.triggered
|
||||
|
||||
|
||||
# ── Init test ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_init_installs_neighbors():
    """After init, both queue pairs exist with the right peers and zeroed state."""
    env = simpy.Environment()
    ipcq = _make_pe_ipcq(env)
    _install_two_neighbors(env, ipcq)

    pairs = ipcq._queue_pairs
    for direction in ("E", "W"):
        assert direction in pairs

    east = pairs["E"]
    assert east["peer"].pe == 1
    assert pairs["W"]["peer"].pe == 2
    assert east["my_head"] == 0
    assert east["peer_tail_cache"] == 0
|
||||
@@ -0,0 +1,80 @@
|
||||
"""Tests for recv_mode='copy_to_dst' (ADR-0023 D9.5)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def test_recv_copy_to_dst_via_simpy_runner():
    """Run a kernel that uses tl.recv(..., dst_addr=..., dst_space=...).

    Every rank sends its own tile east and receives from the west directly
    into a per-rank destination address. After the launch, destination slot
    ``r`` must hold the tile that rank ``(r - 1) % world_size`` started with.
    """
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology
    from kernbench.common.pe_commands import TensorHandle

    world_size = 8
    tile_elems = 8
    tile_nbytes = tile_elems * 2  # two bytes per f16 element

    def kernel(t_ptr, n_elem, dst_buf_addr, tl):
        rank = tl.program_id(axis=0)
        ws = tl.num_programs(axis=0)
        nbytes = n_elem * 2
        # Each PE sends own data, then recv into a custom dst slot
        current = TensorHandle(
            id="loc", addr=t_ptr + rank * nbytes,
            shape=(n_elem,), dtype="f16",
            nbytes=nbytes, data=None, space="hbm",
        )
        tl.send(dir="E", src=current)
        # copy_to_dst: move into a per-rank scratch HBM addr
        recv = tl.recv(
            dir="W", shape=(n_elem,), dtype="f16",
            dst_addr=dst_buf_addr + rank * nbytes,
            dst_space="hbm",
        )
        # Sanity: recv handle should now point to our dst addr
        assert recv.addr == dst_buf_addr + rank * nbytes
        assert recv.space == "hbm"

    topo = resolve_topology("topology.yaml")

    def run(torch):
        # The returned plan is not needed here; only the install matters.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * tile_elems),
            dtype="f16",
            dp=DPPolicy(
                sip="replicate", cube="replicate", pe="column_wise",
                num_sips=1, num_cubes=1,
            ),
            name="copy_in",
        )
        store = torch.engine.memory_store
        base = a._handle.va_base or a._handle.shards[0].pa
        # Seed rank r's tile with the constant r + 1 so tiles are distinguishable.
        for r in range(world_size):
            store.write(
                "hbm", base + r * tile_nbytes,
                np.full((tile_elems,), float(r + 1), dtype=np.float16),
            )

        # Use a separate dst region (synthetic addresses)
        dst_buf = 0xC0FFEE_0000
        torch.launch("ring_allreduce_tcm", kernel, a, tile_elems, dst_buf)

        # After the kernel, slot r of dst_buf holds the west neighbor's data.
        for r in range(world_size):
            arr = store.read(
                "hbm", dst_buf + r * tile_nbytes,
                shape=(tile_elems,), dtype="f16",
            )
            expected = float(((r - 1) % world_size) + 1)
            assert np.allclose(arr, expected), f"rank {r}: got {arr}, expected {expected}"

    result = run_bench(
        topology=topo, bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok
|
||||
@@ -0,0 +1,136 @@
|
||||
"""Tests for the pytorch-compat Tensor API extensions.
|
||||
|
||||
Covers the new ``torch.from_numpy`` factory and ``Tensor.numpy``,
|
||||
``Tensor.copy_`` methods used by the unified ``ccl_allreduce`` bench.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from kernbench.policy.placement.dp import DPPolicy
|
||||
from kernbench.runtime_api.bench_runner import run_bench
|
||||
from kernbench.runtime_api.types import resolve_device
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
|
||||
def _engine_factory(topology, device):
    """Create a data-enabled GraphEngine for ``run_bench``.

    Uses ``topology.topology_obj`` when that attribute exists, otherwise the
    topology itself. ``device`` is accepted for the factory signature but is
    not used.
    """
    graph_source = getattr(topology, "topology_obj", topology)
    return GraphEngine(graph_source, enable_data=True)
|
||||
|
||||
|
||||
def _run_with(bench_body):
    """Run ``bench_body`` through ``run_bench`` on topology.yaml, all devices.

    Returns whatever ``run_bench`` returns.
    """
    bench_kwargs = {
        "topology": resolve_topology("topology.yaml"),
        "bench_fn": bench_body,
        "device": resolve_device("all"),
        "engine_factory": _engine_factory,
    }
    return run_bench(**bench_kwargs)
|
||||
|
||||
|
||||
# ── from_numpy ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_from_numpy_creates_host_tensor():
    """torch.from_numpy yields a host-only tensor mirroring the source array."""

    def body(torch):
        source = np.arange(8, dtype=np.float16).reshape(1, 8)
        host_tensor = torch.from_numpy(source)

        # Shape and dtype follow the array.
        assert host_tensor.shape == (1, 8)
        assert host_tensor.dtype == "f16"
        # numpy() hands back the same contents.
        assert np.array_equal(host_tensor.numpy(), source)
        # Nothing was deployed, so there is no handle.
        assert host_tensor._handle is None

        # Submit a no-op so run_bench has at least one handle.
        single_pe = DPPolicy(
            sip="replicate", cube="replicate", pe="replicate",
            num_sips=1, num_cubes=1, num_pes=1,
        )
        torch.zeros((1, 8), dtype="f16", dp=single_pe, name="dummy")

    _run_with(body)
|
||||
|
||||
|
||||
# ── single-PE replicated tensor ─────────────────────────────────────
|
||||
|
||||
|
||||
def test_copy_and_numpy_single_pe():
    """copy_ followed by numpy() round-trips on a fully replicated 1-PE tensor."""

    def body(torch):
        width = 16
        policy = DPPolicy(
            sip="replicate", cube="replicate", pe="replicate",
            num_sips=1, num_cubes=1, num_pes=1,
        )
        target = torch.zeros((1, width), dtype="f16", dp=policy, name="t")
        payload = np.arange(width, dtype=np.float16).reshape(1, width)

        target.copy_(torch.from_numpy(payload))
        round_trip = target.numpy()

        assert round_trip.shape == (1, width)
        assert np.array_equal(round_trip, payload)

    _run_with(body)
|
||||
|
||||
|
||||
# ── multi-PE column-wise sharding (1 cube) ──────────────────────────
|
||||
|
||||
|
||||
def test_copy_and_numpy_multi_pe_column_wise():
    """copy_ shards a row across 8 PEs of one cube; numpy() stitches it back."""

    def body(torch):
        n_pe = 8
        width = n_pe * 4
        policy = DPPolicy(
            sip="replicate", cube="replicate", pe="column_wise",
            num_sips=1, num_cubes=1, num_pes=n_pe,
        )
        target = torch.zeros((1, width), dtype="f16", dp=policy, name="t")
        payload = np.arange(width, dtype=np.float16).reshape(1, width)

        target.copy_(torch.from_numpy(payload))
        round_trip = target.numpy()

        assert round_trip.shape == (1, width)
        assert np.array_equal(round_trip, payload)
        # One shard per PE confirms the tensor really was split.
        assert len(target._handle.shards) == n_pe

    _run_with(body)
|
||||
|
||||
|
||||
# ── multi-cube sharding ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_copy_and_numpy_multi_cube():
    """copy_ shards across 2 cubes x 8 PEs (16 shards); numpy() reassembles."""

    def body(torch):
        n_cubes = 2
        n_pe_per_cube = 8
        shard_count = n_cubes * n_pe_per_cube  # 16
        width = shard_count * 4
        policy = DPPolicy(
            sip="replicate", cube="column_wise", pe="column_wise",
            num_sips=1, num_cubes=n_cubes,
        )
        target = torch.zeros((1, width), dtype="f16", dp=policy, name="t")
        payload = np.arange(width, dtype=np.float16).reshape(1, width)

        target.copy_(torch.from_numpy(payload))
        round_trip = target.numpy()

        assert np.array_equal(round_trip, payload)
        assert len(target._handle.shards) == shard_count

    _run_with(body)
|
||||
|
||||
|
||||
# ── shape mismatch raises ───────────────────────────────────────────
|
||||
|
||||
|
||||
def test_copy_shape_mismatch_raises():
    """copy_ from a (1, 16) source into a (1, 8) tensor raises ValueError."""

    def body(torch):
        policy = DPPolicy(
            sip="replicate", cube="replicate", pe="replicate",
            num_sips=1, num_cubes=1, num_pes=1,
        )
        target = torch.zeros((1, 8), dtype="f16", dp=policy, name="t")
        too_wide = np.zeros((1, 16), dtype=np.float16)

        with pytest.raises(ValueError, match="copy_ shape mismatch"):
            target.copy_(torch.from_numpy(too_wide))

    _run_with(body)
|
||||
@@ -0,0 +1,95 @@
|
||||
"""Tests for tl.send / tl.recv API (ADR-0023 D4 + D9.5)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import simpy
|
||||
from greenlet import greenlet
|
||||
|
||||
from kernbench.common.ipcq_types import (
|
||||
IpcqRecvCmd,
|
||||
IpcqRequest,
|
||||
IpcqSendCmd,
|
||||
)
|
||||
from kernbench.triton_emu.tl_context import TLContext
|
||||
|
||||
|
||||
# ── Command-list mode (no runner) ────────────────────────────────────
|
||||
|
||||
|
||||
def test_tl_send_command_list_mode():
    """Without a runner, tl.send records one IpcqSendCmd in tl.commands."""
    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0)
    ctx.send(dir="E", src_addr=0x500, nbytes=64, shape=(8,), dtype="f16")

    recorded = [cmd for cmd in ctx.commands if isinstance(cmd, IpcqSendCmd)]
    assert len(recorded) == 1

    only_send = recorded[0]
    assert only_send.direction == "E"
    assert only_send.src_addr == 0x500
    assert only_send.nbytes == 64
|
||||
|
||||
|
||||
def test_tl_recv_command_list_mode():
    """Without a runner, tl.recv records one IpcqRecvCmd and returns a handle."""
    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0)
    placeholder = ctx.recv(dir="W", shape=(8,), dtype="f16")

    recorded = [cmd for cmd in ctx.commands if isinstance(cmd, IpcqRecvCmd)]
    assert len(recorded) == 1
    assert recorded[0].direction == "W"

    # With no runner attached the returned handle is only a placeholder;
    # it still carries the requested shape and dtype.
    assert placeholder.shape == (8,)
    assert placeholder.dtype == "f16"
|
||||
|
||||
|
||||
def test_tl_recv_round_robin_no_dir():
    """tl.recv called without ``dir`` records a command with direction None."""
    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0)
    ctx.recv(shape=(8,), dtype="f16")

    recorded = [cmd for cmd in ctx.commands if isinstance(cmd, IpcqRecvCmd)]
    first_recv = recorded[0]
    assert first_recv.direction is None
|
||||
|
||||
|
||||
# ── Runner mode (greenlet) ──────────────────────────────────────────
|
||||
|
||||
|
||||
class _StubRunner:
|
||||
"""Minimal runner that auto-responds to IpcqSendCmd / IpcqRecvCmd."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.received: list[Any] = []
|
||||
|
||||
def switch_to_simpy(self, cmd: Any) -> Any:
|
||||
self.received.append(cmd)
|
||||
if isinstance(cmd, IpcqSendCmd):
|
||||
return None
|
||||
if isinstance(cmd, IpcqRecvCmd):
|
||||
# Return a fake slot dict
|
||||
return {
|
||||
"data": None,
|
||||
"src_space": "tcm",
|
||||
"src_addr": 0xABCD,
|
||||
"direction": cmd.direction or "E",
|
||||
"dtype": cmd.dtype,
|
||||
"shape": cmd.shape,
|
||||
"nbytes": 16,
|
||||
}
|
||||
return None
|
||||
|
||||
|
||||
def test_tl_send_runner_mode():
    """With a runner attached, tl.send hands one IpcqSendCmd to the runner."""
    stub = _StubRunner()
    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0, runner=stub)
    ctx.send(dir="E", src_addr=0x500, nbytes=64, shape=(8,), dtype="f16")

    seen = stub.received
    assert len(seen) == 1
    assert isinstance(seen[0], IpcqSendCmd)
|
||||
|
||||
|
||||
def test_tl_recv_runner_mode_returns_handle_with_slot_addr():
    """With a runner attached, tl.recv returns a handle at the reported slot."""
    stub = _StubRunner()
    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0, runner=stub)
    handle = ctx.recv(dir="W", shape=(8,), dtype="f16")

    assert isinstance(stub.received[0], IpcqRecvCmd)
    # addr comes from the "src_addr" the stub runner replied with.
    assert handle.addr == 0xABCD
    assert handle.shape == (8,)
    assert handle.dtype == "f16"
|
||||
@@ -0,0 +1,106 @@
|
||||
"""Tests for tl.recv_async + tl.wait (ADR-0023 D4)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from kernbench.ccl.testing import run_kernel_in_mock
|
||||
|
||||
|
||||
def kernel_async_recv(t_ptr, n_elem, tl):
    """Ring kernel that posts recv_async before send, then waits.

    Per step the PE issues a non-blocking recv from W, sends its current
    tile to E, waits for the recv, adds the received tile to the running
    sum with ``+`` and forwards the received tile on the next step. After
    ``world_size - 1`` steps the sum is stored back to the PE's own address.

    Args:
        t_ptr: Base address; rank ``r`` uses ``t_ptr + r * n_elem * 2``.
        n_elem: Elements per tile (2 bytes each, dtype "f16").
        tl: Kernel context providing program_id, num_programs, load,
            recv_async, send, wait and store.
    """
    rank = tl.program_id(axis=0)
    world_size = tl.num_programs(axis=0)
    tile_shape = (n_elem,)
    nbytes = n_elem * 2
    own_addr = t_ptr + rank * nbytes

    total = tl.load(own_addr, shape=tile_shape, dtype="f16")
    outgoing = total

    for _ in range(world_size - 1):
        pending = tl.recv_async(dir="W", shape=tile_shape, dtype="f16")
        tl.send(dir="E", src=outgoing)
        incoming = tl.wait(pending)
        total = total + incoming
        # Pass the tile just received from W on to E in the next round.
        outgoing = incoming

    tl.store(own_addr, total)
|
||||
|
||||
|
||||
def test_recv_async_mock_runtime():
    """In the mock runtime every one of 4 ranks ends with the sum of all inputs."""
    n_elem = 8
    world = 4
    inputs = []
    for r in range(world):
        inputs.append(np.full((n_elem,), float(r + 1), dtype=np.float16))
    expected = sum(inputs)

    outputs = run_kernel_in_mock(
        kernel_fn=kernel_async_recv,
        world_size=world,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem,),
    )

    for r in range(world):
        per_rank = outputs[r]
        assert np.allclose(per_rank, expected)
|
||||
|
||||
|
||||
def test_recv_async_simpy_runner():
    """Run the async kernel through the real SimPy stack via the
    install_ipcq + launch path.

    Rank r's tile is seeded with the constant r + 1; after the launch every
    rank's tile must equal the sum over all ranks (36 for 8 ranks).
    """
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    world_size = 8
    tile_elems = 8
    tile_nbytes = tile_elems * 2  # two bytes per f16 element
    # Every rank accumulates all seeds: 1 + 2 + ... + world_size.
    expected = float(sum(range(1, world_size + 1)))

    topo = resolve_topology("topology.yaml")

    from kernbench.policy.placement.dp import DPPolicy

    def run(torch):
        # The returned plan is not needed here; only the install matters.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * tile_elems),
            dtype="f16",
            dp=DPPolicy(
                sip="replicate", cube="replicate", pe="column_wise",
                num_sips=1, num_cubes=1,
            ),
            name="async_in",
        )
        store = torch.engine.memory_store
        base = a._handle.va_base or a._handle.shards[0].pa
        for r in range(world_size):
            store.write(
                "hbm", base + r * tile_nbytes,
                np.full((tile_elems,), float(r + 1), dtype=np.float16),
            )

        torch.launch("ring_allreduce_tcm", kernel_async_recv, a, tile_elems)

        for r in range(world_size):
            result = store.read(
                "hbm", base + r * tile_nbytes,
                shape=(tile_elems,), dtype="f16",
            )
            assert np.allclose(result, expected, rtol=1e-2, atol=1e-2), \
                f"rank {r}: got {result}, expected {expected}"

    # run is passed directly; wrapping it in a synthetic module added nothing.
    result = run_bench(
        topology=topo, bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok
|
||||
@@ -19,16 +19,19 @@ def test_full_graph_node_count():
|
||||
# + 2 SIPs x (1 IO x 23 io_nodes
|
||||
# + 16 cubes x (32 routers + 1 hbm_ctrl + 1 m_cpu + 1 sram
|
||||
# + 20 ucie (4 ports x (1 port + 4 conn))
|
||||
# + 8 PEs x 8 pe_comps)) (ADR-0021: +pe_fetch_store)
|
||||
# + 8 PEs x 9 pe_comps)) (ADR-0023: +pe_ipcq)
|
||||
# IO: pcie_ep + io_cpu + noc + 4 io_ucie_ports + 4*4 io_ucie_conn = 23
|
||||
# cube: 32 + 3 + 20 + 64 = 119
|
||||
# = 1 + 2*(23 + 16*119) = 1 + 2*(23+1904) = 1 + 3854 = 3855
|
||||
assert len(g.nodes) == 3855
|
||||
# cube: 32 + 3 + 20 + 72 = 127
|
||||
# = 1 + 2*(23 + 16*127) = 1 + 2*(23+2032) = 1 + 4110 = 4111
|
||||
assert len(g.nodes) == 4111
|
||||
|
||||
|
||||
def test_full_graph_edge_count():
    """The full graph has the edge count expected after ADR-0023."""
    g = _graph()
    # Baseline from ADR-0021 (+pe_fetch_store + chaining edges): 12922
    # ADR-0023: +3 IPCQ edges per PE (cpu→ipcq, ipcq→dma, dma→ipcq)
    #   2 SIPs × 16 cubes × 8 PEs × 3 = 768 new edges
    # Cross-SIP routing: +1 reverse pcie_ep→switch edge per SIP = +2
    # 12922 + 768 + 2 = 13692
    assert len(g.edges) == 13692
|
||||
|
||||
|
||||
# -- Full graph: specific nodes exist -----------------------------------------
|
||||
@@ -287,7 +290,7 @@ def test_pe_view_has_all_components():
|
||||
v = _graph().pe_view
|
||||
assert set(v.nodes.keys()) == {
|
||||
"pe_cpu", "pe_scheduler", "pe_dma", "pe_fetch_store",
|
||||
"pe_gemm", "pe_math", "pe_mmu", "pe_tcm",
|
||||
"pe_gemm", "pe_math", "pe_mmu", "pe_tcm", "pe_ipcq",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_pe_template_components():
|
||||
comps = spec["cube"]["pe_template"]["components"]
|
||||
assert set(comps.keys()) == {
|
||||
"pe_cpu", "pe_scheduler", "pe_dma", "pe_fetch_store",
|
||||
"pe_gemm", "pe_math", "pe_mmu", "pe_tcm",
|
||||
"pe_gemm", "pe_math", "pe_mmu", "pe_tcm", "pe_ipcq",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -87,6 +87,37 @@ def test_tl_math_unary_ops():
|
||||
assert ops == ["exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin"]
|
||||
|
||||
|
||||
def test_tl_math_extra_ops():
    """tl.maximum/minimum/fma/clamp/softmax + tl.cdiv (real-Triton parity)."""
    tl = _ctx()
    a, b, c = (
        tl.load(addr, shape=(8, 8), dtype="f16")
        for addr in (0x1000, 0x2000, 0x3000)
    )

    tl.maximum(a, b)
    tl.minimum(a, b)
    tl.fma(a, b, c)
    tl.clamp(a, b, c)
    tl.softmax(a, axis=1)

    math_cmds = [entry for entry in tl.commands if isinstance(entry, MathCmd)]
    assert [entry.op for entry in math_cmds] == [
        "maximum", "minimum", "fma", "clamp", "softmax",
    ]

    # The ternary ops (fma, clamp) must each record three inputs.
    fma_cmd, clamp_cmd, softmax_cmd = math_cmds[2], math_cmds[3], math_cmds[4]
    assert len(fma_cmd.inputs) == 3
    assert len(clamp_cmd.inputs) == 3
    # softmax keeps the axis it was called with.
    assert softmax_cmd.axis == 1

    # cdiv is a scalar helper on the class, not a tensor op.
    from kernbench.triton_emu.tl_context import TLContext
    for numerator, denominator, quotient in ((10, 3, 4), (9, 3, 3), (0, 4, 0)):
        assert TLContext.cdiv(numerator, denominator) == quotient
|
||||
|
||||
|
||||
# ── 5. a + b, a * b → MathCmd ────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user