998cc85762
Major changes:
PE-level IPCQ infrastructure:
- New PE_IPCQ component: ring-buffer control plane with 4-direction
neighbor mapping, head/tail pointers, backpressure (poll/sleep).
- PE_DMA extended with vc_comm channel for IPCQ outbound/inbound DMA,
including in-flight data snapshot (D9) and op_log recording at
outbound time for Phase 2 replay correctness.
- IpcqDmaToken piggyback model: data + metadata travel together,
atomic visibility at receiver (invariant I6).
- Credit return fast path: bottleneck-BW latency, no fabric vc_comm.
Phase 2 data execution (ADR-0020 integration):
- op_log extended: DmaWriteCmd now captures src_space/src_addr for
Phase 2 dma_write copy; ipcq_copy ops recorded at outbound time.
- DataExecutor replays dma_write + ipcq_copy in t_start order.
- Engine._flush_data_phase: incremental cursor-based replay after
each engine.wait() so host reads see post-Phase-2 data.
- KernelRunner Phase 1 writes disabled when op_log is active to
prevent stale data from corrupting the MemoryStore snapshot.
TLContext / kernel API:
- tl.send(dir, src=TensorHandle), tl.recv(dir, shape, dtype),
tl.recv_async, tl.wait(RecvFuture), copy_to_dst mode.
- TensorHandle operator overloading (add/sub/mul/div) via thread-local
active TLContext → MathCmd dispatch through PE_MATH.
- PE-local scratch allocator for math output handles.
- tl.load returns space="hbm" handles for correct Phase 2 addressing.
- Additional math functions: maximum, minimum, fma, clamp, softmax, cdiv.
Unified ccl_allreduce bench (PyTorch-compat host code):
- Single benches/ccl_allreduce.py with run() + worker(rank, ws, torch)
split matching real PyTorch DDP worker pattern.
- torch.distributed facade: init_process_group, get_world_size,
get_rank, get_backend, all_reduce, barrier — only real PyTorch names.
- AhbmCCLBackend: eager install_ipcq at init, all_reduce dispatches
kernel via tensor shard metadata (n_elem from shards[0].nbytes).
- world_size derived from topology spec (sips × cubes × pes_per_cube)
with optional algorithm-level override in ccl.yaml.
Tensor API (PyTorch-compat surface):
- Tensor.numpy(): gather-aware (all shards via VA-based addressing).
- Tensor.copy_(source): scatter from host tensor into sharded target.
- RuntimeContext.from_numpy(arr): host-side staging tensor.
- Tensor.data property fixed to use numpy() (was shards[0]-only).
Algorithm modules moved to src/kernbench/ccl/algorithms/:
- ring_allreduce, mesh_allreduce, tree_allreduce, hello_send.
- Each module exports kernel_args(world_size, n_elem) helper.
- ccl.yaml module paths updated to kernbench.ccl.algorithms.*.
Dead code removed:
- 7 per-variant bench files (ccl_allreduce_{tcm,hbm,sram}, etc.).
- _run_ccl_bench greenlet-per-SIP scheduler.
- benches.loader.is_ccl_bench + run_rank detection.
- benches/ccl/ directory.
Tests:
- New test_ccl_allreduce_matrix.py: 7 parametrized cases
(ring×3 buffers, ring 8/16, mesh 4, tree 7).
- New test_runtime_api_tensor.py: copy_/numpy/from_numpy unit tests.
- Existing tests updated for new import paths + world_size_override.
Docs:
- Korean ccl-author-guide.md and ADR-0023 paths updated.
- New English versions: ccl-author-guide.en.md, ADR-0023.en.md.
502 tests pass.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
207 lines
6.6 KiB
Python
207 lines
6.6 KiB
Python
"""Tests for PE_DMA IPCQ handling (ADR-0023 D8 + D9 atomic).
|
|
|
|
PE_DMA gains two new behaviors:
|
|
1. Outbound: when it receives an IpcqDmaToken from local PE_IPCQ, it
|
|
forwards it through the fabric (next-hop port) toward the peer
|
|
PE_DMA.
|
|
2. Inbound: when it receives a Transaction wrapping an IpcqDmaToken,
|
|
it performs MemoryStore.write at dst_endpoint.buffer_kind/dst_addr
|
|
and forwards IpcqMetaArrival(token) to local PE_IPCQ — both in the
|
|
SAME SimPy step (I6 MUST).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import simpy
|
|
|
|
from kernbench.common.ipcq_types import (
|
|
IpcqDmaToken,
|
|
IpcqEndpoint,
|
|
IpcqMetaArrival,
|
|
)
|
|
from kernbench.components.builtin.pe_dma import PeDmaComponent
|
|
from kernbench.sim_engine.memory_store import MemoryStore
|
|
from kernbench.sim_engine.transaction import Transaction
|
|
from kernbench.topology.types import Node
|
|
|
|
|
|
# ── Mock context ─────────────────────────────────────────────────────
|
|
|
|
|
|
@dataclass
class _MockResolver:
    """Empty resolver placeholder used to fill the ``resolver`` slot of the mock context.

    It declares no fields and no methods.
    """
|
|
|
|
|
|
@dataclass
class _MockRouter:
    """Router stub: every route passes through the single hop ``"fake_router"``."""

    def find_path(self, src: str, dst: str) -> list[str]:
        """Return ``[src, "fake_router", dst]`` for any pair of endpoints.

        Args:
            src: Identifier placed first in the returned path.
            dst: Identifier placed last in the returned path.

        Returns:
            A new three-element list; no lookup or validation is performed.
        """
        hops = [src]
        hops.append("fake_router")
        hops.append(dst)
        return hops
|
|
|
|
|
|
@dataclass
class _MockCtx:
    """Lightweight context object passed as ``ctx`` to the component under test.

    Every attribute has a default, so ``_MockCtx()`` is usable as-is; the
    tests in this module only override ``memory_store``.
    """

    # Each instance gets its own mock router / resolver.
    router: Any = field(default_factory=_MockRouter)
    resolver: Any = field(default_factory=_MockResolver)
    # Backing store for reads/writes; None unless the caller supplies one.
    memory_store: Any = None
    # Left empty by default.
    edge_map: dict = field(default_factory=dict)
    spec: dict = field(default_factory=dict)
    # No op-log recording by default.
    op_logger: Any = None

    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Return a drain time of ``0.0`` regardless of ``path`` or ``nbytes``."""
        return 0.0

    def get_shared_resource(self, env, key, capacity=1):
        """Return a newly created ``simpy.Resource`` bound to ``env``.

        ``key`` is accepted but not used: every call builds a fresh resource
        with the requested ``capacity``.
        """
        resource = simpy.Resource(env, capacity=capacity)
        return resource
|
|
|
|
|
|
def _make_pe_dma(
    env: simpy.Environment, pe_prefix: str, store: MemoryStore | None = None,
) -> PeDmaComponent:
    """Build, wire and start a ``PeDmaComponent`` for the PE named ``pe_prefix``.

    Args:
        env: SimPy environment that owns the port stores and the component.
        pe_prefix: PE identifier prefix (e.g. ``"sip0.cube0.pe1"``); the node
            id is ``"<pe_prefix>.pe_dma"`` and the local IPCQ output port is
            ``"<pe_prefix>.pe_ipcq"``.
        store: Memory store handed to the mock context (may be ``None``).

    Returns:
        The started component with one input port (``"host"``) and two
        output ports (``"fake_router"`` and the local IPCQ port).
    """
    dma_node = Node(
        id=f"{pe_prefix}.pe_dma",
        kind="pe_dma",
        impl="builtin.pe_dma",
        attrs={},
        pos_mm=None,
    )
    dma = PeDmaComponent(dma_node, ctx=_MockCtx(memory_store=store))

    # Ports are plain SimPy stores so tests can inject and inspect messages.
    dma.in_ports["host"] = simpy.Store(env)
    for port_name in ("fake_router", f"{pe_prefix}.pe_ipcq"):
        dma.out_ports[port_name] = simpy.Store(env)

    dma.start(env)
    return dma
|
|
|
|
|
|
def _make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm") -> IpcqEndpoint:
    """Create an ``IpcqEndpoint`` with fixed receive-ring parameters.

    The location (``sip``/``cube``/``pe``) and ``buffer_kind`` come from the
    arguments; the ring always uses ``rx_base_pa=0x10_000``, ``rx_base_va=0``,
    ``n_slots=4`` and ``slot_size=4096``.
    """
    location = {"sip": sip, "cube": cube, "pe": pe}
    ring = {
        "rx_base_pa": 0x10_000,
        "rx_base_va": 0,
        "n_slots": 4,
        "slot_size": 4096,
    }
    return IpcqEndpoint(buffer_kind=buffer_kind, **location, **ring)
|
|
|
|
|
|
# ── Outbound: PE_IPCQ → PE_DMA → fabric ──────────────────────────────
|
|
|
|
|
|
def test_outbound_forwards_token_through_fabric():
    """A bare IpcqDmaToken put on the input port shows up on the fabric port.

    The single item expected on ``"fake_router"`` must be a ``Transaction``
    whose ``request`` is an ``IpcqDmaToken`` with the original ``dst_addr``.
    """
    env = simpy.Environment()
    memory = MemoryStore()
    memory.write("tcm", 0x500, np.arange(16, dtype=np.float16))

    sender = _make_pe_dma(env, "sip0.cube0.pe0", store=memory)

    outbound = IpcqDmaToken(
        src_addr=0x500,
        src_space="tcm",
        dst_addr=0x10_000,
        dst_endpoint=_make_endpoint(pe=1),
        nbytes=32,
        handle_id="t1",
        shape=(16,),
        dtype="f16",
        sender_seq=0,
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_direction="E",
    )
    sender.in_ports["host"].put(outbound)
    env.run(until=10)

    # Exactly one message should be queued on the next-hop port.
    queued = sender.out_ports["fake_router"].items
    assert len(queued) == 1
    wrapped = queued[0]
    assert isinstance(wrapped, Transaction)
    assert isinstance(wrapped.request, IpcqDmaToken)
    assert wrapped.request.dst_addr == 0x10_000
|
|
|
|
|
|
# ── Inbound: PE_DMA → MemoryStore.write + IpcqMetaArrival forward ───
|
|
|
|
|
|
def test_inbound_writes_memory_and_forwards_metadata_atomically():
    """An inbound Transaction lands its payload and emits one IpcqMetaArrival.

    After the transaction's ``done`` event fires, the destination address
    must hold the source data and the local IPCQ port must hold exactly one
    ``IpcqMetaArrival`` carrying the original token.
    """
    env = simpy.Environment()
    memory = MemoryStore()

    # Source data as the sender would have staged it in the MemoryStore.
    payload = np.arange(16, dtype=np.float16) + 100
    memory.write("tcm", 0x500, payload)

    receiver = _make_pe_dma(env, "sip0.cube0.pe1", store=memory)

    inbound = IpcqDmaToken(
        src_addr=0x500,
        src_space="tcm",
        dst_addr=0x10_000,
        dst_endpoint=_make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm"),
        nbytes=32,
        handle_id="t1",
        shape=(16,),
        dtype="f16",
        sender_seq=0,
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_direction="E",
    )

    # The path ends at this PE_DMA, making it the terminal hop.
    completed = env.event()
    receiver.in_ports["host"].put(
        Transaction(
            request=inbound,
            path=["fake_router", "sip0.cube0.pe1.pe_dma"],
            step=1,
            nbytes=32,
            done=completed,
        )
    )
    env.run(until=completed)

    # 1. The payload must be readable at dst_addr.
    landed = memory.read("tcm", 0x10_000, shape=(16,), dtype="f16")
    assert np.array_equal(landed, payload)

    # 2. The metadata notification must be queued for the local PE_IPCQ.
    notifications = receiver.out_ports["sip0.cube0.pe1.pe_ipcq"].items
    assert len(notifications) == 1
    notice = notifications[0]
    assert isinstance(notice, IpcqMetaArrival)
    assert notice.token.sender_seq == 0
    assert notice.token.src_pe == 0
|
|
|
|
|
|
def test_inbound_no_yield_between_write_and_metadata_forward():
    """Soft check: when multiple inbound IPCQ tokens arrive, the order of
    MemoryStore writes and IpcqMetaArrival forwards is preserved (no
    interleaving from extraneous yields).

    Three transactions are queued back to back. The simulation runs until
    every transaction's ``done`` event has fired, then the arrival order and
    the complete contents of each destination buffer are verified.
    """
    env = simpy.Environment()
    store = MemoryStore()

    n_tokens = 3
    # Distinct payload per token so a misplaced write is detectable.
    payloads = [np.arange(8, dtype=np.float16) + i * 10 for i in range(n_tokens)]
    for i, payload in enumerate(payloads):
        store.write("tcm", 0x500 + i * 0x100, payload)

    dst = _make_pe_dma(env, "sip0.cube0.pe1", store=store)
    peer = _make_endpoint(sip=0, cube=0, pe=1)

    completions = []
    for i in range(n_tokens):
        token = IpcqDmaToken(
            src_addr=0x500 + i * 0x100, src_space="tcm",
            dst_addr=0x10_000 + i * 0x100, dst_endpoint=peer,
            nbytes=16, handle_id=f"t{i}",
            shape=(8,), dtype="f16",
            sender_seq=i,
            src_sip=0, src_cube=0, src_pe=0, src_direction="E",
        )
        done = env.event()
        completions.append(done)
        txn = Transaction(
            request=token, path=["fake_router", "sip0.cube0.pe1.pe_dma"],
            step=1, nbytes=16, done=done,
        )
        dst.in_ports["host"].put(txn)

    # Wait for every transaction, not just the last one queued, so a
    # completion that never fires (or fires out of order) cannot go unnoticed.
    env.run(until=env.all_of(completions))

    # Check ordering of arrivals
    ipcq_port = dst.out_ports["sip0.cube0.pe1.pe_ipcq"]
    arrivals = list(ipcq_port.items)
    assert [a.token.sender_seq for a in arrivals] == list(range(n_tokens))

    # Each destination buffer must hold its full payload, not only a
    # matching first element.
    for i, payload in enumerate(payloads):
        arr = store.read("tcm", 0x10_000 + i * 0x100, shape=(8,), dtype="f16")
        assert np.array_equal(arr, payload)
|