diff --git a/benches/ccl_allreduce.py b/benches/ccl_allreduce.py new file mode 100644 index 0000000..c12a168 --- /dev/null +++ b/benches/ccl_allreduce.py @@ -0,0 +1,129 @@ +"""CCL all-reduce bench — single unified entry point. + +Driven entirely by ``ccl.yaml`` + ``topology.yaml``: + +- ``defaults.algorithm`` in ``ccl.yaml`` picks which kernel to run + (``ring_allreduce_{tcm,hbm,sram}`` / ``mesh_allreduce_4`` / + ``tree_allreduce_7``). +- ``world_size`` is derived from the algorithm entry's override or from + the topology spec (``sips × cubes_per_sip × pes_per_cube``). +- The host code uses only real PyTorch ``torch.distributed`` names: + ``init_process_group``, ``get_world_size``, ``get_rank``, ``all_reduce``. + +The bench is split into ``worker(rank, world_size, torch)`` — the +per-rank business logic, designed to look like a real PyTorch DDP +training worker so future model benches can reuse the same skeleton — +and ``run(torch)`` — the kernbench-specific launcher that initializes +the process group and invokes the worker. +""" +from __future__ import annotations + +import numpy as np + +from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config +from kernbench.policy.placement.dp import DPPolicy + +# Default per-rank tile size if ccl.yaml doesn't override it. Real +# pytorch benches hardcode batch/feature dims similarly. +DEFAULT_N_ELEM = 32 + + +def _derive_dp(spec: dict, world_size: int) -> DPPolicy: + """Pick a DPPolicy that fans the tensor across exactly ``world_size`` PEs. + + Mirrors what a real PyTorch DDP user does manually with + ``tensor.to(f"cuda:{rank}")``: the host code chooses the placement so + that the collective sees the right number of participating ranks. 
+ """ + sips = int(spec["system"]["sips"]["count"]) + cm = spec["sip"]["cube_mesh"] + pl = spec["cube"]["pe_layout"] + pes_per_cube = int(pl["pe_per_corner"]) * len(pl["corners"]) + cubes_per_sip = int(cm["w"]) * int(cm["h"]) + total = sips * cubes_per_sip * pes_per_cube + if world_size == total: + return DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise") + if world_size <= pes_per_cube: + return DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, num_pes=world_size, + ) + if world_size <= cubes_per_sip * pes_per_cube: + return DPPolicy( + sip="replicate", cube="column_wise", pe="column_wise", + num_sips=1, num_cubes=world_size // pes_per_cube, + ) + return DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise") + + +def worker(rank: int, world_size: int, torch) -> None: + """Per-rank business logic. Mirrors a real PyTorch DDP worker. + + In real PyTorch DDP, this function runs in N separate processes, + each with its own ``rank``. In kernbench (single-process multi-device) + it is invoked once with ``rank=0`` on the single host driver; the + actual per-PE parallelism is handled by ``torch.launch`` fanning out + the kernel across all participating PEs via the tensor's DPPolicy. + The ``rank`` parameter is therefore always 0 today, and is kept as + an explicit argument for parity with real DDP workers (``if rank == + 0`` logging guards, future multi-host extensions). + """ + cfg = resolve_algorithm_config(load_ccl_config()) + algo_name = cfg["algorithm"] + n_elem = int(cfg.get("n_elem", DEFAULT_N_ELEM)) + + # Pick a DP that produces exactly ``world_size`` shards on this topology. + dp = _derive_dp(torch.spec, world_size) + tensor = torch.zeros( + (1, world_size * n_elem), dtype="f16", dp=dp, name="ccl_in", + ) + + # Initialize: CCL rank r's slice gets value (r + 1). 
Real PyTorch idiom: + # target.copy_(torch.from_numpy(source)) + init = np.zeros((1, world_size * n_elem), dtype=np.float16) + for r in range(world_size): + init[0, r * n_elem : (r + 1) * n_elem] = float(r + 1) + tensor.copy_(torch.from_numpy(init)) + + # The main act: one all_reduce call — the backend installs IPCQ at + # init_process_group time and here only dispatches the kernel. + torch.distributed.all_reduce(tensor, op="sum") + + # Verify: each shard should hold sum(1..world_size) after all-reduce. + result = tensor.numpy() + expected = float(sum(range(1, world_size + 1))) + all_ok = bool(np.allclose(result, expected, rtol=1e-1, atol=1e-1)) + + # Print only on rank 0 — real PyTorch DDP idiom for single-source logs. + if rank == 0: + if all_ok: + print(f" {algo_name} (ws={world_size}): {world_size} OK") + else: + flat = result.reshape(-1) + n_fail = 0 + for r in range(world_size): + slice_r = flat[r * n_elem : (r + 1) * n_elem] + if not np.allclose(slice_r, expected, rtol=1e-1, atol=1e-1): + n_fail += 1 + if n_fail <= 5: + print( + f" [FAIL] rank {r} " + f"(ws={world_size}, algo={algo_name}): " + f"got mean={float(slice_r.mean()):.3f}, " + f"expected={expected:.3f}" + ) + print( + f" {algo_name} (ws={world_size}): " + f"{world_size - n_fail} OK / {n_fail} FAIL" + ) + + +def run(torch) -> None: + """CLI entry point: initialize the process group, invoke worker.""" + dist = torch.distributed + dist.init_process_group(backend="ahbm") + worker( + rank=dist.get_rank(), + world_size=dist.get_world_size(), + torch=torch, + ) diff --git a/benches/loader.py b/benches/loader.py index abc5ac7..fd6bb24 100644 --- a/benches/loader.py +++ b/benches/loader.py @@ -9,29 +9,32 @@ from kernbench.runtime_api.context import RuntimeContext BenchFn = Callable[[RuntimeContext], Any] +def _load_module(bench_id: str): + bench_id = bench_id.strip() + if not bench_id: + raise ValueError("Bench id is empty.") + module_path = f"benches.{bench_id}" + try: + return 
importlib.import_module(module_path) + except ModuleNotFoundError as e: + raise ValueError( + f"Unknown bench '{bench_id}'. Expected module {module_path}.py" + ) from e + + def resolve_bench(bench_id: str) -> BenchFn: - """ - Resolve a bench id into a callable bench function. + """Resolve a bench id into its ``run(torch)`` callable. Expected layout (repo root): benches/.py def run(torch: RuntimeContext) -> Any """ - bench_id = bench_id.strip() - if not bench_id: - raise ValueError("Bench id is empty.") - - module_path = f"benches.{bench_id}" - - try: - mod = importlib.import_module(module_path) - except ModuleNotFoundError as e: - raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e - + mod = _load_module(bench_id) run_fn = getattr(mod, "run", None) if run_fn is None: - raise ValueError(f"Bench module {module_path} must define a 'run(torch)' function.") + raise ValueError( + f"Bench module {mod.__name__} must define 'run(torch)'." + ) if not callable(run_fn): - raise ValueError(f"'run' in {module_path} is not callable.") - + raise ValueError(f"'run' in {mod.__name__} is not callable.") return run_fn diff --git a/ccl.yaml b/ccl.yaml new file mode 100644 index 0000000..4bac308 --- /dev/null +++ b/ccl.yaml @@ -0,0 +1,80 @@ +# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11) +# +# Loaded by AhbmCCLBackend at init_process_group time. +# defaults.algorithm chooses which kernel + topology is installed +# into PE_IPCQ neighbor tables. Host code is unaware of these settings. + +defaults: + # Algorithm to run for this benchmark execution. + algorithm: ring_allreduce_tcm + + # NOTE: world_size is not set here by default. AhbmCCLBackend derives it + # from the chosen algorithm's entry (if it sets ``world_size``) or from + # topology.yaml (``sips × cubes_per_sip × pes_per_cube``). This mirrors + # real PyTorch DDP where ranks/world_size come from env vars, not code. + + # IPCQ ring buffer location. 
+ # tcm — PE-local TCM (fast, small, conflicts with compute TCM access) + # hbm — PE-local HBM (large, slower DMA latency) + # sram — Cube-shared SRAM (medium, cube-internal contention) + buffer_kind: tcm + + # Backpressure mode. + # poll — spin-loop polling of cached peer pointers + # sleep — yield SimPy event, wake on credit return + backpressure: sleep + + # Ring depth: number of slots per (direction, tx|rx) buffer. + n_slots: 4 + + # Slot size in bytes (must hold one tile worth of data). + slot_size: 4096 + + # PE_DMA virtual channel chunk size (D8). First implementation does not + # use chunk-level interleave; this is reserved for future precision. + vc_chunk_size: 256 + + # Credit return fast path message size (D9). Used by bottleneck-BW + # latency calculation. 16-64 bytes typical. + ipcq_credit_size_bytes: 16 + +algorithms: + # ── ring all-reduce, buffer in PE_TCM ── + # Defaults to topology-derived world_size (full system, 256 ranks). + # Use a smaller tile size at high rank counts so f16 sums stay within + # the verification tolerance and op_log replay scales. 
+ ring_allreduce_tcm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: tcm + n_elem: 8 + + # ── ring all-reduce, buffer in PE-local HBM ── + ring_allreduce_hbm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: hbm + n_elem: 8 + + # ── ring all-reduce, buffer in cube SRAM ── + ring_allreduce_sram: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: sram + n_elem: 8 + + # ── 2D mesh all-reduce: perfect square only (2×2 = 4 PEs) ── + mesh_allreduce_4: + module: kernbench.ccl.algorithms.mesh_allreduce + topology: mesh_2d + buffer_kind: tcm + world_size: 4 + n_elem: 16 + + # ── tree all-reduce (binary, 7 PEs) ── + tree_allreduce_7: + module: kernbench.ccl.algorithms.tree_allreduce + topology: tree_binary + buffer_kind: tcm + world_size: 7 + n_elem: 16 diff --git a/components.yaml b/components.yaml index f3ab639..75db756 100644 --- a/components.yaml +++ b/components.yaml @@ -51,5 +51,6 @@ components: builtin.pe_fetch_store: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent builtin.pe_mmu: kernbench.components.builtin.pe_mmu:PeMmuComponent builtin.pe_tcm: kernbench.components.builtin.pe_tcm:PeTcmComponent + builtin.pe_ipcq: kernbench.components.builtin.pe_ipcq:PeIpcqComponent # Custom — add your implementations here diff --git a/docs/adr/ADR-0023-ipcq-pe-collective.en.md b/docs/adr/ADR-0023-ipcq-pe-collective.en.md new file mode 100644 index 0000000..c97f20e --- /dev/null +++ b/docs/adr/ADR-0023-ipcq-pe-collective.en.md @@ -0,0 +1,866 @@ +# ADR-0023: PE-level IPCQ — Inter-PE Collective Communication + +## Status + +Proposed + +## Context + +### Goal + +Add the infrastructure that lets CCL (Collective Communication Library) +kernels run **inside** a PE. The host just launches a kernel on each +SIP; the actual synchronization and data movement happen **inside the +PE kernel via an IPCQ (Inter-Process Communication Queue)**. 
+ +This mirrors how NCCL performs NVLink communication inside a GPU +kernel, or how Cerebras / Tenstorrent expose core-local communication +queues. Host-level collectives (`dist.all_reduce`) are deferred to +**future work**; this ADR focuses solely on the kernel-side collective +infrastructure. + +### Current state + +- ADR-0021 PE pipeline refactor: each PE is decomposed into components + (PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH, + PE_TCM, PE_MMU). +- No direct PE-to-PE channel exists today. All data movement goes + through PE_DMA → cube_noc / UCIe / PCIE → HBM. +- A pre-ADR host CCL skeleton exists (`dist.init_process_group(backend="ahbm")`, + `_run_ccl_bench` running per-rank greenlets concurrently). The + collective itself is a stub. + +### Problems to solve + +1. PE-to-PE direct data movement (writing into a peer's memory). +2. Synchronization — the sender must check that the receiver has space + in its buffer (backpressure). +3. Resource contention between compute traffic and communication + traffic (Head-of-Line blocking). +4. The host must be able to construct logical neighbor topologies + (ring / mesh / tree) per algorithm. + +--- + +## Decision + +### D1. Add a new `PE_IPCQ` component + +A new component `PE_IPCQ` is added inside each PE. It follows the same +pattern as PE_GEMM / PE_MATH — modeling a sub-block of the PE as a +distinct component. + +``` +PE +├── PE_CPU +├── PE_SCHEDULER +├── PE_DMA +├── PE_IPCQ ← new +├── PE_FETCH_STORE +├── PE_GEMM +├── PE_MATH +├── PE_TCM +├── PE_MMU +``` + +**Role separation** (control plane vs. data plane): + +- **PE_IPCQ (control plane)**: ring-buffer address arithmetic, head / + tail pointer management, peer pointer caches, backpressure, 4-direction + neighbor mapping. +- **PE_DMA (data plane)**: actually moves data through cube_noc / UCIe + / PCIE into the peer's memory. + +PE_IPCQ does **not** move data itself — it delegates to PE_DMA. + +### D2. 
Ring buffer model + +Each PE owns 4 directions (N/S/E/W) × {tx, rx} = 8 ring buffers. + +```python +@dataclass +class IpcqQueuePair: + direction: Direction # N/S/E/W + peer: IpcqEndpoint # set by host at init time (D2.5) + tx_buffer_base: int # outgoing data base addr (in our memory) + rx_buffer_base: int # incoming data base addr (in our memory) + slot_size: int # 1 tile per slot + n_slots: int # ring depth + my_head: int # next slot we will write/send into + my_tail: int # next slot we will read/recv from + peer_head_cache: int # peer's last-seen head (updated via D9 piggyback) + peer_tail_cache: int # peer's last-seen tail (updated via D9 fast-path credit) +``` + +**Canonical field names**: throughout this ADR the four names above +(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`) are used +consistently. Synonyms (`peer_head_local`, `peer_head`, `peer_tail`, +etc.) are not used. + +| Field | Owner | Updated when | +|-------|-------|--------------| +| `my_head` | local PE_IPCQ | immediately after `tl.send` (send tracking) | +| `my_tail` | local PE_IPCQ | immediately after `tl.recv` (recv tracking) | +| `peer_head_cache` | local PE_IPCQ | on `IpcqMetaArrival` (D9 piggyback) | +| `peer_tail_cache` | local PE_IPCQ | on `IpcqCreditMetadata` (D9 fast path) | + +**Slot unit**: fixed-size, one slot holds one full tile (no descriptor +indirection). Full data embedded in the slot. See D5. + +### D2.5. 
`IpcqEndpoint` schema + +`IpcqQueuePair.peer` carries everything the sender needs to compute the +peer's rx slot address: + +```python +@dataclass(frozen=True) +class IpcqEndpoint: + sip: int + cube: int + pe: int + buffer_kind: str # "tcm" | "hbm" | "sram" + rx_base_pa: int # peer rx_buffer base PA (PhysAddr.encode()) + rx_base_va: int # peer rx_buffer base VA (optional, MMU mode) + n_slots: int # peer ring depth (for wrap-around) + slot_size: int # peer slot size (for offset) +``` + +Address computation: + +```python +slot_idx = self.my_head % peer.n_slots +dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size +``` + +PE_IPCQ passes `dst_pa` to PE_DMA inside an `IpcqDmaToken`. PE_DMA +(vc_comm) routes the data to `dst_pa` through the fabric. + +**Endpoint construction order**: at backend init (D10), the IPCQ +buffers for **every PE** are allocated first (so each rank knows the +others' PA), then the per-rank neighbor tables are built and pushed to +PE_IPCQ via `IpcqInitMsg`. + +### D3. Four-direction mapping ≡ logical ProcessGroup + +The PE views four directions (N/S/E/W) as logical ports. Real peer +addresses are configured by the host CCL init, per the chosen +algorithm. The PE kernel never knows the topology, only directions. + +```python +# 1D ring +for rank in range(world_size): + ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size]) + ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size]) + +# 2D mesh +for r in range(R): + for c in range(C): + ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c)) + ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c)) + ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C)) + ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C)) +``` + +The PE code does not need to know where `tl.send(dir="E", ...)` actually +ends up. + +### D4. 
PE kernel API + +```python +# Send (blocking; may stall on backpressure) +tl.send(dir: str, src=TensorHandle) +tl.send(dir: str, src_addr=..., nbytes=..., shape=..., dtype=..., space=...) + +# Recv (blocking) +recv = tl.recv(dir: str, shape=..., dtype=...) +recv = tl.recv(shape=..., dtype=...) # round-robin across 4 directions + +# Recv (non-blocking) +fut = tl.recv_async(dir: str, shape=..., dtype=...) +recv = tl.wait(fut) +``` + +`tl.recv()` (no direction) keeps a `last_polled_dir` cursor and on each +call rotates through directions, returning the first available slot. +Empty in all 4 directions → wait. + +**Fairness is weak**: the rotating start mitigates simple bias, but if +one direction always wins the race the others can starve. Algorithms +that need strict fairness must call `tl.recv(dir=...)` explicitly. + +### D5. Single-hop DMA write + full-data slot model + +Data moves from sender memory into the receiver's ring slot in **one +DMA transfer**. Key properties: + +- **Single-hop**: the sender already knows the peer rx slot address and + fires one fabric DMA into it. +- **No CPU memcpy**: the CPU never copies data. +- **No intermediate staging**: neither side keeps a separate staging + buffer (sender uses the source addr directly; receiver gets the data + in its ring slot directly). + +(Strictly speaking the fabric DMA write does happen, so this is not +literally "no data movement" — it's the same property NCCL labels +"zero-copy", meaning no CPU memcpy and no staging copy.) + +``` +PE A: tl.send(E, src_addr, nbytes) + 1. IPCQ computes the peer rx slot address: + dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size + 2. Backpressure: my_head - peer_tail_cache < peer.n_slots ? + (full → sleep / poll) + 3. Submit DMA on PE_DMA(vc_comm): src_addr → peer dst_addr, nbytes + 4. my_head += 1 + +PE B: data = tl.recv(W) + 1. Look at rx_buffer[my_tail % n_slots] + 2. Wait for the data to arrive (D7 backpressure mode) + 3. 
Return the slot address to the kernel (or fetch into register file) + 4. my_tail += 1 + 5. Issue a credit-return fast path (D9): after the bottleneck-BW + latency the peer A's peer_tail_cache is updated. +``` + +The slot holds the full tile. The receiver only reads its own +rx_buffer; it never reads back into A's memory. The sender knows the +peer rx slot address and DMAs directly into it (single-hop). + +The PE's own PE_TCM read/write does not go through DMA (PE_TCM is local +to the PE). + +### D6. Buffer placement — three-way benchmark + +The host CCL init picks the IPCQ ring-buffer location: + +```python +ipcq_init( + backend="ahbm", + buffer_kind="tcm" | "hbm" | "sram", + n_slots=8, + slot_size=4096, +) +``` + +| Location | Trait | Trade-off | +|----------|-------|-----------| +| **PE_TCM** | Attached to the PE; fast | Small; competes with PE-internal resources | +| **PE-local HBM** | Large; via DMA | Higher latency | +| **Cube SRAM** | Mid-size; cube-shared | Cube-internal contention | + +All three locations run the same kernel code; only the init differs. + +### D7. Backpressure — two-mode benchmark + +How the sender or receiver waits when peer slots are full / data not +yet arrived: + +| Mode | Behavior | Model | +|------|----------|-------| +| **poll** | Periodically re-check the cached peer pointer | Spin loop | +| **sleep** | Yield a SimPy event; wake on a peer-trigger | Interrupt-like | + +```python +ipcq_init(backpressure="poll" | "sleep", ...) +``` + +Both modes are implemented so latency / throughput trade-offs can be +benchmarked. + +### D8. PE_DMA virtual channels + +Extend PE_DMA from a single queue into a **two-channel virtual-channel** +model. + +``` +PE_DMA +├── vc_compute: tile load / store / writeback for GEMM and Math +└── vc_comm: IPCQ send data +``` + +Each VC has an independent state machine: + +- One channel stalling does not block the other. +- The same physical link (cube_noc, UCIe, …) is shared, but link BW is + split between channels. 
+ +**Chunk-level interleave**: + +- Large GEMM tile DMAs do not lock the link end-to-end. +- Progress happens in chunks (e.g. 256 B); each chunk shares link BW + with the other VC's pending chunks. +- Chunk size is an init parameter (smaller = fairer, larger = more + efficient). + +Net effect: + +- HoL blocking is eliminated (an IPCQ send can interleave with a long + compute DMA). +- Compute / comm overlap is natural (NVIDIA copy-engine + compute-SM + pattern). +- Matches the NoC-virtual-channel pattern used in real HW. + +**First-implementation accuracy limit (intentional)**: this ADR's +first cut uses **deterministic chunk-level interleave + weighted +round-robin arbitration** (default 50 / 50, exposed in `ccl.yaml`). +This is a first-order approximation and is simpler than real HW +dynamic-contention / credit-based arbiters. Functional correctness is +unaffected, but heavy-contention scenarios may report slightly +optimistic latency vs. real HW. A separate ADR can add a NoC arbiter +component later if more precision is needed. + +#### Token routing + +- Compute tokens (`TileToken`) — go through the existing + PE_FETCH_STORE → PE_DMA chain. +- Communication tokens (`IpcqDmaToken`, new) — PE_IPCQ → PE_DMA + self-routing. +- PE_DMA picks the channel by token type. + +```python +class PeDmaComponent: + def _process(self, env, token): + if isinstance(token, IpcqDmaToken): + yield from self._vc_comm_process(env, token) + else: + yield from self._vc_compute_process(env, token) +``` + +### D9. Pointer synchronization — DMA payload piggyback + +Real HW (NVLink, UCIe, etc.) piggybacks metadata onto DMA payloads so +pointers update along with the data. This simulation adopts the same +model: **no separate control channel** — metadata travels with the +data. + +The big benefits: + +- **Automatic ordering**: data and metadata move on the same token, so + data is visible **before** the head_cache update. No race. +- **HW fidelity**: matches NVLink / UCIe piggybacked headers. 
+- **Component simplification**: no separate `IpcqPtrUpdate` event type. + +#### Send flow (head update via piggyback) + +``` +PE A: tl.send(E, src_addr, nbytes) + 1. PE_IPCQ checks backpressure (using peer_tail_cache) + 2. PE_IPCQ creates an IpcqDmaToken: + - data body (src_addr → peer dst_addr) + - piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction) + 3. Hand the token to PE_DMA(vc_comm) + 4. PE A increments my_head (send tracking) + +[fabric DMA: latency elapses] + +PE B's PE_DMA receives the token + 5. Writes data into dst_addr (B's rx slot) via MemoryStore.write + 6. Forwards token metadata to PE B's PE_IPCQ (PE-internal wire, ~1 cycle) + +PE B's PE_IPCQ receives the metadata + 7. Updates peer_head_cache (= A's head) + 8. Wakes any pending recv on that direction +``` + +**Steps 5 and 6 must execute in the same SimPy step** — DMA completion +makes data and metadata atomically visible. + +#### Recv flow (credit return — fast path with bottleneck-BW latency) + +When the receiver frees a slot, the sender must learn about it +(backpressure release). Unlike data, the credit return does **not** +travel through general vc_comm fabric — it uses a **separate fast +path**, an abstraction of the NVLink / UCIe credit-return wire. + +**Latency** is computed from the **bottleneck BW on the path**, not a +magic constant: + +``` +credit_size_bytes = 16 (ccl.yaml: ipcq_credit_size_bytes) +path = router.find_path(self_pe, peer_pe) +latency = compute_drain_ns(path, credit_size_bytes) + = credit_size_bytes / bottleneck_bw_on_path +``` + +That gives us: + +- **Topology-proportional approximation**: an in-cube credit return is + automatically faster than a cross-SIP credit return. +- **No magic constants**: no arbitrary `ipcq_ctrl_latency_ns`. +- **No deadlock risk**: unlike piggyback, B can issue credit even when + it has no data to send back. +- **Reuses existing utility**: `ComponentContext.compute_drain_ns`. 
+ +#### Component coupling — SimPy Store channel + +PE B's PE_IPCQ does not call PE A's PE_IPCQ directly. Instead, at init +time, **a SimPy Store is wired between the two** (a per-direction +fast-path channel) and credit metadata is `put` into that store. + +```python +class PeIpcqComponent: + def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns): + yield env.timeout(latency_ns) + yield peer_credit_store.put(IpcqCreditMetadata(consumer_seq=my_tail, ...)) +``` + +Backend init wires both directions of the fast-path channel as part of +fan-out (see `IpcqInitMsg` in D12). + +#### Credit-return fast path limitations + +- `credit_size_bytes` is an estimate (typically 16–64 bytes). +- The fast path is **excluded from vc_comm BW contention** (separate + wire). Real HW credit-return wires are very lightweight, so this is a + reasonable first approximation. +- A follow-up ADR can: model the credit fast path as a separate link + (BW limit + contention), or switch to piggyback (`credit_return_mode: + piggyback`). + +#### PE_DMA's added responsibility + +When `vc_comm` receives a token, PE_DMA processes it as the following +**atomic** sequence. **No SimPy yield is allowed between the two steps** +(invariant I6): + +```python +def _on_vc_comm_recv(self, env, token): + # ── ATOMIC: no yield between these two operations ── + data = self._memory_store.read(token.src_space, token.src_addr, + shape=..., dtype=...) + self._memory_store.write(token.dst_endpoint.buffer_kind, + token.dst_addr, data) + # Then forward the metadata to the local PE_IPCQ + yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token)) + # ─────────────────────────────────────────────────── +``` + +The final `put` is yieldable but uses an unbounded internal store, so +it completes in a single step. That `put` is the closing call of the +atomic block; nothing may be inserted between the write and the `put`. + +### D9.5. ADR-0020 (2-pass) integration + +`tl.send` / `tl.recv` integrates with ADR-0020's two-pass model. 
Phase +1 simulates timing **and** moves data via MemoryStore; Phase 2 enables +op-log-based correctness verification. + +#### Phase 1 (timing + data) + +D9 models head and tail updates with two different mechanisms: + +- **Send-side (head update)** — DMA payload piggyback. Data write and + metadata forward happen in the same SimPy step → automatic atomic + visibility. +- **Recv-side (tail credit return)** — fast-path SimPy Store channel + with bottleneck-BW latency, then `peer_tail_cache` update. + +Together they preserve ring-buffer pointer consistency. + +The op-log records `op_kind="ipcq"` entries for sends (with +`src/dst/space/addr/nbytes/dir/dtype/shape/sender_seq`) and recvs (with +`recv_mode/src/dst/space/addr/nbytes/dir/dtype/shape/consumer_seq`). +Two recv modes: + +- **`return_slot`** (default): the slot address is returned to the + kernel. Zero-copy. +- **`copy_to_dst`**: when the kernel passes `dst_addr` + `dst_space`, + PE_IPCQ copies the slot data into the user dst. + +#### Phase 2 (op_log replay) + +When `DataExecutor` encounters an `op_kind="ipcq"` record: + +- **send**: idempotent `src → dst` ndarray write. +- **recv (`return_slot`)**: no-op (the slot already holds the data). +- **recv (`copy_to_dst`)**: idempotent `slot → dst_addr` copy. + +IPCQ ops are pure data movement — Phase 2 has nothing extra to compute. +The downstream GEMM / Math ops in `DataExecutor` will consume the data +and naturally validate correctness. + +### D10. Host CCL init keeps the PyTorch shape + +The host code looks just like real PyTorch DDP. `init_process_group` +creates the backend object; it does **not** receive IPCQ knobs +(neighbor topology, buffer_kind, backpressure …). + +```python +# benches/ccl_allreduce.py — same shape as real PyTorch +def worker(rank, world_size, torch): + dist = torch.distributed + dist.init_process_group(backend="ahbm") # reads ccl.yaml + topology + tensor = torch.zeros((1, world_size * N_ELEM), dtype="f16", dp=...) 
+ tensor.copy_(torch.from_numpy(init)) + dist.all_reduce(tensor, op="sum") +``` + +The IPCQ configuration is decided by the backend at +`init_process_group` time: it loads `ccl.yaml`, picks the algorithm, +and pushes IPCQ neighbor tables to every participating PE_IPCQ. The +host code never has to know about IPCQ. + +A bench runs one algorithm, chosen via `ccl.yaml`'s `defaults.algorithm`. +Switching algorithms is purely a `ccl.yaml` change — no host edits +required. + +#### Init flow (eager) + +1. `init_process_group(backend="ahbm")` is called. +2. Backend loads `ccl.yaml` → resolves `defaults.algorithm`. +3. Pulls topology + buffer_kind + backpressure + slot config from + `algorithms[defaults.algorithm]`. +4. **Immediately** installs neighbor tables on every PE_IPCQ + (sideband or fabric `IpcqInitMsg`). +5. Subsequent `torch.launch(kernel_name, ...)` calls behave normally — + PE_IPCQ is already prepared whether the kernel is a CCL kernel or + not. + +### D11. CCL config file (`ccl.yaml`) + +IPCQ config and algorithm metadata live in a separate YAML file, +following the same pattern as `components.yaml` and `topology.yaml`. + +A single benchmark execution runs one algorithm +(`defaults.algorithm`). Switching algorithms means editing +`defaults.algorithm` only. 
+ +```yaml +defaults: + algorithm: ring_allreduce_tcm + buffer_kind: tcm # tcm | hbm | sram + backpressure: sleep # poll | sleep + n_slots: 8 + slot_size: 4096 + vc_chunk_size: 256 + ipcq_credit_size_bytes: 16 + +algorithms: + ring_allreduce_tcm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d # builtin name or "custom" + buffer_kind: tcm + n_elem: 8 # optional, per-algorithm tile width + + tree_allreduce_7: + module: kernbench.ccl.algorithms.tree_allreduce + topology: tree_binary + buffer_kind: tcm + world_size: 7 # algorithm-level override + n_elem: 16 + + custom_mesh: + module: kernbench.ccl.algorithms.custom_mesh + topology: custom # the module supplies its own neighbors() +``` + +`world_size` is **not set in `defaults`**. The backend resolves it via: +`algorithm-level override > defaults override > topology spec`. The +last fallback (`sips × cubes_per_sip × pes_per_cube`) mirrors real DDP +where `WORLD_SIZE` comes from env vars rather than config files. + +#### Algorithm module structure + +Each algorithm module exports two hooks — `kernel` (required) and +`neighbors` (optional) — plus a `kernel_args` helper that the +backend uses to populate positional kernel arguments at `all_reduce` +time: + +```python +# src/kernbench/ccl/algorithms/ring_allreduce.py + +def kernel_args(world_size: int, n_elem: int) -> tuple: + return (n_elem, world_size) + + +def kernel(t_ptr, n_elem, world_size, tl): + """Required — the PE kernel. + + IPCQ is already installed by the backend before this is called. + The kernel only uses the four-direction send / recv API. + """ + ... + + +def neighbors(rank, world_size, neighbor_map): + """Optional — override the builtin topology's neighbor map. + + Returns a new dict, the modified-in-place dict, or None to keep the + builtin map. + """ + return None +``` + +#### `neighbors` override patterns + +- **Pattern A — tweak a builtin**: drop a direction for some ranks, etc. 
+- **Pattern B — replace entirely**: ignore `neighbor_map` and return a + brand-new dict. +- **Pattern C — keep builtin**: omit `neighbors` or return None. + +#### Builtin topologies + +| topology | direction set | +|----------|---------------| +| `ring_1d` | E, W | +| `ring_1d_unidir` | E only | +| `mesh_2d` | N, S, E, W | +| `tree_binary` | parent, child_left, child_right | +| `none` | (empty) — algorithm must supply `neighbors()` | + +#### Adding a new algorithm + +1. Write `kernel` and `kernel_args` in + `src/kernbench/ccl/algorithms/your_algorithm.py`. +2. Add an entry in `ccl.yaml`'s `algorithms` section. +3. (Optional) provide `neighbors()` for custom topology. +4. Set `defaults.algorithm` to the new algorithm. + +The host bench (`benches/ccl_allreduce.py`) does not change. + +### D12. Message / token schema + +This ADR adds the following message types. They live in +`src/kernbench/common/pe_commands.py` and +`src/kernbench/runtime_api/kernel.py`. + +#### `IpcqInitMsg` (sideband, fan-out at init) + +The backend pushes neighbor tables to every PE_IPCQ. Structure mirrors +`MmuMapMsg` (`target_sips`, `target_cubes`, `target_pe`, `entries`). +Each `IpcqInitEntry` has `direction`, `peer: IpcqEndpoint`, +`my_rx_base_pa/va`, `n_slots`, `slot_size`, plus a `peer_credit_store` +field — a `simpy.Store` instance pre-wired so the credit-issuing PE_IPCQ (the data receiver) can +push `IpcqCreditMetadata` directly into the data sender's input queue. + +#### `IpcqSendCmd` (PE_CPU → PE_IPCQ) + +Carries `direction`, source addr/space, nbytes, shape, dtype, and a +handle id. `data_op=True` so it lands in the op_log. + +#### `IpcqRecvCmd` (PE_CPU → PE_IPCQ) + +Carries `direction` (or None for round-robin), `recv_mode` +(`return_slot` / `copy_to_dst`), optional `dst_addr/dst_space`, shape, +dtype, blocking flag. + +#### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm channel) + +Per D9 piggyback: the token carries the data (`src/dst/space/nbytes`) +plus the head metadata (`sender_seq`, `src_sip/cube/pe`, +`src_direction`). 
PE_DMA picks the channel by token type +(`IpcqDmaToken → vc_comm`, `TileToken → vc_compute`). + +The receiver's PE_DMA, on token arrival, performs the I6 atomic +sequence: write data into MemoryStore, then forward `IpcqMetaArrival` +to the local PE_IPCQ. + +#### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path) + +Carries `consumer_seq` (= my_tail), source PE coords, and source +direction. Travels through the dedicated SimPy Store channel rather +than `vc_comm`. Latency = `credit_size_bytes / bottleneck_bw_on_path`. + +There is **no `IpcqPtrUpdate` event** — head updates flow via D9 +piggyback, tail updates via the D9 fast-path channel. + +### D13. Test strategy + +Following the ADR-0021 D8 pattern. + +#### T1. Unit tests (component-level) + +- **PE_IPCQ** (`tests/test_pe_ipcq.py`): send without backpressure + immediately forwards a token; full peer slot triggers backpressure + (poll / sleep modes); recv waits, wakes on `IpcqMetaArrival`; + round-robin recv weak fairness; bad direction → `IpcqInvalidDirection`. +- **PE_DMA virtual channels** (`tests/test_pe_dma_vc.py`): `vc_compute` + / `vc_comm` independent progress, chunk interleave, BW split. +- **Builtin topology** (`tests/test_ccl_topologies.py`): ring_1d / + mesh_2d / tree_binary correctness, mesh_2d non-square → + `ValueError`, custom resolver returns the module's `neighbors`. + +#### T2. Integration tests (E2E send/recv) + +- **`tests/test_ipcq_e2e.py`**: 2-rank ring, 4-rank ring (bidirectional + no-deadlock), 4×4 mesh. +- **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`): greenlet mode + records `ipcq` ops in op_log; DataExecutor produces correct + `out.data`. + +#### T3. Backend init (`tests/test_ccl_backend_ipcq.py`) + +`ccl.yaml` load, builtin topology → `IpcqInitMsg` fan-out, endpoint PA +consistency, per-`buffer_kind` allocation. + +#### T4. Regression + +All existing tests pass; ADR-0020 op_log / DataExecutor unaffected for +non-CCL benches. + +#### T5. 
Performance / overhead + +Single send/recv pair latency = (DMA latency) + (IPCQ overhead). +Should be close to a regular PE_DMA write of the same nbytes (IPCQ +overhead < 100 ns). + +### D14. Invariants and failure modes + +#### Invariants + +I1. **Slot lifecycle exactly-once**: one send → exactly one recv. +I2. **Pointer monotonicity**: `my_head` / `my_tail` monotonically + non-decreasing; `sender_seq` strictly increasing. +I3. **Endpoint consistency**: if rank A's `direction=E` peer is rank + B, then rank B's reverse-direction peer must be rank A. Verified at + init. +I4. **`buffer_kind` consistency**: all PEs in a process group share + the same `buffer_kind` (no mixed mode in the first cut). +I5. **op_log ordering**: send → DMA complete → recv possible. The + t_start order in op_log respects this causality. +I6. **Atomic data + metadata visibility (MUST)**: at the receiver + side, data write (`MemoryStore.write`) and metadata forward + (`peer_head_cache` update) **must execute in the same SimPy step**. + No yield is allowed between the two operations in PE_DMA's vc_comm + handler. Code review must reject any inserted `yield` (or `yield + from`) — it would create a race where head_cache becomes visible + before or after the data. +I7. **MemoryStore slot existence ↔ pointer**: as a consequence of I6, + the step in which `peer_head_cache > my_tail` becomes truthy is the + same step in which the slot data is observable. + +#### Failure modes (runtime errors) + +F1. **Bad direction**: `tl.send(dir="X")` for an uninstalled direction + → `IpcqInvalidDirection`, simulation aborts. +F2. **Type mismatch**: dtype/shape/nbytes disagreement between matched + send and recv. Not validated by default; opt-in strict mode catches + it (`strict_validation: true` in a PE_IPCQ node's attrs). +F3. **Deadlock detection (timeout-based)**: the simulator empties its + schedule while a send/recv is still pending → engine raises + `IpcqDeadlock` and embeds a pointer dump. +F4.
**Backend init failure**: missing `defaults.algorithm`, missing + `algorithms[name]`, module import failure, topology validation + failure (I3, I4) — all raised at `init_process_group` time. +F5. **Slot full + infinite backpressure**: the peer never recvs. + Surfaces as F3 timeout. + +#### Diagnostics + +- **CCL trace**: `KERNBENCH_CCL_TRACE=1` logs each send/recv as + `(rank, t, dir, nbytes)`. +- **Pointer dump**: `kernbench.ccl.diagnostics.pointer_dump(engine)` + prints every PE_IPCQ ring buffer's `my_head`, `my_tail`, + `peer_head_cache`, `peer_tail_cache`. +- **Deadlock dump**: on hang the engine includes the pointer dump in + the `IpcqDeadlock` exception message. + +### D15. Algorithm-author cheat sheet + +Full step-by-step lives in +[`docs/ccl-author-guide.en.md`](../ccl-author-guide.en.md). The +shortest version: + +| Things you touch | Things you don't | +|------------------|-------------------| +| `src/kernbench/ccl/algorithms/<name>.py` (`kernel`, `kernel_args`, optional `neighbors`) | `benches/ccl_allreduce.py` host code | +| One entry in `ccl.yaml` + optionally `defaults.algorithm` | `src/kernbench/ccl/` framework | +| (Optional) `tests/test_<name>.py` mock test | PE_IPCQ component, AhbmCCLBackend | + +5-step flow: write the kernel → register in `ccl.yaml` → optional +`neighbors` override → optional mock unit test → SimPy validation via +`kernbench run --bench ccl_allreduce --verify-data`. + +Common mistakes: using a direction that wasn't installed, sends +without matching recvs (deadlock), dtype/shape disagreement, assuming +fairness from `tl.recv()` round-robin, confusing +`tl.num_programs(axis)` with the CCL group size. + +--- + +## Non-goals + +- **Host collective**: a model where `dist.all_reduce` itself moves + data on the host side is out of scope. This ADR only covers + communication that happens inside the PE kernel. +- **All-reduce algorithms**: ring / tree / etc. live in algorithm + modules and can be added without amending this ADR.
+- **Reliability / error handling**: link faults, send/recv failure + recovery, etc. are out of scope. +- **NoC arbiter precision**: dynamic VC contention is left for a future + ADR (see D8). + +--- + +## Open questions + +- **VC arbitration accuracy** — the first cut uses deterministic + chunk interleave + weighted round-robin; heavy contention may report + optimistic latency. A NoC arbiter component can be added later. +- **Credit return BW model** — the fast path is currently outside the + fabric BW contention model. Can be modeled as a separate link or + switched to piggyback (`credit_return_mode: piggyback`). +- **Ring buffer slot allocation metadata** — whether the host pushes + IPCQ buffer metadata via sideband or via a fabric message similar to + `MmuMapMsg` is open. +- **VC BW split default** — 50/50 vs. weighted (e.g. 80/20). Exposed in + `ccl.yaml`; default value TBD. +- **Direction count** — 4 (N/S/E/W) is fixed in the first cut; 6 + (with Up/Down for 3D) or N (variable) is future work. +- **Multi-tile aggregation primitives** — whether + `tl.recv_all` or similar is needed for fan-in. +- **Round-robin recv fairness** — current weak fairness can starve; + strict fairness counter is future work. +- **Deadlock detection precision** — currently timeout-based; a + realtime wait-for graph would enable deterministic detection. + +--- + +## Consequences + +### Positive + +- PE-to-PE direct communication enables CCL kernels to be written. +- Host stays minimal (just `launch`), synchronization happens inside + the PE → strong compute / comm overlap. +- VCs eliminate HoL blocking → collective latency is not blocked by + compute traffic. +- Buffer placement and backpressure mode are init-time parameters → + easy to benchmark. +- Four-direction logical neighbors → host is free to map + ring/mesh/tree algorithms. + +### Negative + +- One new component (PE_IPCQ) and a redesigned PE_DMA (VCs). +- IPCQ memory cost = 8 rings × `slot_size` × `n_slots` per PE. 
+- VC arbitration is a first-order approximation; heavy contention + scenarios may report slightly optimistic latency vs real HW (D8). +- Chunk-level interleave makes PE_DMA implementation more complex. + +--- + +## Affected files + +| File | Change | +|------|--------| +| `topology.yaml` | Add `pe_ipcq` to `pe_template`, plus the IPCQ ↔ DMA / CPU / TCM edges. | +| `components.yaml` | Register `pe_ipcq_v1`. | +| `src/kernbench/topology/builder.py` | Wire the IPCQ chain into PE-internal edges. | +| `src/kernbench/components/builtin/pe_ipcq.py` | New. | +| `src/kernbench/components/builtin/pe_dma.py` | Add VCs, handle `IpcqDmaToken`. | +| `src/kernbench/common/pe_commands.py` | `IpcqSendCmd`, `IpcqRecvCmd`, `IpcqDmaToken`. | +| `src/kernbench/triton_emu/tl_context.py` | `tl.send` / `tl.recv` API. | +| `src/kernbench/runtime_api/distributed.py` | Eager IPCQ install in `AhbmCCLBackend.__init__`. | +| `src/kernbench/runtime_api/kernel.py` | `IpcqInitMsg` definition. | +| `src/kernbench/ccl/__init__.py` | New CCL package. | +| `src/kernbench/ccl/topologies.py` | Builtin topology generators + `resolve_topology()`. | +| `src/kernbench/ccl/helpers.py` | Algorithm-author helpers (`chunked`, `ring_step`, `tree_step`). | +| `src/kernbench/ccl/testing.py` | Mock CCL runtime (`run_kernel_in_mock`). | +| `src/kernbench/ccl/algorithms/*.py` | Algorithm modules (kernel + `kernel_args` + optional `neighbors`). | +| `ccl.yaml` | Algorithm metadata + IPCQ defaults. | +| `tests/test_pe_ipcq.py` | PE_IPCQ unit tests. | +| `tests/test_pe_dma_vc.py` | PE_DMA VC tests. | +| `tests/test_ipcq_e2e.py` | end-to-end send/recv tests. | +| `tests/test_ccl_topologies.py` | Builtin topology generator tests. | +| `tests/test_ccl_allreduce_matrix.py` | Unified bench × algorithm matrix. 
| diff --git a/docs/adr/ADR-0023-ipcq-pe-collective.md b/docs/adr/ADR-0023-ipcq-pe-collective.md new file mode 100644 index 0000000..77369dd --- /dev/null +++ b/docs/adr/ADR-0023-ipcq-pe-collective.md @@ -0,0 +1,1220 @@ +# ADR-0023: PE-level IPCQ — Inter-PE Collective Communication + +## Status + +Proposed + +## Context + +### 목표 + +CCL (Collective Communication Library) 커널을 PE 안에서 실행할 수 있도록 +PE 간 데이터 교환 인프라를 추가한다. 호스트는 그저 각 SIP에 커널을 launch만 하고, +실제 동기화와 데이터 이동은 **PE 커널 안에서 IPCQ(Inter-Process Communication +Queue)를 통해** 일어난다. + +이는 NCCL이 GPU 커널 안에서 NVLink 통신을 수행하는 모델, 또는 Cerebras/Tenstorrent의 +core-local 통신 큐와 유사하다. 호스트 레벨 collective(`dist.all_reduce`)는 +**미래 작업**으로 미루고, 본 ADR은 커널 collective 인프라에만 집중한다. + +### 현재 상태 + +- ADR-0021 PE 파이프라인 리팩토링: PE 내부가 컴포넌트 단위로 분리됨 + (PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH, PE_TCM, PE_MMU) +- PE 간 직접 통신 채널 없음. 모든 데이터 이동은 PE_DMA → cube_noc/UCIe/PCIE → HBM 경로 +- 호스트 CCL skeleton (ADR 없음, ad-hoc 구현): `dist.init_process_group(backend="ahbm")`, + `_run_ccl_bench`가 rank별 greenlet로 동시 실행. collective는 stub 상태. + +### 풀어야 할 문제 + +1. PE 간 직접 데이터 이동 (peer's memory에 write) +2. 동기화 — 송신 측이 수신 측 buffer 공간을 확인해야 함 (backpressure) +3. compute traffic과 communication traffic의 자원 경쟁 (Head-of-Line blocking) +4. 호스트가 알고리즘에 따라 (ring/mesh/tree) 논리적 neighbor 토폴로지를 구성할 수 있어야 함 + +--- + +## Decision + +### D1. PE_IPCQ 컴포넌트 신규 추가 + +PE 안에 새 컴포넌트 `PE_IPCQ`를 추가한다. PE_GEMM/PE_MATH가 PE_CPU의 +sub-block을 별도 컴포넌트로 모델링하는 것과 동일한 패턴이다. + +``` +PE +├── PE_CPU +├── PE_SCHEDULER +├── PE_DMA +├── PE_IPCQ ← 신규 +├── PE_FETCH_STORE +├── PE_GEMM +├── PE_MATH +├── PE_TCM +├── PE_MMU +``` + +**역할 분리** (control plane vs data plane): +- **PE_IPCQ (control plane)**: ring buffer 주소 계산, head/tail pointer 관리, + peer pointer 캐시, backpressure 결정, 4-방향 neighbor 매핑 +- **PE_DMA (data plane)**: 실제 데이터를 cube_noc/UCIe/PCIE 경유로 peer 메모리에 전송 + +PE_IPCQ는 데이터 이동을 직접 수행하지 않고 PE_DMA에 위임한다. + +### D2. Ring Buffer 모델 + +각 PE는 4-방향(N/S/E/W) × {tx, rx} = 총 8개의 ring buffer를 가진다. 
+ +```python +@dataclass +class IpcqQueuePair: + direction: Direction # N/S/E/W + peer: IpcqEndpoint # init 시 호스트가 설정 (D2.5) + tx_buffer_base: int # 내가 보낼 데이터의 base addr (자기 메모리) + rx_buffer_base: int # 내가 받을 데이터의 base addr (자기 메모리) + slot_size: int # tile 단위 + n_slots: int # ring depth + my_head: int # 내 send 위치 (다음에 쓸 tx/peer slot) + my_tail: int # 내 recv 위치 (다음에 읽을 rx slot) + peer_head_cache: int # 캐시: peer가 마지막으로 보낸 head 위치 (D9 piggyback으로 갱신) + peer_tail_cache: int # 캐시: peer가 마지막으로 소비한 tail 위치 (D9 fast path credit으로 갱신) +``` + +**필드명 규약 (canonical)**: 본 ADR 전체에서 다음 4개 이름을 일관되게 사용한다. + +| 필드 | 소유자 | 갱신 시점 | +|------|--------|----------| +| `my_head` | 자기 PE_IPCQ | tl.send 호출 후 즉시 (송신 추적용) | +| `my_tail` | 자기 PE_IPCQ | tl.recv 호출 후 즉시 (수신 추적용) | +| `peer_head_cache` | 자기 PE_IPCQ | IpcqMetaArrival 도착 시 (D9 piggyback) | +| `peer_tail_cache` | 자기 PE_IPCQ | IpcqCreditMetadata 도착 시 (D9 fast path) | + +다른 표현(`peer_head_local`, `peer_head`, `peer_tail` 등)은 사용하지 않는다. + +**Slot 단위**: fixed-size, 한 slot이 한 tile 데이터를 통째로 담는다. +descriptor 모델이 아니라 **full data embedding** 모델 (D5에서 상세). + +### D2.5. PeAddress / IpcqEndpoint 스키마 + +`IpcqQueuePair.peer`가 가져야 할 정보를 명시한다. 송신 측 PE_IPCQ가 +peer rx slot에 직접 DMA write하려면 다음을 모두 알아야 한다. + +```python +@dataclass(frozen=True) +class IpcqEndpoint: + """송신 측이 peer's rx_buffer 주소를 계산하기 위해 필요한 모든 정보.""" + sip: int # 목적지 SIP + cube: int # 목적지 cube + pe: int # 목적지 PE (cube 내 local index) + buffer_kind: str # "tcm" | "hbm" | "sram" — 어느 메모리 공간 + rx_base_pa: int # peer rx_buffer base의 PA (PhysAddr.encode()) + rx_base_va: int # peer rx_buffer base의 VA (선택, MMU 사용 시) + n_slots: int # peer ring depth (경계 wrap-around 계산용) + slot_size: int # peer slot 크기 (offset 계산용) +``` + +`IpcqQueuePair`의 `peer` 필드는 이 `IpcqEndpoint` 객체를 들고 있다. +주소 계산은: + +```python +slot_idx = self.my_head % peer.n_slots +dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size +``` + +PE_IPCQ는 이 dst_pa를 `IpcqDmaToken`의 dst_addr로 PE_DMA에 전달한다. 
+PE_DMA(vc_comm)는 fabric 라우팅(cube_noc/UCIe/PCIE)을 통해 dst_pa로 데이터를 전송한다. + +**Endpoint 생성 시점**: backend init (D10)에서 모든 PE의 IPCQ buffer를 +allocator로 할당받고, 각 rank의 neighbor table을 만들 때 peer rank의 +endpoint 정보를 install한다. 즉 install 순서는: + +1. **모든 rank의 IPCQ buffer 할당** (각 PE의 buffer_kind 메모리 공간에서) +2. **rank별 endpoint table 구성** (자신의 4-방향 peer가 어느 sip/cube/pe/pa를 갖는지) +3. **PE_IPCQ에 install** (`IpcqInitMsg` via fabric or sideband) + +이 순서는 모든 rank가 서로의 PA를 알아야 하므로, 단계 1을 모든 rank에 대해 +먼저 끝낸 후 단계 2-3을 진행한다. + +### D3. 4-방향 매핑 = 논리적 ProcessGroup + +PE는 4방향(N/S/E/W)을 logical port로 본다. 실제 peer 주소는 호스트 CCL init이 +알고리즘에 따라 설정한다. PE 커널은 토폴로지를 알지 못하고 방향만 사용한다. + +```python +# 호스트 init 예시 — 1D ring +for rank in range(world_size): + ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size]) + ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size]) + +# 호스트 init 예시 — 2D mesh +for r in range(R): + for c in range(C): + ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c)) + ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c)) + ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C)) + ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C)) +``` + +PE 코드 입장에서 `tl.send(dir="E", ...)`가 어디로 가는지는 알 필요가 없다. + +### D4. PE 커널 API + +```python +# Send (blocking, backpressure 발생 가능) +tl.send(dir: str, src_addr: int, nbytes: int) -> None + +# Recv (blocking) +data = tl.recv(dir: str) # 특정 방향에서 수신 +data = tl.recv() # 4방향 round-robin, 도착한 첫 tile 반환 + +# Recv (non-blocking) +handle = tl.recv_async(dir: str) +data = tl.wait(handle) +``` + +`tl.recv()` (방향 미지정)는 IPCQ가 last_polled_dir 인덱스를 들고 있다가 +다음 호출 시 그 다음 방향부터 검사하면서 데이터 있는 첫 슬롯을 반환한다. +4방향 모두 비어있으면 wait. + +**Fairness는 weak fairness**: polling 시작 방향을 회전시켜 단순 편향을 +완화하지만, 한 방향에 데이터가 항상 먼저 도착하면 다른 방향이 starvation될 +수 있다. strict fairness가 필요한 알고리즘은 `tl.recv(dir=...)`로 방향을 +명시해야 한다. (Open Questions 참조) + +### D5. Single-hop DMA Write + Full-data Slot 모델 + +데이터는 송신 측 메모리에서 수신 측 ring slot으로 **단일 DMA 전송**으로 +이동한다. 
핵심 속성: + +- **Single-hop**: 송신 측 IPCQ가 peer rx slot 주소를 직접 알고 있어 한 번의 + fabric DMA로 데이터가 도착한다. +- **No CPU memcpy**: CPU가 데이터를 복사하지 않는다. +- **No intermediate staging**: 송신/수신 어느 쪽에도 별도 staging buffer가 + 없다 (송신은 자기 source 주소에서 직접, 수신은 자기 ring slot으로 직접). + +(엄밀히 말하면 fabric DMA write 자체는 발생하므로 "data movement가 전혀 없다"는 +의미는 아니다. NCCL의 "zero-copy"가 가리키는 것 — CPU memcpy / staging copy +부재 — 과 동일한 속성이다.) + +데이터 이동 모델: + +``` +PE A: tl.send(E, src_addr, nbytes) + 1. IPCQ가 peer rx slot 주소 계산 + dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size + 2. backpressure: my_head - peer_tail_cache < peer.n_slots ? + (꽉 찼으면 sleep/poll) + 3. PE_DMA(vc_comm)에 DMA 요청 → src_addr에서 peer의 dst_addr로 nbytes 전송 + 4. my_head += 1 + +PE B: data = tl.recv(W) + 1. 내 rx_buffer[my_tail % n_slots] 위치 확인 + 2. 데이터 도착 대기 (D7 backpressure 모드) + 3. 그 주소를 PE 커널에 반환 (또는 fetch unit으로 register file에 로드) + 4. my_tail += 1 + 5. credit return fast path 발행 (D9) — bottleneck-BW latency 후 + peer A의 peer_tail_cache 갱신 +``` + +**핵심**: Slot에 데이터가 통째로 들어간다. PE B의 recv는 자기 rx_buffer만 +읽으면 되고, A의 메모리를 read하지 않는다. 송신 측 IPCQ가 peer rx slot +주소를 알고 있으므로 직접 그 주소로 DMA write한다 (single-hop). + +본인의 PE_TCM read/write는 DMA를 거치지 않는다 (PE에 직접 붙어있음). +slot이 본인 TCM에 있으면 직접 접근, 아니면 PE_DMA 경유. + +### D6. Buffer 위치 — 3-way benchmark + +호스트 CCL init이 IPCQ ring buffer의 메모리 위치를 결정한다: + +```python +ipcq_init( + backend="ahbm", + buffer_kind="tcm" | "hbm" | "sram", + n_slots=8, + slot_size=4096, +) +``` + +| 위치 | 특징 | trade-off | +|------|------|-----------| +| **PE_TCM** | PE에 직접 붙음, 빠름 | 작음, PE 내부 자원과 경쟁 | +| **PE-local HBM** | 큼, DMA 경유 | latency 큼 | +| **Cube SRAM** | 중간 크기, cube-shared | cube 내 PE 간 contention | + +세 위치 모두 동일 코드로 동작하며 init만 다르다. 벤치마크로 비교 가능. + +**규칙**: peer가 read/write할 때는 DMA 경유. 본인이 자기 PE_TCM 읽기/쓰기는 +DMA 없음. + +### D7. 
Backpressure — 2-mode benchmark + +송신 측이 peer slot full을 감지했을 때, 또는 수신 측이 데이터 미도착을 +감지했을 때 어떻게 대기하는가: + +| 모드 | 동작 | 모델 | +|------|------|------| +| **poll** | 캐시된 peer pointer를 주기적으로 재확인. cache update event를 폴링 | spin loop | +| **sleep** | SimPy event를 yield하고 sleep, peer가 update event를 trigger하면 wake | interrupt-like | + +```python +ipcq_init(backpressure="poll" | "sleep", ...) +``` + +두 모드 모두 구현하여 latency/throughput trade-off를 벤치마크할 수 있다. + +### D8. PE_DMA Virtual Channel + +PE_DMA를 단일 큐에서 **2-channel virtual channel** 모델로 확장한다. + +``` +PE_DMA +├── vc_compute: GEMM/MATH의 tile load/store/writeback +└── vc_comm: IPCQ의 send 데이터 +``` + +각 VC는 독립적인 state machine을 가진다: +- 한 채널이 stall되어도 다른 채널은 진행 +- 동일 link(cube_noc, UCIe 등)는 공유하지만, link BW는 두 채널이 분할 사용 + +**Chunk 단위 인터리브**: +- 큰 GEMM tile DMA가 한 번에 link를 점유하지 않음 +- chunk_size 단위로 진행 (예: 256B), 매 chunk마다 다른 VC와 link BW 공유 +- chunk_size는 init 파라미터 (작을수록 fair, 클수록 효율) + +이로써: +- HoL blocking 해소 (compute DMA 진행 중에도 IPCQ send 끼어들 수 있음) +- compute/comm overlap 자연스러움 (NVIDIA copy engine + compute SM 패턴) +- HW 모델 정합 (NoC virtual channel은 실제 HW 기법) + +**첫 구현의 정확도 한계 (intentional)**: + +본 ADR의 첫 구현은 **deterministic chunk-level interleave + weighted +round-robin arbitration** (default 50/50, ccl.yaml에 노출)을 채택한다. +이는 first-order approximation이며, 실제 HW의 dynamic contention/credit-based +arbitration보다는 단순화된 모델이다. + +| 모델링 항목 | 첫 구현 | 향후 확장 가능 | +|------------|---------|---------------| +| VC 간 BW 분할 | 정적 weight | dynamic contention 기반 | +| Chunk 단위 인터리브 | deterministic round-robin | priority/QoS 기반 | +| Cross-VC dependency | 없음 (독립) | NoC arbiter component 추가 | + +이 단순화는 functional correctness에는 영향이 없으며, latency 측정에서 +heavy contention 시나리오는 실제보다 약간 optimistic한 결과를 낼 수 있다. +정밀화가 필요하면 별도 ADR로 NoC arbiter를 도입한다. 
+ +#### Token routing + +- compute용 token (TileToken): 기존 PE_FETCH_STORE → PE_DMA 체이닝 그대로 +- comm용 token (IpcqDmaToken, 신규): PE_IPCQ → PE_DMA로 self-routing +- PE_DMA가 token 종류로 채널 결정 + +```python +class PeDmaComponent: + def _process(self, env, token): + if isinstance(token, IpcqDmaToken): + yield from self._vc_comm_process(env, token) + else: + yield from self._vc_compute_process(env, token) +``` + +### D9. Pointer 동기화 — DMA payload piggyback + +실제 하드웨어(NVLink, UCIe 등)는 DMA 메시지의 payload에 메타데이터를 +piggyback하여 송수신과 함께 pointer를 갱신한다. 본 시뮬레이션도 같은 모델을 +채택하여 **별도의 control 채널 없이** 메타데이터가 data와 함께 도착하도록 한다. + +이 모델의 핵심 이점: + +- **자동 ordering**: 메타데이터가 data와 동일 token으로 이동하므로 data가 + 먼저 visible해진 다음에야 head_cache가 갱신된다. 별도 ordering invariant + 없이 race condition이 원천 차단된다. +- **HW 정합**: 실제 NVLink/UCIe의 piggybacked header 모델과 일치 +- **컴포넌트 단순화**: 별도 IpcqPtrUpdate event 종류가 필요 없음 + +#### Send 흐름 (head 측 piggyback) + +``` +PE A: tl.send(E, src_addr, nbytes) + 1. PE_IPCQ가 backpressure 체크 (peer_tail_cache 기준) + 2. PE_IPCQ가 IpcqDmaToken 생성: + - data 본체 (src_addr → peer dst_addr) + - piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction) + 3. PE_DMA(vc_comm)에 token put + 4. PE A는 자기 my_head++ (송신 추적용) + +[fabric DMA: latency 만큼 진행] + +PE B의 PE_DMA가 token 수신 + 5. data를 dst_addr (B의 rx slot)에 MemoryStore.write + 6. token의 metadata를 PE B의 PE_IPCQ로 forward (PE 내부 wire, ~1 cycle) + +PE B의 PE_IPCQ가 metadata 수신 + 7. peer_head_cache 갱신 (= A의 head 위치) + 8. 대기 중인 recv (해당 direction)가 있으면 wake +``` + +여기서 핵심은 **5와 6은 같은 SimPy step**이라는 것이다 — DMA 완료와 동시에 +data와 metadata가 atomic하게 visible해진다. + +#### Recv 흐름 (credit return — fast path with bottleneck-BW latency) + +수신측이 slot을 비우면 송신측은 그 사실을 알아야 한다 (backpressure 해제). +data 경로의 piggyback 모델과 달리, credit return은 일반 vc_comm fabric을 +거치지 않고 **별도 fast path**로 처리한다. 이는 실제 HW의 NVLink/UCIe +credit return fast path를 추상화한 것이다. + +**Latency 계산**: magic constant가 아니라 **라우팅 경로의 bottleneck BW** +기준으로 산출한다. 
+ +``` +credit_size_bytes = 16 (ccl.yaml: ipcq_credit_size_bytes) +path = router.find_path(self_pe, peer_pe) +latency = compute_drain_ns(path, credit_size_bytes) + = credit_size_bytes / bottleneck_bw_on_path +``` + +이로써: +- **토폴로지 비례 approximation**: cube 내 credit return과 cross-SIP credit이 + 자동으로 다른 latency를 가짐 (정확한 값은 아니지만 magic constant보다 의미 있음) +- **Magic constant 없음**: 별도 `ipcq_ctrl_latency_ns` 같은 임의 값 불필요 +- **Deadlock 위험 없음**: piggyback과 달리 B가 A에게 보낼 데이터가 없어도 + credit이 자동 발행됨 +- **기존 utility 재사용**: `ComponentContext.compute_drain_ns` 그대로 사용 + +``` +PE B: tl.recv(W) → 데이터 가져감 → my_tail++ + +PE B의 PE_IPCQ: + 1. router로 PE A까지 path 계산 + 2. compute_drain_ns(path, credit_size_bytes) = latency_ns + 3. env.process(self._delayed_credit_send(latency_ns, peer_credit_store, my_tail)) + +[fast path: latency_ns 만큼 timeout, fabric vc 미사용] + +PE A의 PE_IPCQ가 자기 credit_store에서 IpcqCreditMetadata 수신: + 4. peer_tail_cache 갱신 + 5. 대기 중인 send (해당 direction)가 있으면 wake +``` + +#### Component 결합도 — SimPy Store 채널 + +PE B의 PE_IPCQ가 PE A의 PE_IPCQ를 직접 호출하지 않는다. 대신 **init 시점에 +양쪽 PE_IPCQ 사이에 SimPy Store를 한 번 wire**해두고 (양방향 fast path 채널), +credit metadata는 그 store로 put한다. + +```python +class PeIpcqComponent: + def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns): + yield env.timeout(latency_ns) + yield peer_credit_store.put(IpcqCreditMetadata(seq=my_tail, ...)) +``` + +backend init에서 IpcqInitMsg fan-out 시 양방향 fast path channel을 함께 +설치한다 (D12 IpcqInitMsg에 명시). + +#### Credit return fast path의 한계 + +- `credit_size_bytes`는 estimate. 보통 16-64 bytes로 충분하며, 실제 HW의 + credit return wire 크기를 모방한 값. +- fast path는 일반 vc_comm BW contention 모델에서 **제외**된다 (별도 채널). + 실제 HW의 credit return wire는 매우 lightweight이므로 1차 근사로 합리적. +- 정밀화가 필요하면 후속 ADR에서: + - credit fast path를 별도 link로 모델링 (BW limit + contention) + - 또는 piggyback 모드로 변경 가능 (`credit_return_mode: piggyback`) + +#### PE_DMA의 책임 추가 + +PE_DMA(vc_comm)는 token 수신 시 다음 atomic 시퀀스로 처리한다. 
+**두 동작 사이에 SimPy yield를 두어서는 안 된다** (I6 MUST 규칙 참조): + +```python +def _on_vc_comm_recv(self, env, token): + # ── ATOMIC: 두 동작 사이에 yield 금지 ── + # 1. data를 dst_addr에 write (dst의 메모리 공간은 token.dst_endpoint.buffer_kind) + data = self._memory_store.read(token.src_space, token.src_addr, + shape=..., dtype=...) + self._memory_store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data) + # 2. token의 metadata를 자기 PE의 IPCQ로 forward + yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token)) + # ───────────────────────────────────── +``` + +`out_ports[ipcq_id].put`은 SimPy Store의 yield-able 호출이지만, PE 내부 +wire로 capacity가 unbounded인 store를 사용하므로 즉시 완료된다 (실질적으로 +single-step). 이 최종 put이 atomic 구간의 끝이며, 그 이전에 다른 yield가 +삽입되면 안 된다. + +#### Backpressure latency 정확도 + +backpressure 해제까지 걸리는 시간: + +- **데이터 send 측 latency** = full fabric DMA (data + piggyback metadata 함께) +- **Credit return 측 latency** = fast path with bottleneck-BW + (`credit_size_bytes / bottleneck_bw_on_path`) + +| 시나리오 | 모델링된 latency | 실제 HW와의 관계 | +|---------|----------------|----------------| +| Cube 내 (fast link) | 작음 (bottleneck = cube_noc BW) | topology-aware approximation | +| Cross-cube (UCIe) | 중간 (bottleneck = UCIe BW) | topology-aware approximation | +| Cross-SIP (PCIE) | 큼 (bottleneck = PCIE BW) | topology-aware approximation | + +별도 magic latency 파라미터 없이 토폴로지에 비례한 first-order +approximation이 자동으로 반영된다. 실제 HW와 정확히 일치하지는 않지만 +(credit fast path는 contention 모델에서 제외, credit_size_bytes는 estimate), +magic constant 모델보다 훨씬 의미 있는 비교 가능. 정밀화는 후속 ADR로 +넘긴다. + +### D9.5. ADR-0020 (2-Pass) 통합 + +`tl.send/recv`는 ADR-0020의 2-pass 모델과 통합되어야 한다. Phase 1은 +타이밍과 실제 데이터 이동(MemoryStore) 모두 모델링하고, Phase 2는 op_log +기반 정합성 검증을 가능케 한다. + +#### Phase 1 (타이밍 + 데이터 이동) + +D9는 head 갱신과 tail 갱신을 다른 메커니즘으로 모델링한다: + +- **Send-side (head update)** — DMA payload piggyback. data write와 metadata + forward가 동일 SimPy step에 일어나므로 자동으로 atomic visibility 보장. +- **Recv-side (tail credit return)** — fast path SimPy Store 채널. 
+ bottleneck-BW 기반 latency 후 peer_tail_cache 갱신. + +두 메커니즘을 합쳐서 전체 ring buffer pointer 일관성을 유지한다. + +**send 시**: + +1. PE_IPCQ가 backpressure 체크 (peer_tail_cache 기준) +2. PE_IPCQ가 IpcqDmaToken 생성 (data + piggyback metadata) → PE_DMA(vc_comm)에 put +3. PE_DMA가 fabric DMA 시뮬레이션 (latency 진행) +4. **DMA 완료와 동일한 SimPy step에 atomic 시퀀스**: + - **MemoryStore.write(buffer_kind, dst_pa, data)** — single-hop DMA write + - 수신측 PE_IPCQ에 metadata forward → peer_head_cache 갱신 → 대기 recv wake +5. **op_log 기록**: `OpRecord(op_kind="ipcq", op_name="send", params={src_space, src_addr, dst_space, dst_addr, nbytes, dir, dtype, shape, sender_seq})` + - `dst_space`는 `token.dst_endpoint.buffer_kind`에서 derive된 값이다 + (별도 token 필드가 아니다). dst_addr은 `token.dst_addr`. + +**recv 시**: + +1. PE_IPCQ가 (peer_head_cache > my_tail) AND (MemoryStore.has(slot_addr)) 조건 대기 + (D9 piggyback 모델에서는 두 조건이 같은 step에 truthy가 되지만, defensive check) +2. 조건 만족 시: `slot_addr = my_rx_base + slot_idx * slot_size` +3. **두 가지 모드** (`recv_mode`로 op_log에 기록): + - **`return_slot`** (default): slot_addr을 그대로 PE 커널에 반환. + 데이터 복사 없음. 커널이 slot 메모리를 직접 사용한다. + - **`copy_to_dst`**: 호출 시 dst_addr이 지정된 경우. slot 데이터를 읽어서 + dst_addr에 write. `data = memory_store.read(...)`; `memory_store.write(dst_space, dst_addr, data)` +4. PE_IPCQ가 my_tail++, fast path credit return을 발행 (D9 — vc_comm + fabric을 거치지 않고 별도 SimPy Store 채널로 bottleneck-BW latency 후 + peer 측 peer_tail_cache 갱신) +5. **op_log 기록**: `OpRecord(op_kind="ipcq", op_name="recv", params={recv_mode, src_space, src_addr, dst_space, dst_addr, nbytes, dir, dtype, shape, consumer_seq})` + - `recv_mode="return_slot"`: src_space/src_addr가 slot 위치, dst_addr=None + - `recv_mode="copy_to_dst"`: src_space/src_addr가 slot 위치, dst_space/dst_addr가 사용자 지정 위치 + +#### Phase 2 (op_log replay) + +DataExecutor가 `op_kind="ipcq"` 레코드를 만나면: + +- **send**: src → dst (peer rx slot)로 ndarray를 idempotent하게 write +- **recv (`recv_mode="return_slot"`)**: no-op. 
slot 데이터는 Phase 1에서 + 이미 적절한 위치에 있으며, 커널이 해당 slot 메모리를 직접 사용함. +- **recv (`recv_mode="copy_to_dst"`)**: slot → dst_addr로 ndarray를 idempotent + 하게 copy + +본질적으로 IPCQ는 **데이터 이동**만 하므로 Phase 2가 추가로 계산할 것은 없다. +DataExecutor의 GEMM/Math가 그 데이터를 사용하면 자동으로 정합성이 검증된다. + +```python +class DataExecutor: + def _execute_op(self, op): + if op.op_kind == "ipcq": + self._execute_ipcq(op) + elif op.op_kind == "memory": + ... + elif op.op_kind == "gemm": + ... + + def _execute_ipcq(self, op): + """IPCQ ops are data movement; Phase 1 already wrote to MemoryStore.""" + p = op.params + if op.op_name == "send": + data = self.store.read(p["src_space"], p["src_addr"], + shape=p["shape"], dtype=p["dtype"]) + self.store.write(p["dst_space"], p["dst_addr"], data) + elif op.op_name == "recv": + if p.get("recv_mode") == "copy_to_dst": + data = self.store.read(p["src_space"], p["src_addr"], + shape=p["shape"], dtype=p["dtype"]) + self.store.write(p["dst_space"], p["dst_addr"], data) + # recv_mode == "return_slot": no-op (data already in slot) +``` + +#### `--verify-data` 흐름 (CCL 커널) + +``` +1. kernbench run --bench ccl_allreduce --verify-data +2. backend init → IPCQ buffers 할당, neighbor table install +3. 모든 rank greenlet 동시 실행 +4. 각 PE 커널이 tl.send/recv → MemoryStore에 데이터 누적 +5. 시뮬레이션 완료 후 DataExecutor.run() → ipcq op 멱등 replay (no-op) +6. 벤치마크가 print(out) 또는 out.data 비교 → 정합성 확인 +``` + +벤치 작성자는 `out.data`로 결과를 읽고 expected와 비교하면 된다 (ADR-0020 D7 +Tensor.data 패턴). + +### D10. 호스트 CCL Init은 PyTorch 패턴 그대로 + +호스트 코드는 실제 PyTorch distributed 코드와 동일하게 유지한다. +`init_process_group`은 backend 객체만 만들고, IPCQ 설정 (neighbor topology, +buffer_kind, backpressure 등)은 받지 않는다. + +```python +# benches/ccl_allreduce.py — 실제 PyTorch와 동일한 호스트 코드 +def run_rank(rank, world_size, torch): + dist = torch.distributed + dist.init_process_group(backend="ahbm", world_size=world_size, rank=rank) + + tensor = torch.zeros((M, K), dtype="f16", dp=...) 
+ + from kernbench.ccl.algorithms import ring_allreduce + torch.launch("ring_allreduce", ring_allreduce.kernel, tensor, rank, world_size) +``` + +IPCQ 설정은 backend가 **init_process_group 시점에** `ccl.yaml`을 읽고 즉시 +PE_IPCQ neighbor table을 install한다. 호스트 코드는 IPCQ를 인지할 필요가 없다. + +벤치마크 하나는 하나의 알고리즘을 사용하는 것을 가정하며, 사용할 알고리즘은 +`ccl.yaml`의 `defaults.algorithm` 으로 지정한다 (D11). 호스트 코드 변경 없이 +ccl.yaml만 수정하여 다른 알고리즘으로 교체할 수 있다. + +#### Init 흐름 (eager) + +1. `init_process_group(backend="ahbm")` 호출 +2. backend가 `ccl.yaml` 로드 → `defaults.algorithm` 결정 +3. `algorithms[<name>]`에서 topology + buffer_kind + backpressure + slot/size 결정 +4. **즉시** 모든 PE의 PE_IPCQ에 neighbor table을 install (sideband 또는 fabric `IpcqInitMsg`) +5. 이후 `torch.launch(kernel_name, ...)`는 일반 launch와 동일하게 처리 + (CCL kernel이든 아니든 PE_IPCQ는 이미 준비됨) + +### D11. CCL 설정 파일 (`ccl.yaml`) + +IPCQ 설정과 알고리즘 metadata는 별도 YAML 파일에 둔다. +`components.yaml`/`topology.yaml`과 같은 패턴을 유지하며, 변경 이력이 코드처럼 +추적 가능하다. + +벤치마크 한 번 실행은 한 알고리즘만 사용한다 (`defaults.algorithm`). +다른 알고리즘으로 교체하려면 `ccl.yaml`의 `defaults.algorithm` 만 바꾸면 된다. + +```yaml +# ccl.yaml — CCL backend (ahbm) configuration +# +# 이 파일은 init_process_group(backend="ahbm") 시점에 로드되며, +# defaults.algorithm 으로 지정된 알고리즘에 따라 PE_IPCQ neighbor table을 +# install한다. 호스트 코드는 IPCQ 설정을 인지하지 않는다. + +defaults: + # 이번 벤치 실행에서 사용할 알고리즘. algorithms 섹션에 정의된 것 중 하나. + algorithm: ring_allreduce + + # IPCQ ring buffer가 위치할 메모리. + # tcm — PE-local TCM (작지만 빠름, PE 내부 자원과 경쟁) + # hbm — PE-local HBM (큼, DMA latency 큼) + # sram — Cube-shared SRAM (중간 크기, cube 내 PE 간 contention) + buffer_kind: tcm + + # send/recv가 peer slot full / data 미도착을 만났을 때의 대기 방식. + # poll — peer pointer 캐시를 spin loop로 재확인 + # sleep — SimPy event yield 후 wakeup 대기 (interrupt-like) + backpressure: sleep + + # Ring buffer depth (한 방향당 slot 개수). 클수록 in-flight 가능, 메모리 ↑ + n_slots: 8 + + # Slot 하나의 크기 (bytes). 한 tile을 통째로 담을 수 있는 크기여야 함. + slot_size: 4096 + + # PE_DMA virtual channel chunk 크기 (bytes). 작을수록 fair, 클수록 효율.
+ # IPCQ traffic과 compute traffic 사이의 인터리브 granularity (D8 참조). + vc_chunk_size: 256 + + # Credit return fast path 메시지 크기 (bytes). 실제 HW의 credit return wire + # 크기를 모방. backend가 라우팅 경로의 bottleneck BW를 보고 latency를 + # 계산한다 (D9 참조). 보통 16-64로 충분. + ipcq_credit_size_bytes: 16 + +algorithms: + # ── 알고리즘 정의 ───────────────────────────────────────────────── + # 각 entry는 알고리즘 모듈과 그 알고리즘이 요구하는 topology를 명시한다. + # 알고리즘별 default override 가능 (buffer_kind, backpressure 등). + + ring_allreduce: + # PE 커널이 정의된 모듈. `kernel(t_ptr, rank, world_size, tl)` 함수를 export. + module: kernbench.ccl.algorithms.ring_allreduce + + # 이 알고리즘이 요구하는 neighbor topology. builtin 이름 또는 "custom". + # ring_1d — 1D 양방향 ring (E/W) + # ring_1d_unidir — 1D 단방향 ring (E only) + # mesh_2d — 2D mesh (N/S/E/W) + # tree_binary — binary tree (parent/children direction) + # custom — 모듈의 neighbors(rank, world_size) 함수 사용 + topology: ring_1d + + tree_allreduce: + module: kernbench.ccl.algorithms.tree_allreduce + topology: tree_binary + # 알고리즘별 override (이 알고리즘만 hbm 사용) + buffer_kind: hbm + + custom_mesh: + module: kernbench.ccl.algorithms.custom_mesh + topology: custom # 모듈이 직접 neighbors() 함수 제공 +``` + +#### 알고리즘 모듈 구조 + +알고리즘 모듈은 두 개의 hook을 export한다 — `kernel`은 필수, `neighbors`는 선택. + +```python +# src/kernbench/ccl/algorithms/ring_allreduce.py + +def kernel(t_ptr, rank, world_size, tl): + """필수 — PE 커널. + + IPCQ 설정은 backend가 ccl.yaml + neighbors() 결과로 install한 상태이다. + 커널은 그저 4-방향 send/recv API만 사용하면 된다. + """ + for step in range(world_size - 1): + ... + tl.send(dir="E", ...) + data = tl.recv(dir="W") + + +def neighbors(rank, world_size, neighbor_map): + """선택 — neighbor table override hook. + + backend는 ccl.yaml의 topology 필드에 따라 builtin neighbor_map을 생성한 뒤, + 이 함수가 정의되어 있으면 호출하여 결과를 override 한다. 
+ + Args: + rank: 이 rank의 인덱스 + world_size: 전체 rank 수 + neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑 + 예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws} + mutable dict — 직접 수정 가능 + + Returns: + dict | None: + dict — neighbor_map을 override한 결과 + None — override 안 함, neighbor_map 그대로 사용 + """ + return None # 또는 수정 후 반환 +``` + +#### `neighbors` override 패턴 + +대부분의 알고리즘은 builtin topology만으로 충분하므로 `neighbors` 정의가 필요 없다. +정의가 필요한 경우의 패턴: + +**Pattern A — builtin을 base로 일부만 수정**: +```python +def neighbors(rank, world_size, neighbor_map): + # 짝수 rank만 W 사용 + if rank % 2 == 1: + neighbor_map.pop("W", None) + return neighbor_map +``` + +**Pattern B — 완전히 새로 만들기 (skip-connection ring 등)**: +```python +def neighbors(rank, world_size, neighbor_map): + # neighbor_map은 무시하고 새로 작성 + return {"E": (rank + 2) % world_size} +``` + +#### Builtin topology generators + +`ccl.yaml`의 `topology` 필드가 다음 builtin 이름이면 backend가 알아서 처리: + +| topology | 설명 | direction set | +|----------|------|---------------| +| `ring_1d` | 1D 양방향 ring | E, W | +| `ring_1d_unidir` | 1D 단방향 ring | E only | +| `mesh_2d` | 2D mesh | N, S, E, W | +| `tree_binary` | binary tree (root = rank 0) | parent, child_left, child_right | +| `none` | 빈 매핑 — 알고리즘이 `neighbors()`로 처음부터 작성 | (없음) | + +`topology: none`은 builtin이 빈 dict를 반환하므로 알고리즘의 `neighbors()`가 +처음부터 매핑을 만들어야 한다. + +#### 알고리즘 추가 절차 + +1. `src/kernbench/ccl/algorithms/.py`에 `kernel` 함수 작성 +2. `ccl.yaml`의 `algorithms` 섹션에 entry 추가 (`module`, `topology`) +3. (선택) 같은 모듈에 `neighbors()` 함수 추가하여 builtin override +4. `defaults.algorithm`을 새 알고리즘으로 설정하면 적용 + +호스트 코드는 손대지 않는다. + +### D12. 메시지 / 토큰 스키마 + +본 ADR이 추가하는 모든 메시지/토큰의 필드를 명시한다. 구현 시 이 정의를 +`src/kernbench/common/pe_commands.py`와 `src/kernbench/runtime_api/kernel.py`에 +그대로 추가한다. + +#### `IpcqInitMsg` (sideband, init 시 fan-out) + +backend가 모든 PE의 PE_IPCQ에 neighbor table을 install하기 위해 사용한다. +구조는 `MmuMapMsg`와 유사 (target_sips, target_cubes, target_pe + entries). 
+ +```python +@dataclass(frozen=True) +class IpcqInitEntry: + direction: str # "N" | "S" | "E" | "W" + peer: IpcqEndpoint # D2.5 참조 + my_rx_base_pa: int # 자신의 rx_buffer base + my_rx_base_va: int # 선택 + n_slots: int + slot_size: int + # Credit fast path 채널 (D9). + # 계약: 이 필드는 반드시 simpy.Store 인스턴스이며, IpcqCreditMetadata + # 객체만을 받는 receive endpoint이다 (peer's PE_IPCQ가 자기 입력 큐로 + # 사용). 송신 측 PE_IPCQ는 _delayed_credit_send에서 이 store에 직접 + # IpcqCreditMetadata를 put한다. 다른 객체 type을 put해서는 안 된다. + # backend init 시 양방향 SimPy Store가 한 번 wire되며 이후 변경 불가. + peer_credit_store: "simpy.Store[IpcqCreditMetadata]" + +@dataclass(frozen=True) +class IpcqInitMsg: + correlation_id: str + request_id: str + target_sips: tuple[int, ...] + target_cubes: tuple[int, ...] + target_pe: int | tuple[int, ...] | str + entries: tuple[IpcqInitEntry, ...] # 이 PE의 4-방향 entry + backpressure_mode: str # "poll" | "sleep" + buffer_kind: str # "tcm" | "hbm" | "sram" + credit_size_bytes: int # D9 fast path latency 계산용 (default 16) +``` + +**Credit fast path channel wiring**: backend init이 모든 PE의 PE_IPCQ에 +양방향 fast path 채널을 한 번 설치한다. PE A의 IpcqInitEntry(direction=E)에 +PE B의 credit-receive Store reference를 넣어 송신 측이 직접 put할 수 있게 +한다 (별도 fabric routing 없음). + +#### `IpcqSendCmd` (PE_CPU → PE_IPCQ) + +```python +@dataclass(frozen=True) +class IpcqSendCmd: + direction: str # 어느 방향으로 보낼지 + src_addr: int # 보낼 데이터의 원본 주소 (TCM/HBM) + src_space: str # "tcm" | "hbm" | "sram" + nbytes: int + shape: tuple[int, ...] 
# data shape (op_log/MemoryStore용) + dtype: str + handle_id: str # completion 추적용 + data_op: bool = True # ADR-0020 op_log 기록 대상 +``` + +#### `IpcqRecvCmd` (PE_CPU → PE_IPCQ) + +```python +@dataclass(frozen=True) +class IpcqRecvCmd: + direction: str | None # None이면 round-robin (weak fairness, D4) + # recv_mode: 두 가지 동작 모드 + # "return_slot" — slot 주소를 그대로 PE 커널에 반환 (default, zero-copy) + # "copy_to_dst" — slot 데이터를 dst_addr에 copy 후 반환 + recv_mode: str = "return_slot" + # dst_addr / dst_space는 recv_mode="copy_to_dst"일 때만 사용됨 + dst_addr: int = 0 + dst_space: str = "" + shape: tuple[int, ...] = () # data shape (op_log/MemoryStore용) + dtype: str = "" + handle_id: str = "" + blocking: bool = True # blocking vs non-blocking + data_op: bool = True +``` + +#### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm 채널) + +D9의 piggyback 모델에 따라 token이 data + head metadata를 함께 담아 +fabric을 따라 이동한다. 수신 측 PE_DMA가 도착 시점에 data를 dst_addr에 +write하고 metadata를 PE_IPCQ로 forward한다 (atomic). + +```python +@dataclass +class IpcqDmaToken: + # ── Data movement (single-hop DMA write) ── + src_addr: int # 자기 메모리 주소 + src_space: str + dst_addr: int # peer rx slot 주소 (이미 계산됨) + dst_endpoint: IpcqEndpoint # 라우팅용 (sip/cube/pe) + nbytes: int # data 크기 + handle_id: str # 완료 시 송신 측 PE_IPCQ로 알림 + + # ── Piggyback metadata (수신측 PE_IPCQ가 자동 갱신할 정보) ── + sender_seq: int # 단조 증가 sequence number + # peer가 자기 head_cache로 사용 + src_sip: int # 송신 측 (수신측이 어느 peer인지 식별) + src_cube: int + src_pe: int + src_direction: str # 송신측 기준 방향 (수신측은 reverse 매핑으로 자기 direction 결정) + + data_op: bool = True # ADR-0020 op_log 기록 대상 +``` + +PE_DMA는 token type으로 채널 결정 (D8): TileToken → vc_compute, IpcqDmaToken → vc_comm. + +**수신 측 PE_DMA의 처리** (vc_comm 도착 시): + +```python +def _vc_comm_arrival(self, env, token: IpcqDmaToken): + # 1. data를 dst_addr에 write (data와 metadata atomic visibility) + if self._memory_store is not None: + data = self._memory_store.read(token.src_space, token.src_addr, + shape=..., dtype=...) 
+ self._memory_store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data) + # 2. metadata를 자기 PE의 IPCQ로 forward (PE 내부 wire, 같은 step) + yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token)) +``` + +PE_IPCQ는 `IpcqMetaArrival`을 받아 sender_seq를 보고 peer_head_cache를 갱신한다. + +#### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path 채널) + +Credit return은 D9의 fast path 모델에 따라 vc_comm fabric을 거치지 않고 +**별도의 SimPy Store 채널**로 전달된다. backend init 시 양방향 channel이 +미리 wire되며, latency는 bottleneck-BW 기반으로 계산된다. + +```python +@dataclass(frozen=True) +class IpcqCreditMetadata: + """Credit return — recv 측 → send 측 fast path.""" + consumer_seq: int # my_tail (recv 측의 새 tail) + src_sip: int # 누가 보냈는지 (수신 측이 어느 peer credit인지 식별) + src_cube: int + src_pe: int + src_direction: str # 송신 측 기준 방향 (수신 측은 reverse 매핑) +``` + +**전송 흐름**: + +```python +class PeIpcqComponent: + def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns): + yield env.timeout(latency_ns) + yield peer_credit_store.put(IpcqCreditMetadata( + consumer_seq=my_tail, src_sip=..., src_cube=..., src_pe=..., + src_direction=..., + )) +``` + +`latency_ns`는 D9에 정의된 대로: + +```python +path = self.ctx.router.find_path(self_pe_prefix, peer_pe_prefix) +latency_ns = self.ctx.compute_drain_ns(path, credit_size_bytes) +``` + +**별도의 IpcqPtrUpdate 이벤트는 없다** — head 갱신은 D9 piggyback 모델로, +tail 갱신은 D9 fast path SimPy Store 채널로 처리된다. + +### D13. 테스트 전략 + +ADR-0021의 D8 패턴을 따라 단위/통합/regression 테스트를 명시한다. + +#### T1. 
단위 테스트 (component-level) + +- **PE_IPCQ 단위** (`tests/test_pe_ipcq.py`): + - send: backpressure 미발생 시 즉시 PE_DMA로 token forward + - send: peer slot full → backpressure (poll/sleep 모드별) + - send: peer credit return (IpcqCreditMetadata) 도착 후 backpressure 해제 + - recv: 데이터 도착 시 즉시 반환 + - recv: 데이터 미도착 → wait → IpcqMetaArrival (D9 piggyback) 수신 시 wake + - recv (round-robin): 4-방향 중 도착한 첫 데이터 반환 (weak fairness) + - 잘못된 방향 → IpcqInvalidDirection 예외 + +- **PE_DMA virtual channel** (`tests/test_pe_dma_vc.py`): + - vc_compute / vc_comm 독립 진행 (한 채널 stall 시 다른 채널 진행) + - chunk-level 인터리브 verification + - link BW 분할 (50/50 또는 weighted) + +- **builtin topology** (`tests/test_ccl_topologies.py`): + - ring_1d/mesh_2d/tree_binary 각각 (rank, world_size) → neighbor dict 정합성 + - mesh_2d non-square → ValueError + - resolve_topology(custom, module) → module.neighbors 반환 + +#### T2. 통합 테스트 (E2E send/recv) + +- **`tests/test_ipcq_e2e.py`**: + - 2-rank ring: rank 0 send(E) → rank 1 recv(W) → 데이터 정합성 + - 4-rank ring: 양방향 send/recv 동시 진행, deadlock 없음 + - mesh_2d 4×4: N/S/E/W 4방향 동시 send/recv + +- **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`): + - greenlet 모드 + IPCQ → op_log에 ipcq 레코드 생성 검증 + - DataExecutor가 ipcq op 처리 후 결과 정합성 (`out.data` 확인) + +#### T3. Backend init 테스트 (`tests/test_ccl_backend_ipcq.py`) + +- ccl.yaml 로드 → `defaults.algorithm` 추출 +- builtin topology → IpcqInitMsg fan-out +- IpcqEndpoint의 PA가 모든 PE에서 일관 (rank A의 peer E의 rx_base_pa = rank A+1의 자기 rx_base_pa) +- buffer_kind 별 메모리 할당 (tcm/hbm/sram) + +#### T4. Regression + +- 기존 401 tests 전부 PASS +- ADR-0020 통합으로 인한 op_log/DataExecutor 영향 없음 (CCL 미사용 벤치) + +#### T5. 성능 / overhead + +- 단일 send/recv pair latency = (DMA latency) + (IPCQ overhead) +- 비교: 같은 nbytes의 일반 PE_DMA write와 거의 동일해야 함 (IPCQ overhead < 100 ns) + +### D14. Invariants & Failure Modes + +CCL 인프라에서 흔히 발생하는 hang/오류 상황을 명시하고, 대응 방식을 정의한다. + +#### Invariants (시뮬레이션이 보장해야 하는 것) + +I1. **Slot lifecycle exactly-once**: 한 send → 정확히 한 recv. 
중복 send나 + 중복 recv는 sequence 오류로 간주. + +I2. **Pointer monotonicity**: my_head, my_tail은 단조 증가 (감소 없음). + sender_seq는 송신 측에서 단조 증가, 수신 측 cache 갱신도 단조 증가. + +I3. **Endpoint consistency**: rank A의 IpcqEndpoint(direction=E)의 peer가 + rank B라면, rank B의 IpcqEndpoint(reverse(E))의 peer는 rank A여야 함. + backend init 시 검증. + +I4. **buffer_kind consistency**: 한 ProcessGroup 내 모든 PE의 buffer_kind는 + 동일 (mixed kind는 supported 안 함, 첫 구현). 검증 실패 시 init 에러. + +I5. **op_log ordering**: send → DMA 완료 → recv 가능. op_log의 t_start + 순서가 이 인과관계를 위배하지 않음. + +I6. **Atomic data + metadata visibility (MUST)**: 본 ADR의 correctness 핵심 + 조건이다. 수신 측에서 data write (MemoryStore.write)와 metadata forward + (peer_head_cache 갱신)는 동일한 SimPy step에 일어나야 한다. control이 + data를 앞지를 수 없다. + + **구현 규칙 (MUST)**: + - PE_DMA의 vc_comm token 도착 처리(`_vc_comm_arrival`)는 다음 두 동작 + 사이에 **어떤 SimPy yield도 두어서는 안 된다**: + 1. `MemoryStore.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)` + 2. PE_IPCQ에 `IpcqMetaArrival` forward + - 두 동작은 동일 SimPy event callback 내에서 연속 실행되어야 한다. + - 코드 리뷰에서 이 사이에 `yield` (또는 `yield from`)을 추가하는 것은 + correctness 위반으로 reject한다. + + 이 규칙을 위반하면 다른 SimPy process가 끼어들어 head_cache가 data + visibility보다 먼저 또는 늦게 보이는 race condition이 발생한다. + +I7. **MemoryStore slot existence ↔ pointer**: I6의 결과로, + `peer_head_cache > my_tail`이 truthy가 되는 step과 `MemoryStore.has(slot_addr)` + 이 truthy가 되는 step이 동일하다. recv는 두 조건을 모두 체크하지만 (defensive), + 단일 조건만 체크해도 정확하다. + +#### Failure Modes (런타임 에러) + +F1. **잘못된 direction**: + - PE 커널이 `tl.send(dir="X")` 호출 → install 안 된 direction + - PE_IPCQ가 즉시 `IpcqInvalidDirection` 예외 raise + - SimPy 시뮬레이션 즉시 abort, 사용자에게 명확한 에러 + +F2. **타입 mismatch**: + - send와 recv의 dtype/shape/nbytes가 일치하지 않음 + - 첫 구현은 검증 안 함 (dtype/shape는 hint), 향후 strict mode로 추가 + +F3. 
**Deadlock detection (timeout 기반)**: + - send: peer_tail_cache가 갱신 안 되고 영원히 wait + - recv: peer_head_cache 갱신 안 되고 영원히 wait + - 시뮬레이션 timeout (default 10ms simulated time) 초과 시 abort + - 디버그를 위해 각 PE의 last send/recv 위치, blocking 상태 dump + +F4. **Backend init 실패**: + - ccl.yaml에 `defaults.algorithm` 누락 + - `algorithms[name]` 정의 누락 + - 알고리즘 모듈 import 실패 + - topology 검증 실패 (I3, I4) + → 모두 `init_process_group` 시점에 즉시 에러 + +F5. **Slot full + 무한 backpressure**: + - peer가 영원히 안 받음 + - F3과 같이 timeout으로 처리 + - 디버그: 막힌 PE의 my_head, peer_tail_cache 출력 + +#### 진단 도구 (구현 단계에서 추가) + +- **CCL trace**: 각 send/recv를 (rank, t, dir, nbytes) 형태로 로깅 +- **Pointer dump**: 시뮬레이션 종료 시 또는 hang 시 모든 PE의 IPCQ pointer 상태 출력 +- **Deadlock graph**: hang 발생 시 wait-for 그래프 출력 (어느 PE가 어떤 PE를 기다리는지) + +### D15. 알고리즘 작성자 가이드 (요약) + +본 섹션은 알고리즘 작성자가 한 화면으로 시작점을 잡을 수 있도록 한다. +자세한 step-by-step 가이드는 [docs/ccl-author-guide.md](../ccl-author-guide.md) 참조. + +#### 만지는 것 / 만지지 않는 것 + +| 만지는 것 | 만지지 않는 것 | +|----------|---------------| +| `src/kernbench/ccl/algorithms/.py` (kernel + 선택적 neighbors) | `benches/ccl_allreduce.py` 호스트 코드 | +| `ccl.yaml` 의 한 entry 추가 + `defaults.algorithm` | `src/kernbench/ccl/` 프레임워크 | +| (선택) `tests/test_.py` 단위 테스트 | `src/kernbench/components/builtin/pe_ipcq.py` 컴포넌트 | +| | `src/kernbench/runtime_api/distributed.py` backend | + +#### 알고리즘 모듈 인터페이스 contract + +```python +# src/kernbench/ccl/algorithms/.py + +def kernel(*args, tl) -> None: + """필수. PE 커널. + + Args (positional): tensor pointers, rank, world_size, 알고리즘 파라미터 + Args (keyword): tl — TLContext (자동 주입) + + 사용 가능한 IPCQ API: + tl.send(dir, src_addr, nbytes) # blocking, backpressure 시 wait + tl.recv(dir) # 특정 방향에서 blocking recv + tl.recv() # 4방향 round-robin + tl.recv_async(dir) → handle # non-blocking + tl.wait(handle) # non-blocking 완료 대기 + + 기존 API도 그대로 사용: + tl.load / tl.store / tl.composite / tl.program_id 등 + """ + ... + +def neighbors(rank, world_size, neighbor_map) -> dict | None: + """선택. 
ccl.yaml의 builtin topology가 만든 neighbor_map을 override. + + None 반환 → builtin 그대로 사용 + dict 반환 → 그 dict로 override (builtin을 base로 수정 가능) + """ + return None +``` + +#### 5-step 흐름 + +1. **kernel 함수 작성** — `src/kernbench/ccl/algorithms/<name>.py` 신규 파일 +2. **ccl.yaml 등록** — `algorithms.<name>` entry + `defaults.algorithm` 변경 +3. **(선택) neighbors override** — builtin topology를 base로 수정이 필요할 때 +4. **단위 테스트** — `kernbench.ccl.testing.run_kernel_in_mock` (SimPy 없이 빠름) +5. **시뮬 검증** — `kernbench run --bench ccl_allreduce --verify-data` + +호스트 코드 (`benches/ccl_allreduce.py`)는 손대지 않는다. + +#### 사용 가능한 헬퍼 (`kernbench.ccl.helpers`) + +| Helper | 설명 | +|--------|------| +| `chunked(addr, n_chunks, ...)` | 텐서를 n개 chunk view로 슬라이싱 | +| `ring_step(rank, step, ws)` | ring algorithm의 step별 (send_idx, recv_idx) | +| `tree_step(rank, level)` | binary tree의 level별 parent/child 인덱스 | + +#### 디버깅 도구 + +- `KERNBENCH_CCL_TRACE=1` — send/recv trace 출력 +- 시뮬 종료 시 자동 IPCQ pointer dump +- Deadlock 시 (10ms 시뮬 시간 초과) wait-for graph dump + +#### 흔한 실수 + +1. **install 안 된 direction 사용** — ccl.yaml의 topology가 ring_1d면 N/S 사용 불가 +2. **send/recv 짝 맞지 않음** — peer 측 recv 없으면 hang (slot full backpressure) +3. **dtype/shape 불일치** — 첫 구현은 검증 안 함, 작성자 책임 + +자세한 step-by-step과 hello-world 예제는 `docs/ccl-author-guide.md` 참조. + +--- + +## Non-goals + +- **호스트 collective**: `dist.all_reduce`가 데이터 이동을 직접 수행하는 모델은 + 본 ADR 범위 외. 본 ADR은 PE 커널 안에서 일어나는 통신만 다룬다. +- **All-reduce 알고리즘**: ring/tree 등 알고리즘 자체는 별도 ADR (또는 커널 + 코드)에서 다룬다. 본 ADR은 인프라(IPCQ + VC)만 정의. +- **Reliability/error handling**: send/recv 실패, link 장애 등은 다루지 않음. +- **NoC arbiter 정밀 모델**: VC 간 dynamic contention은 첫 구현 범위 외 (D8). + +--- + +## Open Questions + +- **VC arbitration 정확도**: 첫 구현은 deterministic chunk interleave + + weighted round-robin. heavy contention 시나리오에서 실제보다 optimistic한 + 결과가 나올 수 있음. 정밀화 필요 시 별도 NoC arbiter component 도입을 검토. +- **Credit return fast path BW 모델**: 첫 구현은 fast path가 fabric BW + contention 모델에서 제외 (별도 lightweight wire 가정). 
정밀화 필요 시 + credit fast path를 별도 link로 모델링하거나, `credit_return_mode: piggyback` + 옵션 추가. +- **Ring buffer slot의 메모리 할당**: TCM/HBM/SRAM 어디에 두든 IPCQ가 알아야 + 할 metadata (base addr, slot_size, n_slots). init 시 호스트가 사이드밴드로 + 넣을지, fabric MmuMapMsg와 유사한 메시지로 넣을지 결정 필요. +- **VC 간 BW 분할 default**: 균등 분할(50/50)인지, weighted(예: 80% compute, + 20% comm)인지. ccl.yaml에 노출하되 default 값 결정 필요. +- **Direction 개수**: 4방향(N/S/E/W) 고정인지, 6방향(+ Up/Down for 3D), + 또는 가변 N개로 확장할지. 첫 구현은 4방향 고정. +- **다중 channel 데이터 구조 (multi-tile aggregation)**: 한 collective에서 + 여러 tile을 fan-out 받는 경우 기존 round-robin recv로 충분한지, 별도 + primitive(`tl.recv_all`)가 필요한지. +- **Round-robin recv fairness**: 첫 구현은 last_polled_dir 인덱스 기반 weak + fairness. 한 방향에 데이터가 항상 먼저 도착하면 starvation 가능. strict + fairness가 필요하면 별도 fairness counter 추가. +- **Deadlock detection 정밀화**: 첫 구현은 timeout 기반. 향후 wait-for graph + 실시간 추적으로 deterministic deadlock detection 가능. + +--- + +## Consequences + +### 긍정적 + +- PE 간 직접 통신 가능 → CCL 커널 작성 가능 +- 호스트는 launch만, 동기화는 PE 안에서 → 단순한 호스트 코드, 강한 + compute/comm overlap +- VC를 통해 HoL blocking 제거 → collective latency가 compute traffic에 + block되지 않음 +- Buffer 위치/backpressure 모드를 init 파라미터로 선택 가능 → 벤치마크 가능 +- 4-방향 logical neighbor → 호스트가 ring/mesh/tree 등 알고리즘 자유롭게 + 매핑 + +### 부정적 + +- 컴포넌트 1개 신규 추가 (PE_IPCQ), PE_DMA 재설계 (VC 추가) +- IPCQ 메모리 (8 ring × slot_size × n_slots) 만큼 PE-local 메모리 사용 +- VC arbitration 모델이 first-order approximation이므로 heavy contention + 시나리오에서 실제 HW보다 약간 optimistic한 latency 결과 가능 (D8 한계) +- VC chunk-level 인터리브로 PE_DMA 구현이 더 복잡해짐 + +--- + +## 영향받는 파일 + +| 파일 | 변경 | +|------|------| +| `topology.yaml` | pe_template에 pe_ipcq 추가, ipcq↔dma/cpu/tcm edge 추가 | +| `components.yaml` | pe_ipcq_v1 등록 | +| `src/kernbench/topology/builder.py` | PE 내부 edge에 ipcq 체인 추가 | +| `src/kernbench/components/builtin/pe_ipcq.py` | 신규 | +| `src/kernbench/components/builtin/pe_dma.py` | VC 추가, IpcqDmaToken 처리 | +| `src/kernbench/common/pe_commands.py` | IpcqSendCmd, IpcqRecvCmd, IpcqDmaToken 정의 | +| 
`src/kernbench/triton_emu/tl_context.py` | tl.send / tl.recv API | +| `src/kernbench/runtime_api/distributed.py` | ccl.yaml 로드, init 시 IPCQ install (eager) | +| `src/kernbench/runtime_api/kernel.py` | IpcqInitMsg (sideband) 정의 | +| `src/kernbench/ccl/__init__.py` | 신규 — CCL 패키지 | +| `src/kernbench/ccl/topologies.py` | 신규 — builtin topology generators (ring_1d, mesh_2d, tree_binary 등), `resolve_topology()` | +| `src/kernbench/ccl/helpers.py` | 신규 — 알고리즘 작성 헬퍼 (chunked, ring_step 등) | +| `src/kernbench/ccl/testing.py` | 신규 — mock CCL runtime (`run_kernel_in_mock`) | +| `ccl.yaml` | 신규 — 알고리즘 metadata + IPCQ default 설정 | +| `src/kernbench/ccl/algorithms/ring_allreduce.py` | 신규 — 첫 알고리즘 예제 | +| `tests/test_pe_ipcq.py` | 신규 — PE_IPCQ 단위 테스트 | +| `tests/test_pe_dma_vc.py` | 신규 — PE_DMA virtual channel 테스트 | +| `tests/test_ipcq_e2e.py` | 신규 — send/recv end-to-end 테스트 | +| `tests/test_ccl_topologies.py` | 신규 — builtin topology generator 단위 테스트 | diff --git a/docs/ccl-author-guide.en.md b/docs/ccl-author-guide.en.md new file mode 100644 index 0000000..e2e62f9 --- /dev/null +++ b/docs/ccl-author-guide.en.md @@ -0,0 +1,592 @@ +# CCL Algorithm Author Guide (English) + +This document is a step-by-step guide for engineers writing CCL +(Collective Communication Library) algorithms in kernbench. The +internal system design and component structure live in +[ADR-0023](adr/ADR-0023-ipcq-pe-collective.md). + +The goal here is to clearly separate **what an algorithm author has to +touch** from **what they can leave alone**, and to get a first +algorithm running through the shortest possible path. + +--- + +## 0. 
Five-minute tour + +| Things you touch | Location | +|------------------|----------| +| Algorithm module (kernel + optional `neighbors()`) | `src/kernbench/ccl/algorithms/<name>.py` | +| Algorithm registration | `ccl.yaml` | +| Host bench (rank count, init, launch, verify) | `benches/<bench>.py` | +| (Optional) unit test | `tests/test_<name>.py` | + +| Things you do NOT touch | Location | +|--------------------------|----------| +| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) | +| Framework (topology generators, helpers, mock testing) | `src/kernbench/ccl/` | +| PE_IPCQ / PE_DMA components | `src/kernbench/components/builtin/` | +| Backend implementation (`install_ipcq`) | `src/kernbench/runtime_api/distributed.py` and `src/kernbench/ccl/install.py` | + +Workflow: +1. Write a `kernel` function in the algorithm module. +2. Register an entry in `ccl.yaml`. +3. Write a host bench using `torch.distributed.init_process_group` / + `torch.distributed.all_reduce` (the unified `benches/ccl_allreduce.py` + handles the common case). +4. (Optional) Run the mock runtime for fast unit tests (a few ms). +5. `kernbench run --bench <bench> --verify-data` for full SimPy verification. + +--- + +## 1. Hello World — the simplest send/recv + +Each PE sends its tile to its E neighbor once and receives a tile from +its W neighbor once. The reference code lives in +[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py). + +### Step 1: write the kernel + +New file `src/kernbench/ccl/algorithms/hello_send.py`: + +```python +"""Hello world: send your tile to the next rank, receive from the previous one.""" + + +def kernel(t_ptr, n_elem, tl): + # Global rank is computed from program_id(0/1) (ADR-0022). + local_pe = tl.program_id(axis=0) + cube_id = tl.program_id(axis=1) + pes_per_cube = tl.num_programs(axis=0) + rank = cube_id * pes_per_cube + local_pe + + nbytes = n_elem * 2 # f16 + pe_addr = t_ptr + rank * nbytes + + # Load our slice and send it east. 
+ src = tl.load(pe_addr, shape=(n_elem,), dtype="f16") + tl.send(dir="E", src=src) + + # Receive from west and store directly back into our slice. + recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + tl.store(pe_addr, recv) + + +def kernel_args(world_size: int, n_elem: int) -> tuple: + """Positional kernel args used by the ahbm backend (after t_ptr).""" + return (n_elem,) +``` + +Key points: + +- **Global rank is computed from `program_id(axis=0)` + `program_id(axis=1)`.** + TL has no contractually-supported `tl.rank` / `tl.world_size`. If the + host needs to pass `world_size` or anything else as an algorithm + parameter, it goes through ordinary `torch.launch` arguments. +- **`tl.send` takes a `TensorHandle`.** PE_IPCQ reads + `addr`/`space`/`shape`/`dtype`/`nbytes` from the handle to issue an + `IpcqDmaToken` to PE_DMA. +- **`tl.recv` requires `shape` and `dtype`.** The returned TensorHandle + points at the IPCQ ring slot and can be used directly as a `dst` + handle (e.g. `tl.store(pe_addr, recv)`). Phase 2's `dma_write` replay + handles the (slot → hbm) copy, so user code never has to touch + `recv.data`. + +### Step 2: register in `ccl.yaml` + +```yaml +algorithms: + hello_send: + module: kernbench.ccl.algorithms.hello_send + topology: ring_1d + buffer_kind: tcm + world_size: 8 +``` + +`world_size` here is optional. If absent, `AhbmCCLBackend` derives it +from the topology spec (`sips × cubes_per_sip × pes_per_cube`). + +### Step 3: write a host bench (optional — the unified bench may suffice) + +For most CCL benchmarks the existing `benches/ccl_allreduce.py` is +sufficient: it reads `ccl.yaml`, picks the algorithm, sets up the +process group, and runs the collective. If your algorithm needs custom +host logic, write a new bench file along the same lines. 
+ +The host code looks like a real PyTorch DDP worker: + +```python +"""benches/ccl_hello.py""" +from __future__ import annotations + +import numpy as np + +from kernbench.policy.placement.dp import DPPolicy + + +N_ELEM = 8 + + +def worker(rank: int, world_size: int, torch) -> None: + """Per-rank business logic — mirrors a real PyTorch DDP worker.""" + dp = DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, num_pes=world_size, + ) + tensor = torch.zeros( + (1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in", + ) + + # Per-rank initialization via the real PyTorch idiom. + init = np.zeros((1, world_size * N_ELEM), dtype=np.float16) + for r in range(world_size): + init[0, r * N_ELEM : (r + 1) * N_ELEM] = float(r + 1) + tensor.copy_(torch.from_numpy(init)) + + # The collective itself. + torch.distributed.all_reduce(tensor, op="sum") + + # Verify on rank 0 (real PyTorch DDP idiom). + if rank == 0: + result = tensor.numpy() + for r in range(world_size): + expected = float(((r - 1) % world_size) + 1) + slice_r = result[0, r * N_ELEM : (r + 1) * N_ELEM] + print( + f" rank {r}: got {float(slice_r.mean()):.1f}, " + f"expected {expected:.1f}" + ) + + +def run(torch) -> None: + """CLI entry point. 
Initializes dist, dispatches to worker.""" + dist = torch.distributed + dist.init_process_group(backend="ahbm") + worker( + rank=dist.get_rank(), + world_size=dist.get_world_size(), + torch=torch, + ) +``` + +### Step 4: unit test (optional but strongly recommended) + +`tests/test_hello_send.py`: + +```python +import numpy as np + +from kernbench.ccl.algorithms.hello_send import kernel +from kernbench.ccl.testing import run_kernel_in_mock + + +def test_hello_send_4_ranks(): + n_elem = 8 + inputs = [ + np.full((n_elem,), float(r + 1), dtype=np.float16) + for r in range(4) + ] + outputs = run_kernel_in_mock( + kernel_fn=kernel, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem,), + ) + # rank r should now hold rank (r-1) % 4's data. + for r in range(4): + assert np.array_equal(outputs[r], inputs[(r - 1) % 4]) +``` + +`run_kernel_in_mock` runs every rank concurrently in pure Python (no +SimPy), so a unit test like this finishes in **milliseconds**. It only +verifies algorithmic correctness — no latency, no DMA, no fabric. + +### Step 5: SimPy validation + +```bash +kernbench run --topology topology.yaml --bench ccl_hello --verify-data +``` + +Phase 1 runs the SimPy simulation + MemoryStore data movement, Phase 2 +replays the op_log for correctness. The bench's `print` lines should +show matching `got` / `expected` values for every rank. + +--- + +## 2. Ring all-reduce — the second algorithm + +Slightly more complex. Each PE runs `world_size - 1` rounds, sending +its current tile east and accumulating the tile received from the west. +After all rounds, every PE holds the global sum. + +The reference implementation lives in +[`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py). 
+The core flow: + +```python +"""Ring all-reduce.""" + + +def kernel(t_ptr, n_elem, world_size, tl): + local_pe = tl.program_id(axis=0) + cube_id = tl.program_id(axis=1) + pes_per_cube = tl.num_programs(axis=0) + rank = cube_id * pes_per_cube + local_pe + nbytes = n_elem * 2 + pe_addr = t_ptr + rank * nbytes + + # The handle points at HBM[pe_addr]. In greenlet mode .data is + # populated, but the kernel never has to touch .data directly. + acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16") + current = acc # source for the first send + + for _step in range(world_size - 1): + tl.send(dir="E", src=current) + recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + # TensorHandle operator overload → MathCmd → PE_MATH dispatch. + # Phase 1 only models timing; Phase 2 DataExecutor replays the + # actual numpy accumulation. + acc = acc + recv + current = recv # forward the received slot to the next round + + # Store the final accumulator back to HBM. Source is acc (a PE-local + # scratch addr); dst is HBM. The op_log dma_write entry records both + # ends so Phase 2 copies the math result into HBM at verify time. + tl.store(pe_addr, acc) + + +def kernel_args(world_size: int, n_elem: int) -> tuple: + return (n_elem, world_size) +``` + +Four key points: + +1. **Accumulation goes through TensorHandle operators.** `acc + recv` + emits a `MathCmd` and dispatches it through PE_MATH — i.e. the + real hardware path, so the latency model stays accurate. Per + ADR-0020 D3, Phase 1 only simulates timing; Phase 2's `DataExecutor` + replays the op_log and runs the actual numpy accumulation. +2. **Use `current = recv` to forward.** Each round must update the send + source to the just-received slot handle so the same data circulates + exactly once around the ring. Setting `current = acc` would resend + the cumulative sum, inflating the result. +3. **`tl.store(pe_addr, acc)` exactly once at the end.** Do not use a + store→reload pattern in the middle. 
`acc` lives in PE-local scratch; + the op_log records `(src=scratch, dst=hbm)` and Phase 2 first runs + math (filling scratch) then copies via the dma_write snapshot. +4. **`world_size` is passed by the host explicitly.** TL only knows the + topology slot count (e.g. `num_programs(axis=0)` is "PEs per cube"), + not the participating CCL group size. The host bench knows + `world_size` and forwards it as an explicit kernel argument. + +For registration in `ccl.yaml` and wiring through the unified bench, +look at the existing `ring_allreduce_tcm/_hbm/_sram` entries plus +[`benches/ccl_allreduce.py`](../benches/ccl_allreduce.py). Mock unit +tests live in +[`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py) +and follow the `kernel_args=(n_elem, world_size)` convention. + +--- + +## 3. `neighbors()` override — custom topology + +Most algorithms are happy with the builtin topologies (`ring_1d`, +`mesh_2d`, `tree_binary`, `ring_1d_unidir`, `none`). If you want to +modify a builtin or define a brand-new connectivity pattern, define a +`neighbors()` function in your algorithm module. + +### Signature + +```python +def neighbors( + rank: int, world_size: int, neighbor_map: dict[str, int], +) -> dict[str, int] | None: + """Override the neighbor map produced by the builtin topology. + + Args: + neighbor_map: the mapping the ccl.yaml ``topology`` field built. + For ring_1d this is {"E": (rank+1)%ws, "W": (rank-1)%ws}. + The dict is mutable — modify in place if you want. + + Returns: + dict: the new neighbor map (or the modified-in-place dict). + None: do not override; use neighbor_map as-is. + """ + return None +``` + +### Pattern A: tweak a builtin + +```python +def neighbors(rank, world_size, neighbor_map): + # Only even ranks use W; remove W from odd ranks. 
+ if rank % 2 == 1: + neighbor_map.pop("W", None) + return neighbor_map +``` + +### Pattern B: replace entirely (skip-connection ring) + +```python +def neighbors(rank, world_size, neighbor_map): + return {"E": (rank + 2) % world_size} +``` + +### Pattern C: keep builtin + +Either omit `neighbors` entirely or return None: + +```python +def neighbors(rank, world_size, neighbor_map): + return None # explicit "use the builtin" +``` + +--- + +## 4. PE kernel API reference (ADR-0023 D4) + +### IPCQ API + +| API | Description | Blocking? | +|-----|-------------|-----------| +| `tl.send(dir, src=TensorHandle)` | Send to a peer in the given direction. | Yes (waits if peer slots are full) | +| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | Same, keyword form. | Yes | +| `tl.recv(dir, shape=..., dtype=...)` | Blocking recv from one direction. | Yes | +| `tl.recv(shape=..., dtype=...)` | Round-robin recv across all four directions. | Yes | +| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | Non-blocking recv. | No | +| `tl.wait(future)` | Wait for a non-blocking recv future → returns the resolved TensorHandle. | Yes | + +### Existing TL API (ADR-0020/0022, unchanged) + +| API | Description | +|-----|-------------| +| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; in greenlet mode `.data` carries the ndarray. | +| `tl.store(addr, handle)` | DMA write — when `handle.data` is set the runner propagates it to MemoryStore. | +| `tl.composite(op, ...)` | Submit a GEMM/Math composite (non-blocking). | +| `tl.program_id(axis=0)` | Local PE id within the cube. | +| `tl.program_id(axis=1)` | Cube id (ADR-0022). | +| `tl.num_programs(axis=0/1)` | Topology slot counts (NOT the participating-rank count). | + +### Two recv modes + +The default is `return_slot` (zero-copy): the IPCQ slot address is +returned in `handle.addr`. 
To force a copy into a custom destination, +pass `dst_addr` + `dst_space`: + +```python +recv = tl.recv( + dir="W", shape=(8,), dtype="f16", + dst_addr=my_scratch_addr, + dst_space="hbm", +) +# After this call recv.addr == my_scratch_addr (copy_to_dst mode). +``` + +--- + +## 5. Helpers (`kernbench.ccl.helpers`) + +Convenience helpers to keep algorithm code short: + +```python +from kernbench.ccl.helpers import chunked, ring_step, tree_step +``` + +### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]` + +Split a tile of `n_elem` elements into `n_chunks` equal-size views. +Each `Chunk` has `addr`, `n_elem`, `nbytes` fields. + +```python +chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16") +# chunks[0..3] are 16-element views with consecutive addresses. +``` + +### `ring_step(rank, step, world_size) → (send_idx, recv_idx)` + +Per-step chunk indices for a ring algorithm (reduce-scatter / all-gather): + +```python +for step in range(world_size - 1): + send_idx, recv_idx = ring_step(rank, step, world_size) + tl.send( + dir="E", src_addr=chunks[send_idx].addr, + nbytes=chunks[send_idx].nbytes, + shape=(chunks[send_idx].n_elem,), dtype="f16", + ) + recv = tl.recv( + dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16", + ) + # accumulate ... +``` + +### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}` + +Parent / children rank ids for a binary tree: + +```python +info = tree_step(rank, world_size) +if info["parent"] is None: + print(f"rank {rank} is the root") +for child in info["children"]: + ... +``` + +--- + +## 6. Unit testing — Mock runtime + +`kernbench.ccl.testing.run_kernel_in_mock` runs an algorithm without +SimPy for fast feedback. 
+ +### Basic usage + +```python +import numpy as np + +from kernbench.ccl.testing import run_kernel_in_mock +from kernbench.ccl.algorithms.my_algo import kernel + + +def test_my_algo(): + n_elem = 16 + inputs = [np.arange(n_elem, dtype="f16") + r for r in range(4)] + expected = sum(inputs) + outputs = run_kernel_in_mock( + kernel_fn=kernel, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem, 4), # positional args after t_ptr + ) + for r in range(4): + assert np.allclose(outputs[r], expected, rtol=1e-3) +``` + +### Behavior + +- All ranks run their kernels concurrently as cooperative greenlets. +- `tl.send` / `tl.recv` are serviced by in-memory FIFOs (no DMA, no + latency). +- Each rank's last `store` is what the helper returns as a numpy array. + +### Limitations + +- No latency or performance numbers (it is not a simulation). +- No PE_DMA, fabric, or BW model. +- Correctness only. +- One cube assumed: `program_id(axis=1)` is always 0. + +--- + +## 7. Debugging + +### CCL trace + +```bash +KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \ + --bench ccl_allreduce --verify-data +``` + +Per-rank send/recv events appear on stdout: + +``` +[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0 +[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64 +``` + +### Pointer dump + +`kernbench.ccl.diagnostics.pointer_dump(engine)` returns a multi-line +dump of every PE_IPCQ ring buffer's `my_head`, `my_tail`, +`peer_head_cache`, `peer_tail_cache`. When something hangs, this shows +which rank is stuck and on what. + +### Deadlock detection + +When the SimPy schedule empties because of unmatched send/recv pairs, +the engine raises `IpcqDeadlock` and embeds the pointer dump in the +message (ADR-0023 D14 F3). Wait-for-graph visualization is future +work. + +--- + +## 8. Common mistakes + +### 1. Using a direction that wasn't installed + +`topology: ring_1d` only installs E and W. Trying: + +```python +tl.send(dir="N", ...) 
# → IpcqInvalidDirection +``` + +Fix: switch to `topology: mesh_2d`, or add N/S in a `neighbors()` override. + +### 2. `send` without a matching `recv` + +```python +def kernel(..., tl): + for _ in range(100): + tl.send(dir="E", ...) + # The peer never recvs → ring buffer fills → backpressure → deadlock. +``` + +Fix: every `send` needs a matching `recv` on the receiver side. +Otherwise `IpcqDeadlock` is raised. + +### 3. dtype/shape mismatch + +By default mismatches are not validated. The author is responsible for +consistency. Set `strict_validation: true` on a PE_IPCQ node's attrs to +enable D14 F2 strict mode and catch them immediately. + +### 4. Assuming round-robin recv fairness + +`tl.recv()` (no direction) returns the first slot to arrive in +round-robin order, but **arrival order is not predictable**. If your +algorithm depends on a particular direction, name it explicitly: +`tl.recv(dir="N", ...)`. + +### 5. Confusing `num_programs` with the CCL group size + +`tl.num_programs(axis=0/1)` reports topology slot counts, not the +number of ranks participating in the collective. The host bench knows +`world_size` and must pass it through as a kernel argument. + +### 6. Overwriting the send source before it's actually sent + +PE_DMA snapshots the source data into the IpcqDmaToken at send time, +preserving in-flight semantics. Even so, the safest pattern is to call +`tl.send` first and only mutate the source addr afterwards. If you +mutate the addr before `tl.send` makes it into the PE_DMA queue, the +snapshot will pick up the wrong data. + +--- + +## 9. Next steps + +- Try other topologies (`mesh_2d`, `tree_binary`). +- Faster algorithms (recursive halving / doubling). +- Compare `buffer_kind` (tcm/hbm/sram) and `backpressure` (poll/sleep) + modes for latency. +- Larger-scale validation through the unified `ccl_allreduce` bench + with different `ccl.yaml` overlays. + +If you add a new algorithm or pattern, please send a PR. 
+ +--- + +## References + +- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective design. +- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1). +- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution. +- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor. + +Existing algorithm examples: + +- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — simplest send/recv +- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce +- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce +- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce diff --git a/docs/ccl-author-guide.md b/docs/ccl-author-guide.md new file mode 100644 index 0000000..4fa7cb4 --- /dev/null +++ b/docs/ccl-author-guide.md @@ -0,0 +1,537 @@ +# CCL Algorithm Author Guide + +이 문서는 kernbench에서 CCL (Collective Communication Library) 알고리즘을 +직접 작성하는 사람을 위한 step-by-step 가이드이다. 시스템 내부 설계와 +컴포넌트 구조는 [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md)에 있다. + +본 가이드는 알고리즘 작성자가 **자신이 만져야 할 곳**과 **만지지 않아도 될 곳**을 +명확히 분리하고, 가장 짧은 경로로 첫 알고리즘을 동작시키는 것을 목표로 한다. + +--- + +## 0. 5분 요약 + +| 만지는 것 | 위치 | +|----------|------| +| 알고리즘 모듈 (kernel + 선택적 neighbors) | `src/kernbench/ccl/algorithms/.py` | +| 알고리즘 등록 | `ccl.yaml` | +| 호스트 bench (PE 수, 메모리 init, launch, 검증) | `benches/.py` | +| (선택) 단위 테스트 | `tests/test_.py` | + +| 만지지 않는 것 | 위치 | +|---------------|------| +| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) | +| 프레임워크 (topology generators, helpers, mock testing) | `src/kernbench/ccl/` | +| PE_IPCQ / PE_DMA 컴포넌트 | `src/kernbench/components/builtin/` | +| backend 구현 (install_ipcq) | `src/kernbench/runtime_api/distributed.py` 및 `kernbench/ccl/install.py` | + +흐름: +1. 
알고리즘 모듈에 `kernel` 작성 +2. `ccl.yaml`에 entry 등록 +3. 호스트 bench에서 `install_ipcq` + `launch` +4. (선택) mock runtime으로 단위 테스트 (수 ms) +5. `kernbench run --bench --verify-data`로 SimPy 검증 + +--- + +## 1. Hello World — 가장 단순한 send/recv + +각 PE가 자기 데이터를 E 방향 이웃에 한 번 보내고, W 방향에서 한 번 받는 +가장 단순한 알고리즘이다. 실제 동작 코드는 +[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) +에 있다. + +### Step 1: kernel 작성 + +새 파일 `src/kernbench/ccl/algorithms/hello_send.py`: + +```python +"""Hello world: 자기 데이터를 다음 rank에 보내고 이전 rank에서 받기.""" +def kernel(t_ptr, n_elem, tl): + # 글로벌 rank는 program_id(0/1)에서 계산 (ADR-0022) + local_pe = tl.program_id(axis=0) + cube_id = tl.program_id(axis=1) + pes_per_cube = tl.num_programs(axis=0) + rank = cube_id * pes_per_cube + local_pe + + nbytes = n_elem * 2 # f16 + pe_addr = t_ptr + rank * nbytes + + # 자기 슬라이스를 로드해서 E로 보낸다. + src = tl.load(pe_addr, shape=(n_elem,), dtype="f16") + tl.send(dir="E", src=src) + + # W 방향에서 받아서 그대로 자기 슬라이스에 store한다. + recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + tl.store(pe_addr, recv) +``` + +핵심 포인트: + +- **글로벌 rank는 `program_id(axis=0)` + `program_id(axis=1)`에서 계산.** TL에는 + `tl.rank` / `tl.world_size` 같은 약속되지 않은 확장이 없다. 호스트가 + `world_size` 같은 알고리즘 파라미터가 필요하면 `torch.launch`의 일반 인자로 + 전달한다. +- **`tl.send`는 `TensorHandle`을 받는다.** 핸들의 `addr`/`space`/`shape`/`dtype`/`nbytes`를 + PE_IPCQ가 읽어 PE_DMA에 IpcqDmaToken을 발행한다. +- **`tl.recv`는 `shape`와 `dtype`이 필수.** 반환된 TensorHandle은 IPCQ ring slot을 + 가리키며, `tl.store(pe_addr, recv)`처럼 dst 핸들로 그대로 사용할 수 있다. + Phase 2 dma_write replay가 (slot, hbm) 복사를 수행하므로 numpy `.data`를 + 직접 만질 필요가 없다. + +### Step 2: ccl.yaml 등록 + +`ccl.yaml`의 `algorithms` 섹션에 entry를 추가한다. (defaults.algorithm은 호스트 +bench가 `install_ipcq(algorithm=...)`로 명시 전달해도 되므로 꼭 바꿀 필요는 없다.) 
+ +```yaml +algorithms: + hello_send: + module: kernbench.ccl.algorithms.hello_send + topology: ring_1d + buffer_kind: tcm +``` + +### Step 3: 호스트 bench 작성 + +새 파일 `benches/ccl_hello.py`: + +```python +"""Hello-world ring rotation bench (각 PE가 W 이웃의 데이터를 1번 받음).""" +import numpy as np + +from kernbench.ccl.algorithms import hello_send +from kernbench.policy.placement.dp import DPPolicy + +ALGORITHM = "hello_send" +N_ELEM = 8 +WORLD_SIZE = 8 + + +def run(torch): + plan = torch.install_ipcq(algorithm=ALGORITHM) + + a = torch.zeros( + (1, WORLD_SIZE * N_ELEM), dtype="f16", + dp=DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, + ), + name="hello_in", + ) + + store = torch.engine.memory_store + base = a._handle.va_base or a._handle.shards[0].pa + nbytes = N_ELEM * 2 + for r in range(WORLD_SIZE): + store.write("hbm", base + r * nbytes, + np.full((N_ELEM,), float(r + 1), dtype=np.float16)) + + torch.launch(ALGORITHM, hello_send.kernel, a, N_ELEM) + + # rank r은 rank (r-1)%ws의 데이터를 가져야 한다. 
+ for r, (sip, cube, pe) in enumerate(plan["rank_to_pe"]): + result = store.read("hbm", base + r * nbytes, shape=(N_ELEM,), dtype="f16") + prev = float(((r - 1) % WORLD_SIZE) + 1) + ok = np.allclose(result, prev) + print(f" [{'OK ' if ok else 'FAIL'}] rank {r} got {float(result.mean()):.1f}, " + f"expected {prev:.1f}") +``` + +### Step 4: 단위 테스트 (선택, 강력 추천) + +`tests/test_hello_send.py`: + +```python +import numpy as np +from kernbench.ccl.algorithms.hello_send import kernel +from kernbench.ccl.testing import run_kernel_in_mock + + +def test_hello_send_4_ranks(): + n_elem = 8 + inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)] + + outputs = run_kernel_in_mock( + kernel_fn=kernel, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem,), + ) + + # rank r은 rank (r-1) % 4의 데이터를 받아야 함 + for r in range(4): + assert np.array_equal(outputs[r], inputs[(r - 1) % 4]) +``` + +`run_kernel_in_mock`는 SimPy 없이 순수 Python으로 모든 rank를 동시 실행하므로 +**ms 단위로 끝난다**. 알고리즘 logic 정합성만 검증. + +### Step 5: 시뮬 검증 + +```bash +kernbench run --topology topology.yaml --bench ccl_hello --verify-data +``` + +Phase 1에서 SimPy 시뮬레이션 + MemoryStore 데이터 이동, Phase 2에서 op_log +정합성 replay. 호스트 bench의 `print` 검증이 모든 rank에 대해 OK여야 한다. + +--- + +## 2. Ring All-Reduce — 두 번째 알고리즘 + +조금 더 복잡한 예제. Ring all-reduce는 N-1 라운드 동안 각 PE가 자기 데이터를 +E로 보내고 W에서 받아 누적한다. 최종적으로 모든 PE가 글로벌 sum을 갖는다. + +실제 동작 코드는 [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) +참조. 핵심 흐름: + +```python +"""Ring all-reduce.""" + + +def kernel(t_ptr, n_elem, world_size, tl): + # rank + local_pe = tl.program_id(axis=0) + cube_id = tl.program_id(axis=1) + pes_per_cube = tl.num_programs(axis=0) + rank = cube_id * pes_per_cube + local_pe + nbytes = n_elem * 2 + pe_addr = t_ptr + rank * nbytes + + # HBM의 자기 슬라이스를 가리키는 TensorHandle. greenlet 모드에선 .data가 + # 채워지지만 커널은 .data를 직접 만질 필요가 없다. 
+ acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16") + current = acc # 첫 라운드 send 출처 + + for _step in range(world_size - 1): + tl.send(dir="E", src=current) + recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16") + # TensorHandle 연산자 오버로드 → MathCmd → PE_MATH 디스패치. + # Phase 1은 타이밍만, Phase 2 DataExecutor가 실제 numpy 누적을 수행한다. + acc = acc + recv + current = recv # 다음 라운드는 직전에 받은 슬롯을 다시 forward + + # 최종 누적값을 자기 슬라이스에 store. 출처는 acc(=PE-local scratch addr) + # 이고 dst는 HBM. op_log dma_write가 (scratch, hbm) 복사 정보를 기록하므로 + # Phase 2가 검증 시점에 HBM[pe_addr]에 정답을 채워준다. + tl.store(pe_addr, acc) +``` + +네 가지 포인트: + +1. **누적은 TensorHandle 연산자**: `acc + recv`는 `MathCmd`를 emit하고 + PE_MATH로 디스패치된다 — 실제 하드웨어 경로를 거치므로 latency 모델이 + 정확하다. ADR-0020 D3대로 Phase 1은 타이밍만 시뮬레이션하고, Phase 2 + `DataExecutor`가 op_log를 재실행하면서 numpy 누적을 수행한다. +2. **`current = recv`로 forward**: 매 라운드의 send 출처를 직전에 받은 슬롯 + 핸들로 갱신해야 같은 데이터가 ring을 순회하면서 누적이 한 번씩 일어난다. + `current = acc`로 두면 누적값이 다시 송출되어 결과가 부풀려진다. +3. **`tl.store(pe_addr, acc)` 한 번이면 끝**: 중간에 store→reload 패턴은 + 금지다. acc는 PE-local scratch에 살고, op_log가 (src=scratch, dst=hbm) + 메타데이터를 기록한다. Phase 2가 math를 먼저 실행해 scratch를 채운 뒤 + dma_write 스냅샷으로 HBM에 복사한다. +4. **`world_size`는 호스트가 명시 전달**: TL은 topology slot 수만 안다 (예: + `num_programs(axis=0)`은 cube당 PE 수). 실제 참여하는 CCL group 크기는 bench가 + 알고 호스트→kernel 인자로 넘긴다. + +`ccl.yaml` 등록 + 호스트 bench는 [`benches/ccl_allreduce_tcm.py`](../benches/ccl_allreduce_tcm.py) +참조. mock 단위 테스트는 [`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py) +를 그대로 따라하면 된다 (`kernel_args=(n_elem, world_size)` 인자 형태). + +--- + +## 3. neighbors() override — Custom topology + +대부분의 알고리즘은 builtin topology(`ring_1d`, `mesh_2d`, `tree_binary`, +`ring_1d_unidir`, `none`)로 충분하다. builtin을 변형하거나 새로 만들고 싶으면 +알고리즘 모듈에 `neighbors()`를 정의한다. + +### 시그니처 + +```python +def neighbors(rank: int, world_size: int, neighbor_map: dict[str, int]) -> dict[str, int] | None: + """builtin topology가 만든 neighbor_map을 override. 
+ + Args: + neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑. + 예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws} + mutable dict — 직접 수정 가능. + + Returns: + dict: neighbor_map을 override한 결과 (또는 수정한 그 dict) + None: override 안 함, neighbor_map 그대로 사용 + """ + return None +``` + +### Pattern A: builtin을 base로 일부만 수정 + +```python +def neighbors(rank, world_size, neighbor_map): + # 짝수 rank만 W 방향 사용 (홀수 rank는 W 제거) + if rank % 2 == 1: + neighbor_map.pop("W", None) + return neighbor_map +``` + +### Pattern B: 완전히 새로 작성 (skip-connection ring) + +```python +def neighbors(rank, world_size, neighbor_map): + # neighbor_map은 무시하고 새로 작성 + return {"E": (rank + 2) % world_size} +``` + +### Pattern C: builtin 사용, override 없음 + +`neighbors()` 함수를 정의하지 않거나 None을 반환: + +```python +def neighbors(rank, world_size, neighbor_map): + return None # 명시적으로 builtin 사용 +``` + +--- + +## 4. PE 커널 API 레퍼런스 (ADR-0023 D4) + +### IPCQ API + +| API | 설명 | Blocking? | +|-----|------|-----------| +| `tl.send(dir, src=TensorHandle)` | direction으로 데이터 send | Yes (peer slot full 시 wait) | +| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | 동일, keyword 형태 | Yes | +| `tl.recv(dir, shape=..., dtype=...)` | 특정 방향에서 blocking recv | Yes | +| `tl.recv(shape=..., dtype=...)` | 4방향 round-robin recv (방향 미지정) | Yes | +| `tl.recv_async(dir, shape=..., dtype=...) 
→ RecvFuture` | non-blocking recv | No | +| `tl.wait(future)` | non-blocking future 완료 대기 → TensorHandle | Yes | + +### 기존 TL API (ADR-0020/0022, 그대로 사용 가능) + +| API | 설명 | +|-----|------| +| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; greenlet 모드에서 `.data`에 ndarray | +| `tl.store(addr, handle)` | DMA write — handle.data가 있으면 MemoryStore에 propagate | +| `tl.composite(op, ...)` | GEMM/Math compute 비동기 submit | +| `tl.program_id(axis=0)` | cube 내 local PE id | +| `tl.program_id(axis=1)` | cube id (ADR-0022) | +| `tl.num_programs(axis=0/1)` | topology 슬롯 수 (참여 ranks 수가 아님) | + +### `recv` 두 가지 모드 + +기본은 `return_slot` (zero-copy): IPCQ slot 주소가 그대로 handle.addr에 들어온다. +slot 데이터를 별도 위치로 복사하고 싶으면 `dst_addr` + `dst_space`를 명시: + +```python +recv = tl.recv( + dir="W", shape=(8,), dtype="f16", + dst_addr=my_scratch_addr, + dst_space="hbm", +) +# 이제 recv.addr == my_scratch_addr (copy_to_dst 모드) +``` + +--- + +## 5. Helpers (`kernbench.ccl.helpers`) + +알고리즘 코드를 짧게 유지하기 위한 헬퍼들: + +```python +from kernbench.ccl.helpers import chunked, ring_step, tree_step +``` + +### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]` + +총 `n_elem` 개의 element를 `n_chunks` 등분한 view 리스트를 반환. 각 `Chunk`는 +`addr`, `n_elem`, `nbytes` 필드를 가진다. + +```python +chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16") +# chunks[0..3] 각각 16 element view, addr이 연속 +``` + +### `ring_step(rank, step, world_size) → (send_idx, recv_idx)` + +Ring algorithm의 step별 chunk 인덱스 (reduce-scatter / all-gather): + +```python +for step in range(world_size - 1): + send_idx, recv_idx = ring_step(rank, step, world_size) + tl.send(dir="E", src_addr=chunks[send_idx].addr, + nbytes=chunks[send_idx].nbytes, + shape=(chunks[send_idx].n_elem,), dtype="f16") + recv = tl.recv(dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16") + # accumulate ... 
+``` + +### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}` + +Binary tree의 parent/children rank: + +```python +info = tree_step(rank, world_size) +if info["parent"] is None: + print(f"rank {rank} is the root") +for child in info["children"]: + ... +``` + +--- + +## 6. 단위 테스트 — Mock Runtime + +`kernbench.ccl.testing.run_kernel_in_mock`은 SimPy를 거치지 않고 알고리즘을 +빠르게 검증할 수 있다. + +### 기본 사용법 + +```python +from kernbench.ccl.testing import run_kernel_in_mock +from kernbench.ccl.algorithms.my_algo import kernel +import numpy as np + + +def test_my_algo(): + n_elem = 16 + inputs = [np.arange(n_elem, dtype="f16") + r for r in range(4)] + expected = sum(inputs) + + outputs = run_kernel_in_mock( + kernel_fn=kernel, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem, 4), # kernel의 (t_ptr 이후) 추가 positional 인자 + ) + + for r in range(4): + assert np.allclose(outputs[r], expected, rtol=1e-3) +``` + +### 동작 + +- 4개 rank의 kernel을 greenlet으로 동시 실행 +- `tl.send/recv`를 in-memory FIFO로 즉시 처리 (DMA, latency 무시) +- 각 rank가 마지막에 store한 데이터를 ndarray로 반환 + +### 한계 + +- latency / 성능 측정 불가 (시뮬레이션이 아님) +- PE_DMA, fabric, BW 모델 안 함 +- 정합성 검증만 가능 +- 한 cube 안에서 동작하는 가정 — `program_id(axis=1)`은 항상 0 + +--- + +## 7. 디버깅 + +### CCL trace + +```bash +KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \ + --bench ccl_allreduce_tcm --verify-data +``` + +각 rank의 send/recv 시점이 stdout에 출력된다: + +``` +[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0 +[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64 +... +``` + +### Pointer dump + +`kernbench.ccl.diagnostics.pointer_dump(engine)`는 모든 PE_IPCQ의 ring buffer +상태(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`)를 multi-line +문자열로 반환한다. hang이 발생하면 어느 rank가 어떤 상태에서 막혔는지 한눈에 +보인다. + +### Deadlock detection + +매칭되지 않는 send/recv 등으로 SimPy 스케줄이 비면 engine이 `IpcqDeadlock`을 +던지며 pointer dump를 메시지에 포함시킨다 (ADR-0023 D14 F3). 별도 wait-for graph +시각화는 미래 작업. + +--- + +## 8. 흔한 실수 + +### 1. 
install 안 된 direction 사용 + +ccl.yaml의 `topology: ring_1d`는 E/W만 install한다. N/S 사용 시: + +```python +tl.send(dir="N", ...) # → IpcqInvalidDirection 예외 +``` + +해결: `topology: mesh_2d`로 바꾸거나, `neighbors()` override로 N/S 추가. + +### 2. send만 호출하고 recv 없음 + +```python +def kernel(..., tl): + for _ in range(100): + tl.send(dir="E", ...) + # peer 측 recv 없음 → ring buffer 가득 차면 backpressure → deadlock +``` + +해결: 모든 send에 짝이 되는 recv가 있어야 한다. 안 그러면 `IpcqDeadlock`이 +발생한다. + +### 3. dtype/shape 불일치 + +기본 모드에서는 dtype/shape mismatch를 검증하지 않는다. 작성자가 직접 보장하거나, +PE_IPCQ 노드 attrs에 `strict_validation: true`를 설정해 D14 F2 strict 모드로 +mismatch를 즉시 잡을 수 있다. + +### 4. round-robin recv의 fairness 가정 + +`tl.recv()` (방향 미지정)는 round-robin으로 가져오지만, 도착한 첫 슬롯을 반환한다. +**도착 순서를 알 수 없으므로** 알고리즘이 도착 방향에 의존하면 안 된다. +필요하면 `tl.recv(dir="N", ...)`처럼 명시. + +### 5. CCL 그룹 크기 가정 + +`tl.num_programs(axis=0/1)`은 토폴로지 슬롯 개수이지 CCL group 크기가 아니다. +참여하는 rank 수(`world_size`)는 호스트 bench가 알고 있고, kernel 인자로 명시 +전달해야 한다. + +### 6. 호스트가 send-source 메모리를 도착 전에 덮어씀 + +PE_DMA가 송신 시점에 src 데이터를 토큰에 스냅샷해서 in-flight 데이터의 의미가 +보존된다. 그래도 하나의 PE 안에서 같은 주소를 여러 step에 걸쳐 갱신할 때는 +direct send 후 다른 step에서 같은 주소를 store해도 안전하다 (token snapshot 덕분). +하지만 `tl.send`가 PE_DMA 큐에 enqueue되기 전에 주소를 덮어쓰면 잘못된 데이터가 +스냅샷된다 — `tl.send`를 먼저, 메모리 변경을 나중에 하는 게 권장. + +--- + +## 9. 다음 단계 + +- `mesh_2d` / `tree_binary` 같은 다른 topology 활용 +- recursive halving/doubling 등 더 빠른 알고리즘 +- `buffer_kind` (tcm/hbm/sram) / `backpressure` (poll/sleep) 모드별 latency 비교 +- `ccl_ring_allreduce_multicube.py`, `ccl_ring_allreduce_multisip.py`처럼 큰 + scale의 ring 검증 + +새 알고리즘이나 패턴을 추가했다면 PR로 기여해주세요. 
+ +--- + +## 참고 + +- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective 설계 +- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1) +- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution +- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor + +기존 알고리즘 예제: + +- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — 가장 단순한 send/recv +- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce +- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce +- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce diff --git a/src/kernbench/ccl/__init__.py b/src/kernbench/ccl/__init__.py new file mode 100644 index 0000000..aa60e46 --- /dev/null +++ b/src/kernbench/ccl/__init__.py @@ -0,0 +1,9 @@ +"""CCL (Collective Communication Library) framework for kernbench (ADR-0023). + +This package provides: + - topologies: builtin neighbor topology generators (ring/mesh/tree) + - helpers: utilities for algorithm authors (chunked, ring_step, ...) + - testing: mock CCL runtime for fast unit tests of algorithm kernels + +See docs/adr/ADR-0023-ipcq-pe-collective.md and docs/ccl-author-guide.md. +""" diff --git a/src/kernbench/ccl/algorithms/__init__.py b/src/kernbench/ccl/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kernbench/ccl/algorithms/hello_send.py b/src/kernbench/ccl/algorithms/hello_send.py new file mode 100644 index 0000000..9275be1 --- /dev/null +++ b/src/kernbench/ccl/algorithms/hello_send.py @@ -0,0 +1,29 @@ +"""Hello-world CCL kernel for the docs/ccl-author-guide.md walkthrough. 
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Build the positional kernel arguments for the ahbm backend.

    Args:
        world_size: number of participating ranks; accepted but not used
            by this kernel.
        n_elem: number of f16 elements in each rank's tile.

    Returns:
        The one-element tuple ``(n_elem,)`` that follows ``t_ptr`` in the
        ``kernel`` call.
    """
    return (n_elem,)


def kernel(t_ptr, n_elem, tl):
    """Rotate tiles by one hop: send our tile to E, keep what arrives from W.

    Args:
        t_ptr: base address shared by every rank; a rank's slice starts at
            ``t_ptr + rank * n_elem * 2``.
        n_elem: number of f16 elements per tile.
        tl: kernel context providing ``program_id``, ``num_programs``,
            ``load``, ``send``, ``recv`` and ``store``.
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_width = tl.num_programs(axis=0)

    tile_shape = (n_elem,)
    # Global rank = cube * PEs-per-cube + local PE; f16 is 2 bytes/element.
    slice_addr = t_ptr + (cube * cube_width + pe_in_cube) * (n_elem * 2)

    # Outbound: our own tile goes to the E neighbor.
    outgoing = tl.load(slice_addr, shape=tile_shape, dtype="f16")
    tl.send(dir="E", src=outgoing)

    # Inbound: the W neighbor's tile overwrites our slice.
    incoming = tl.recv(dir="W", shape=tile_shape, dtype="f16")
    tl.store(slice_addr, incoming)
def kernel(t_ptr, n_elem, side, tl):
    """All-reduce over a ``side`` x ``side`` mesh using two ring passes.

    The first pass circulates tiles along E/W, the second along S/N. In
    each pass the tile forwarded on the next hop is the one just received
    (not the running sum), so a tile is added at most once per rank.

    Args:
        t_ptr: base address shared by every rank; a rank's slice starts at
            ``t_ptr + rank * n_elem * 2``.
        n_elem: number of f16 elements per tile.
        side: mesh side length; each pass performs ``side - 1`` hops.
        tl: kernel context providing ``program_id``, ``num_programs``,
            ``load``, ``send``, ``recv`` and ``store``.
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_width = tl.num_programs(axis=0)

    tile_shape = (n_elem,)
    # Global rank = cube * PEs-per-cube + local PE; f16 is 2 bytes/element.
    slice_addr = t_ptr + (cube * cube_width + pe_in_cube) * (n_elem * 2)

    total = tl.load(slice_addr, shape=tile_shape, dtype="f16")

    # Row pass first, then column pass. Each pass is seeded with the running
    # sum as it stands at that point (no store/reload in between), so the
    # column pass circulates the row totals.
    for out_dir, in_dir in (("E", "W"), ("S", "N")):
        outgoing = total
        for _hop in range(side - 1):
            tl.send(dir=out_dir, src=outgoing)
            arrived = tl.recv(dir=in_dir, shape=tile_shape, dtype="f16")
            total = total + arrived
            outgoing = arrived

    tl.store(slice_addr, total)
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Build the positional kernel arguments for the ahbm backend.

    Returns:
        ``(n_elem, world_size)`` — the arguments ``kernel`` takes after the
        tensor pointer, in that order.
    """
    return (n_elem, world_size)


def kernel(t_ptr, n_elem, world_size, tl):
    """Ring all-reduce: after ``world_size - 1`` hops every rank holds the sum.

    Each round sends one tile to the E neighbor, receives one from W, adds
    the arrival to the running sum and forwards that same arrival on the
    next round. Forwarding the arrival rather than the running sum means
    every rank's tile visits every other rank exactly once.

    Args:
        t_ptr: base address shared by every rank; a rank's slice starts at
            ``t_ptr + rank * n_elem * 2``.
        n_elem: number of f16 elements per tile.
        world_size: number of participating ranks, supplied by the host.
        tl: kernel context providing ``program_id``, ``num_programs``,
            ``load``, ``send``, ``recv`` and ``store``.
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_width = tl.num_programs(axis=0)

    tile_shape = (n_elem,)
    # Global rank = cube * PEs-per-cube + local PE; f16 is 2 bytes/element.
    slice_addr = t_ptr + (cube * cube_width + pe_in_cube) * (n_elem * 2)

    total = tl.load(slice_addr, shape=tile_shape, dtype="f16")
    outgoing = total  # the first hop sends our own tile

    for _round in range(world_size - 1):
        tl.send(dir="E", src=outgoing)
        arrived = tl.recv(dir="W", shape=tile_shape, dtype="f16")
        # Handle addition; the sum becomes a new handle, the arrival is
        # left untouched so it can be forwarded as-is.
        total = total + arrived
        outgoing = arrived

    # Write the accumulated result back over this rank's slice.
    tl.store(slice_addr, total)
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Build the positional kernel arguments for the ahbm backend.

    Returns:
        ``(n_elem, world_size)`` — the arguments ``kernel`` takes after the
        tensor pointer, in that order.
    """
    return (n_elem, world_size)


def kernel(t_ptr, n_elem, world_size, tl):
    """Binary-tree all-reduce: reduce towards rank 0, then broadcast back.

    Rank ``r`` has children ``2r + 1`` and ``2r + 2`` when those indices are
    below ``world_size``; every rank other than 0 has a parent.

    Args:
        t_ptr: base address shared by every rank; a rank's slice starts at
            ``t_ptr + rank * n_elem * 2``.
        n_elem: number of f16 elements per tile.
        world_size: number of participating ranks, supplied by the host.
        tl: kernel context providing ``program_id``, ``num_programs``,
            ``load``, ``send``, ``recv`` and ``store``.
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_width = tl.num_programs(axis=0)
    rank = cube * cube_width + pe_in_cube

    tile_shape = (n_elem,)
    slice_addr = t_ptr + rank * (n_elem * 2)  # f16 is 2 bytes/element
    total = tl.load(slice_addr, shape=tile_shape, dtype="f16")

    # Directions of the children that exist for this rank, left before right.
    child_dirs = [
        name
        for name, child_rank in (
            ("child_left", 2 * rank + 1),
            ("child_right", 2 * rank + 2),
        )
        if child_rank < world_size
    ]

    # Reduce up: fold in each child's partial sum.
    for name in child_dirs:
        total = total + tl.recv(dir=name, shape=tile_shape, dtype="f16")

    if rank > 0:
        # Hand the partial sum to the parent, then replace it with the
        # value the parent sends back down.
        tl.send(dir="parent", src=total)
        total = tl.recv(dir="parent", shape=tile_shape, dtype="f16")

    # Broadcast down: pass the final value on to the children.
    for name in child_dirs:
        tl.send(dir=name, src=total)

    tl.store(slice_addr, total)
class IpcqDeadlock(RuntimeError):
    """Signals that a CCL request is still pending but the simulation
    cannot make any further progress (D14 F3)."""


# ── Trace toggle ─────────────────────────────────────────────────────


# Module-wide switch consulted by every log_* function below.
_TRACE_ENABLED: bool = False


def reload_trace_setting() -> None:
    """Refresh the trace switch from the ``KERNBENCH_CCL_TRACE`` env var.

    Tracing is on when the variable, stripped and lower-cased, is one of
    ``1``, ``true``, ``yes`` or ``on``; any other value (or an unset
    variable) turns it off.
    """
    global _TRACE_ENABLED
    raw = os.getenv("KERNBENCH_CCL_TRACE", "")
    normalized = raw.strip().lower()
    _TRACE_ENABLED = normalized in ("1", "true", "yes", "on")


def trace_enabled() -> bool:
    """Return whether CCL tracing is currently switched on."""
    return _TRACE_ENABLED


# Pick up the environment once when the module is imported.
reload_trace_setting()


# ── Trace event functions ────────────────────────────────────────────
# Each function prints one flushed line to stdout, and only formats the
# message when tracing is enabled.


def log_send(
    t_ns: float,
    sender: str,
    direction: str,
    nbytes: int,
    sender_seq: int,
) -> None:
    """Trace one send event (no-op while tracing is off)."""
    if _TRACE_ENABLED:
        print(
            f"[ccl t={t_ns:.1f} send] {sender} dir={direction} nbytes={nbytes} seq={sender_seq}",
            flush=True,
        )


def log_recv(
    t_ns: float,
    receiver: str,
    direction: str,
    nbytes: int,
) -> None:
    """Trace one recv event (no-op while tracing is off)."""
    if _TRACE_ENABLED:
        print(
            f"[ccl t={t_ns:.1f} recv] {receiver} dir={direction} nbytes={nbytes}",
            flush=True,
        )


def log_credit_return(
    t_ns: float,
    sender: str,
    direction: str,
    consumer_seq: int,
) -> None:
    """Trace one credit-return event (no-op while tracing is off)."""
    if _TRACE_ENABLED:
        print(
            f"[ccl t={t_ns:.1f} credit] {sender} dir={direction} seq={consumer_seq}",
            flush=True,
        )
# Bytes per element for every dtype spelling the helpers accept.
_DTYPE_BYTES = {
    # 16-bit floats
    "f16": 2,
    "fp16": 2,
    "float16": 2,
    "bf16": 2,
    # 32-bit floats
    "f32": 4,
    "fp32": 4,
    "float32": 4,
    # integers
    "i8": 1,
    "int8": 1,
    "i16": 2,
    "int16": 2,
    "i32": 4,
    "int32": 4,
}


def _itemsize(dtype: str) -> int:
    """Return the size in bytes of one element of ``dtype``.

    Raises:
        ValueError: if ``dtype`` is not a key of ``_DTYPE_BYTES``.
    """
    size = _DTYPE_BYTES.get(dtype)
    if size is None:
        raise ValueError(f"Unsupported dtype: {dtype}")
    return size
def tree_step(rank: int, world_size: int) -> dict[str, Any]:
    """Describe ``rank``'s position in a binary tree rooted at rank 0.

    The children of rank ``r`` are ``2r + 1`` and ``2r + 2``; a child is
    listed only when its index is below ``world_size``. The parent of a
    rank ``r > 0`` is ``(r - 1) // 2``; rank 0 has no parent.

    Returns:
        ``{"parent": int|None, "children": list[int]}``
    """
    first_child = 2 * rank + 1
    candidates = (first_child, first_child + 1)
    children: list[int] = [c for c in candidates if c < world_size]
    parent = None if rank <= 0 else (rank - 1) // 2
    return {"parent": parent, "children": children}
def load_ccl_config(path: str | Path | None = None) -> dict:
    """Load ccl.yaml and check that its required top-level sections exist.

    Args:
        path: explicit location of the config file. When ``None`` the file
            ``ccl.yaml`` is looked up first in the current working
            directory and then three directory levels above this module
            (the project root for the ``src/kernbench/ccl/`` layout).

    Returns:
        The parsed top-level YAML mapping.

    Raises:
        FileNotFoundError: ``path`` is ``None`` and neither candidate exists.
        ValueError: the document is not a mapping (e.g. an empty file), or
            it lacks the ``defaults`` or ``algorithms`` section.
    """
    if path is None:
        candidates = (
            Path.cwd() / "ccl.yaml",
            Path(__file__).resolve().parents[3] / "ccl.yaml",
        )
        path = next((p for p in candidates if p.exists()), None)
        if path is None:
            raise FileNotFoundError(
                "ccl.yaml not found. Place it at project root or cwd."
            )
    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # safe_load returns None for an empty document and a list/scalar for
    # non-mapping documents; without this guard the membership tests below
    # would raise TypeError (or silently do substring checks on a string).
    if not isinstance(cfg, dict):
        raise ValueError(
            f"ccl.yaml must contain a mapping at top level, got {type(cfg).__name__}"
        )
    for section in ("defaults", "algorithms"):
        if section not in cfg:
            raise ValueError(f"ccl.yaml missing '{section}' section")
    return cfg
def linear_rank_to_pe(rank: int, spec: dict) -> tuple[int, int, int]:
    """Map a rank to (sip, cube, pe) using linear topology order.

    Ranks are laid out PE-major within a cube, cube-major within a SIP:
    ``rank = sip * pes_per_sip + cube * pes_per_cube + pe``.

    Args:
        rank: zero-based rank to translate.
        spec: topology spec; reads ``system.sips.count``,
            ``sip.cube_mesh.{w,h}``, ``cube.pe_layout.pe_per_corner`` and
            the length of ``cube.pe_layout.corners``.

    Returns:
        ``(sip, cube, pe)`` with ``cube`` SIP-local and ``pe`` cube-local.

    Raises:
        ValueError: if ``rank`` is negative or not below the total PE count.
    """
    # Coerce with int() like the other spec readers in this package do, so
    # numeric strings coming from YAML behave the same everywhere.
    sips = int(spec["system"]["sips"]["count"])
    cube_mesh = spec["sip"]["cube_mesh"]
    cubes_per_sip = int(cube_mesh["w"]) * int(cube_mesh["h"])
    pe_layout = spec["cube"]["pe_layout"]
    pes_per_cube = int(pe_layout["pe_per_corner"]) * len(pe_layout["corners"])

    pes_per_sip = cubes_per_sip * pes_per_cube
    total = sips * pes_per_sip
    # A negative rank would floor-divide to sip == -1 and silently yield a
    # bogus address, so reject it up front.
    if rank < 0:
        raise ValueError(f"rank {rank} must be non-negative")
    if rank >= total:
        raise ValueError(f"rank {rank} exceeds total PE count {total}")
    sip, rem = divmod(rank, pes_per_sip)
    cube, pe = divmod(rem, pes_per_cube)
    return sip, cube, pe
+ """ + if "world_size" in cfg: + world_size = int(cfg["world_size"]) + else: + # Topology-derived fallback (mirrors AhbmCCLBackend / RuntimeContext). + sips = int(spec.get("system", {}).get("sips", {}).get("count", 1)) + cm = spec.get("sip", {}).get("cube_mesh", {}) + cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1)) + pl = spec.get("cube", {}).get("pe_layout", {}) + corners = pl.get("corners", []) + pe_per_corner = int(pl.get("pe_per_corner", 1)) + pes_per_cube = pe_per_corner * max(len(corners), 1) + world_size = sips * cubes_per_sip * pes_per_cube + buffer_kind = cfg["buffer_kind"] + n_slots = int(cfg["n_slots"]) + slot_size = int(cfg["slot_size"]) + backpressure = cfg["backpressure"] + credit_size_bytes = int(cfg.get("ipcq_credit_size_bytes", 16)) + + # Step 1: rank → (sip, cube, pe) + if rank_to_pe is not None: + if len(rank_to_pe) != world_size: + raise ValueError( + f"rank_to_pe has {len(rank_to_pe)} entries but world_size={world_size}" + ) + rank_pe = list(rank_to_pe) + else: + rank_pe: list[tuple[int, int, int]] = [ + linear_rank_to_pe(r, spec) for r in range(world_size) + ] + pe_to_rank = {(s, c, p): r for r, (s, c, p) in enumerate(rank_pe)} + + # Step 2: resolve topology fn (with optional override) + topo_fn = resolve_topology(cfg["topology"], algo_module=algo_module) + + # Build per-rank neighbor map + neighbor_table: dict[int, dict[str, int]] = {} + for r in range(world_size): + neighbor_table[r] = topo_fn(r, world_size) + + # Step 3: pull the live engine reference for each PE_IPCQ + components = engine._components + pe_ipcq_id = lambda s, c, p: f"sip{s}.cube{c}.pe{p}.pe_ipcq" + + # Step 4: per-PE rx_base address and per-PE credit_inbox + direction_keys = sorted({d for nt in neighbor_table.values() for d in nt}) + direction_idx = {d: i for i, d in enumerate(direction_keys)} + bytes_per_direction = n_slots * slot_size + + def rx_base(s: int, c: int, p: int, d: str) -> int: + return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * 
bytes_per_direction + + # Wire bidirectional credit stores: backend creates the SimPy Stores + # by reading each rank's PE_IPCQ.credit_inbox property. + rank_to_credit_inbox: dict[int, simpy.Store] = {} + for r, (s, c, p) in enumerate(rank_pe): + comp = components[pe_ipcq_id(s, c, p)] + # Trigger lazy creation of credit_inbox if not yet started. + # PE_IPCQ.start() creates it; we ensure it exists. + if comp._credit_inbox is None: + comp._credit_inbox = simpy.Store(engine._env) + rank_to_credit_inbox[r] = comp.credit_inbox + + # Step 5: build IpcqInitMsg per rank and call _install_neighbors directly + plan: dict[str, Any] = { + "world_size": world_size, + "rank_to_pe": rank_pe, + "buffer_kind": buffer_kind, + "neighbor_table": neighbor_table, + } + + def reverse_direction(my_rank: int, peer_rank: int) -> str | None: + """Find which direction in peer's neighbor table points back to my_rank.""" + for d, target in neighbor_table[peer_rank].items(): + if target == my_rank: + return d + return None + + for r, (s, c, p) in enumerate(rank_pe): + my_pe_ipcq = components[pe_ipcq_id(s, c, p)] + nbrs = neighbor_table[r] + entries: list[IpcqInitEntry] = [] + for d, peer_rank in nbrs.items(): + if peer_rank is None: + continue + peer_s, peer_c, peer_p = rank_pe[peer_rank] + peer_dir = reverse_direction(r, peer_rank) + if peer_dir is None: + # Peer doesn't have a reverse entry — skip (asymmetric topology) + continue + peer_endpoint = IpcqEndpoint( + sip=peer_s, cube=peer_c, pe=peer_p, + buffer_kind=buffer_kind, + rx_base_pa=rx_base(peer_s, peer_c, peer_p, peer_dir), + rx_base_va=0, + n_slots=n_slots, slot_size=slot_size, + ) + entries.append(IpcqInitEntry( + direction=d, + peer=peer_endpoint, + my_rx_base_pa=rx_base(s, c, p, d), + my_rx_base_va=0, + n_slots=n_slots, slot_size=slot_size, + peer_credit_store=rank_to_credit_inbox[peer_rank], + )) + msg = IpcqInitMsg( + correlation_id="ccl_init", request_id=f"init_r{r}", + target_sips=(s,), target_cubes=(c,), target_pe=p, + 
class _MockRankState:
    """State owned by one simulated rank in the mock CCL runtime.

    Holds the rank's private memories (``_hbm`` / ``_tcm``, keyed by
    address), one inbound FIFO per neighbor direction, the output slot
    and the greenlet that will execute the kernel for this rank.
    """

    def __init__(
        self,
        rank: int,
        world_size: int,
        neighbors: dict[str, int],
        input_arr: np.ndarray,
    ) -> None:
        # Identity and topology (direction -> peer rank).
        self.rank = rank
        self.world_size = world_size
        self.neighbors = neighbors

        # The kernel is handed base pointer 0 and addresses its own tile at
        # ``t_ptr + rank * nbytes``, so the input copy is stored at exactly
        # that per-rank offset.
        tile_bytes = int(input_arr.nbytes)
        self.t_ptr = 0
        self._slice_addr = rank * tile_bytes

        # Private memories: addr -> ndarray. Nothing is shared across ranks.
        self._hbm: dict[int, np.ndarray] = {}
        self._tcm: dict[int, np.ndarray] = {}
        self._hbm[self._slice_addr] = input_arr.copy()

        # One independent, initially empty FIFO per inbound direction.
        self.recv_q: dict[str, deque[np.ndarray]] = {}
        for direction in neighbors:
            self.recv_q[direction] = deque()

        # Filled in when the kernel stores to the slice address.
        self.output: np.ndarray | None = None
        # Assigned later by the scheduler.
        self.g: greenlet | None = None
def _binary_math(self, op: str, a: TensorHandle, b: TensorHandle) -> TensorHandle:
    """Evaluate an elementwise binary op eagerly with numpy.

    Supported ``op`` names: ``add``, ``sub``, ``mul``, ``div``,
    ``maximum``, ``minimum``. When either operand carries no data the
    result handle carries ``data=None`` and ``op`` is not inspected.

    The returned handle copies ``shape``/``dtype`` from ``a``, uses
    address 0 in the ``"tcm"`` space, and reports ``nbytes`` as
    element-count x 2 (0 for an empty shape tuple).

    Raises:
        NotImplementedError: both operands have data and ``op`` is unknown.
    """
    lhs = None if a.data is None else np.asarray(a.data)
    rhs = None if b.data is None else np.asarray(b.data)

    kernels = {
        "add": lambda x, y: x + y,
        "sub": lambda x, y: x - y,
        "mul": lambda x, y: x * y,
        "div": lambda x, y: x / y,
        "maximum": np.maximum,
        "minimum": np.minimum,
    }

    if lhs is None or rhs is None:
        result = None
    else:
        fn = kernels.get(op)
        if fn is None:
            raise NotImplementedError(f"mock _binary_math: op {op!r} not implemented")
        result = fn(lhs, rhs)

    handle_id = self._next_id()
    out_nbytes = int(np.prod(a.shape)) * 2 if a.shape else 0
    return TensorHandle(
        id=handle_id,
        addr=0,
        shape=a.shape,
        dtype=a.dtype,
        nbytes=out_nbytes,
        data=result,
        space="tcm",
    )
def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
    """Evaluate an elementwise unary op eagerly with numpy.

    Supported ``op`` names: ``exp``, ``log``, ``sqrt``, ``abs``,
    ``sigmoid``, ``cos``, ``sin``. When ``x`` carries no data the result
    handle carries ``data=None`` and ``op`` is not inspected.

    The returned handle copies ``shape``/``dtype`` from ``x``, uses
    address 0 in the ``"tcm"`` space, and reports ``nbytes`` as
    element-count x 2 (0 for an empty shape tuple).

    Raises:
        NotImplementedError: ``x`` has data and ``op`` is unknown.
    """
    operand = None if x.data is None else np.asarray(x.data)

    kernels = {
        "exp": np.exp,
        "log": np.log,
        "sqrt": np.sqrt,
        "abs": np.abs,
        "sigmoid": lambda v: 1.0 / (1.0 + np.exp(-v)),
        "cos": np.cos,
        "sin": np.sin,
    }

    if operand is None:
        result = None
    else:
        fn = kernels.get(op)
        if fn is None:
            raise NotImplementedError(f"mock _unary_math: op {op!r} not implemented")
        result = fn(operand)

    handle_id = self._next_id()
    out_nbytes = int(np.prod(x.shape)) * 2 if x.shape else 0
    return TensorHandle(
        id=handle_id,
        addr=0,
        shape=x.shape,
        dtype=x.dtype,
        nbytes=out_nbytes,
        data=result,
        space="tcm",
    )
def send(
    self,
    dir: str,
    src: TensorHandle | None = None,
    *,
    src_addr: int | None = None,
    nbytes: int | None = None,
    shape: tuple[int, ...] | None = None,
    dtype: str = "f16",
    space: str = "tcm",
) -> None:
    """Deliver a copy of ``src``'s payload to the neighbor in direction ``dir``.

    The payload is ``src.data`` when present; otherwise it is looked up in
    this rank's memory at ``src.addr`` (``_hbm`` when ``src.space`` is
    ``"hbm"``, ``_tcm`` for anything else). The copy is appended to the
    peer's receive queue for whichever of the peer's directions points
    back at this rank, after which control is yielded to the scheduler.

    The keyword-only parameters (``src_addr``, ``nbytes``, ``shape``,
    ``dtype``, ``space``) are accepted but not read by this mock.

    Raises:
        IpcqInvalidDirection: ``dir`` is not one of this rank's neighbors.
        RuntimeError: ``src`` is None, no data is stored at the handle's
            address, or the peer has no direction pointing back here.
    """
    me = self._state
    if dir not in me.neighbors:
        raise IpcqInvalidDirection(
            f"mock tl.send: direction {dir!r} not in neighbors {list(me.neighbors)}"
        )
    if src is None:
        raise RuntimeError("mock tl.send: src is None")

    if src.data is not None:
        payload = np.asarray(src.data)
    else:
        # Handle without inline data: resolve it from local memory.
        memory = me._hbm if src.space == "hbm" else me._tcm
        stored = memory.get(src.addr)
        if stored is None:
            raise RuntimeError(
                f"mock tl.send: no data at {src.space}:0x{src.addr:x}"
            )
        payload = np.asarray(stored)

    peer_rank = me.neighbors[dir]
    peer = self._scheduler.states[peer_rank]
    # First direction in the peer's table whose target is this rank.
    back_dir = next(
        (d for d, target in peer.neighbors.items() if target == me.rank),
        None,
    )
    if back_dir is None:
        raise RuntimeError(
            f"mock tl.send: peer rank {peer_rank} has no reverse direction"
        )
    peer.recv_q[back_dir].append(payload.copy())
    # Let the scheduler run other ranks so the receiver can pick this up.
    self._scheduler.yield_()
def recv(
    self,
    dir: str | None = None,
    shape: tuple[int, ...] = (),
    dtype: str = "f16",
) -> TensorHandle:
    """Block until a message is available and return it as a handle.

    Args:
        dir: direction to receive from. ``None`` accepts a message from any
            neighbor direction, scanning them round-robin: each ``None``
            receive starts its scan just after the direction that served
            the previous one, so a busy direction cannot starve the others.
        shape: accepted for API parity; the returned handle takes its
            shape from the received array.
        dtype: dtype label recorded on the returned handle.

    Returns:
        The handle built by ``_make_handle`` for the dequeued array.

    Raises:
        IpcqInvalidDirection: ``dir`` is given but is not a neighbor.
    """
    neighbors = self._state.neighbors
    if dir is not None and dir not in neighbors:
        raise IpcqInvalidDirection(
            f"mock tl.recv: direction {dir!r} not in neighbors {list(neighbors)}"
        )
    queues = self._state.recv_q
    while True:
        if dir is not None:
            if queues[dir]:
                return self._make_handle(queues[dir].popleft(), dir, dtype)
        else:
            order = list(neighbors)
            # The cursor is created lazily so this method does not depend
            # on __init__ having declared it.
            start = getattr(self, "_rr_cursor", 0)
            for offset in range(len(order)):
                idx = (start + offset) % len(order)
                d = order[idx]
                if queues[d]:
                    # Next wildcard recv starts after the direction just served.
                    self._rr_cursor = (idx + 1) % len(order)
                    return self._make_handle(queues[d].popleft(), d, dtype)
        # Nothing available yet: let other ranks run, then re-check.
        self._scheduler.yield_()
def run(self, kernel_fn: Callable, kernel_args: tuple) -> list[np.ndarray]:
    """Run ``kernel_fn`` once per rank until every rank has finished.

    Each rank gets its own greenlet and its own ``_MockTL`` shim. Ranks
    are resumed round-robin; a rank gives up control whenever its shim
    calls ``yield_``.

    Args:
        kernel_fn: invoked as ``kernel_fn(state.t_ptr, *kernel_args, tl=tl)``.
        kernel_args: extra positional arguments placed after ``t_ptr``.

    Returns:
        Per-rank result: ``state.output`` when set, otherwise whatever is
        stored in the rank's ``_hbm`` at its slice address.

    Raises:
        RuntimeError: no rank finished for more than ``max_rounds``
            consecutive scheduling rounds (treated as a deadlock).
    """
    from kernbench.triton_emu.tl_context import TLContext

    self._parent = greenlet.getcurrent()

    # Per-rank tl shim, kept so the active context can be restored on resume.
    tls: dict[int, _MockTL] = {}

    def _spawn(rank_idx: int) -> greenlet:
        state = self.states[rank_idx]
        tl = _MockTL(state, self)
        tls[rank_idx] = tl

        def _entry():
            # Activate this rank's tl for TensorHandle operator overloads.
            TLContext._set_active(tl)  # type: ignore[attr-defined]
            try:
                kernel_fn(state.t_ptr, *kernel_args, tl=tl)
            finally:
                TLContext._set_active(None)  # type: ignore[attr-defined]

        return greenlet(_entry)

    for state in self.states:
        state.g = _spawn(state.rank)

    max_rounds = 10_000
    # Consecutive rounds in which no rank finished. This is reset on
    # progress so the limit matches the error message ("no progress for N
    # rounds") rather than capping the total length of a healthy run.
    idle_rounds = 0
    while True:
        alive = [s for s in self.states if s.g is not None and not s.g.dead]
        if not alive:
            break
        progressed = False
        for s in alive:
            # All rank greenlets share one TLContext "active" slot, so
            # point it at this rank's shim before resuming it.
            TLContext._set_active(tls[s.rank])  # type: ignore[attr-defined]
            s.g.switch()
            if s.g.dead:
                progressed = True
        TLContext._set_active(None)  # type: ignore[attr-defined]
        if progressed:
            idle_rounds = 0
            continue
        idle_rounds += 1
        if idle_rounds > max_rounds:
            raise RuntimeError(
                "mock CCL runtime: deadlock detected (no progress for "
                f"{max_rounds} rounds)"
            )

    return [
        s.output if s.output is not None else s._hbm.get(s._slice_addr)
        for s in self.states
    ]
def mesh_2d(rank: int, world_size: int) -> NeighborMap:
    """Square 2D mesh with wrap-around (torus) neighbors N/S/E/W.

    Layout: ``rank = row * side + col`` with ``side = sqrt(world_size)``.
    All four edges wrap, so every rank has exactly four entries (which
    may coincide for very small meshes).

    Args:
        rank: rank whose neighbors are requested.
        world_size: total number of ranks; must be a positive perfect square.

    Returns:
        ``{"N": ..., "S": ..., "W": ..., "E": ...}`` mapping to peer ranks.

    Raises:
        ValueError: if ``world_size`` is not a positive perfect square.
    """
    import math

    # Zero would divide by zero below and a negative value has no real
    # square root; report both with the same error as a non-square size.
    if world_size <= 0:
        raise ValueError(
            f"mesh_2d requires square world_size, got {world_size}"
        )
    # Exact integer square root: no float rounding for large sizes.
    side = math.isqrt(int(world_size))
    if side * side != world_size:
        raise ValueError(
            f"mesh_2d requires square world_size, got {world_size}"
        )
    row, col = divmod(rank, side)
    return {
        "N": ((row - 1) % side) * side + col,
        "S": ((row + 1) % side) * side + col,
        "W": row * side + (col - 1) % side,
        "E": row * side + (col + 1) % side,
    }
def resolve_topology(
    name: str, algo_module: Any | None = None,
) -> TopologyFn:
    """Look up a builtin topology and optionally layer a module hook on it.

    Args:
        name: key into the builtin topology table (``_BUILTIN``).
        algo_module: optional algorithm module. When it is truthy and
            exposes a callable ``neighbors(rank, world_size, neighbor_map)``
            attribute, the returned function first calls the builtin
            generator and then the hook. A hook result of ``None`` keeps
            the builtin map; any other result is returned in its place.

    Returns:
        A callable ``(rank, world_size) -> NeighborMap``: either the
        builtin generator itself or a wrapper applying the hook.

    Raises:
        ValueError: if ``name`` is not a known builtin.
    """
    if name not in _BUILTIN:
        raise ValueError(
            f"Unknown topology '{name}'. "
            f"Available builtins: {list(_BUILTIN)}"
        )
    builtin_fn = _BUILTIN[name]

    override_fn = None
    if algo_module:
        override_fn = getattr(algo_module, "neighbors", None)
    # None is not callable, so this also covers "no hook defined".
    if not callable(override_fn):
        return builtin_fn

    def _wrapped(rank: int, world_size: int) -> NeighborMap:
        generated = builtin_fn(rank, world_size)
        replacement = override_fn(rank, world_size, generated)
        return generated if replacement is None else replacement

    return _wrapped
+ + Sender PE_IPCQ uses this to compute the destination PA for its DMA + write into the peer's rx ring buffer slot: + + slot_idx = sender.my_head % peer.n_slots + dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size + """ + + sip: int # destination SIP + cube: int # destination cube + pe: int # destination PE (cube-local index) + buffer_kind: str # "tcm" | "hbm" | "sram" + rx_base_pa: int # peer rx_buffer base PA (PhysAddr.encode()) + rx_base_va: int # peer rx_buffer base VA (optional, MMU) + n_slots: int # peer ring depth (wrap-around modulo) + slot_size: int # peer slot size (offset multiplier) + + +# ── D12: IpcqInitEntry (used by IpcqInitMsg in kernel.py) ──────────── + + +@dataclass(frozen=True) +class IpcqInitEntry: + """One direction's neighbor entry that backend installs into a PE_IPCQ + via IpcqInitMsg (kernbench.runtime_api.kernel.IpcqInitMsg, D12). + """ + + direction: str # "N" | "S" | "E" | "W" + peer: IpcqEndpoint # see D2.5 + my_rx_base_pa: int # this PE's own rx_buffer base + my_rx_base_va: int # this PE's own rx_buffer base VA (optional) + n_slots: int # this PE's ring depth + slot_size: int # this PE's slot size + # Credit fast path channel (D9). + # Contract: must be a simpy.Store instance dedicated to receiving + # IpcqCreditMetadata objects only. Backend wires it once at init time + # and the receiving PE_IPCQ owns its consumer side; the sender (peer's + # PE_IPCQ) puts IpcqCreditMetadata directly into this store via + # _delayed_credit_send. Do not put any other object type. + peer_credit_store: "simpy.Store" + + +# ── D12: IpcqSendCmd (PE_CPU → PE_IPCQ) ────────────────────────────── + + +@dataclass(frozen=True) +class IpcqSendCmd: + """tl.send command issued by the kernel to PE_IPCQ.""" + + direction: str # "N" | "S" | "E" | "W" + src_addr: int # source data address (TCM/HBM/SRAM) + src_space: str # "tcm" | "hbm" | "sram" + nbytes: int + shape: tuple[int, ...] 
# data shape (op_log + MemoryStore use) + dtype: str + handle_id: str # completion tracking + data_op: bool = True # ADR-0020 op_log recording flag + + +# ── D12: IpcqRecvCmd (PE_CPU → PE_IPCQ) ────────────────────────────── + + +@dataclass(frozen=True) +class IpcqRecvCmd: + """tl.recv command issued by the kernel to PE_IPCQ. + + Two modes (recv_mode): + "return_slot" — return slot address as-is (default, zero-copy). + Kernel uses the slot memory directly. + "copy_to_dst" — copy slot data to dst_addr, then return. + """ + + direction: str | None # None → round-robin (weak fairness, D4) + shape: tuple[int, ...] + dtype: str + handle_id: str + recv_mode: str = "return_slot" + dst_addr: int = 0 # used only when recv_mode == "copy_to_dst" + dst_space: str = "" # used only when recv_mode == "copy_to_dst" + blocking: bool = True + data_op: bool = True + + +# ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ─────────────────── + + +@dataclass +class IpcqDmaToken: + """Token sent from PE_IPCQ to PE_DMA (vc_comm channel) carrying both + the data move request and the piggyback metadata (ADR-0023 D9). + + Receiving PE_DMA processes this atomically (I6 MUST): + 1. MemoryStore.write(dst_endpoint.buffer_kind, dst_addr, data) + 2. Forward IpcqMetaArrival(token=self) to peer PE_IPCQ + No yield is allowed between the two steps. + + The ``data`` field is a snapshot taken by the sender's PE_DMA at the + moment the send is issued. This preserves "in-flight data" semantics: + if the sender mutates its source memory after issuing the send but + before arrival, the receiver still gets the snapshot. The snapshot is + None for control-only tokens (e.g. credit-only updates). 
+ """ + + # ── Data movement (single-hop DMA write) ── + src_addr: int + src_space: str + dst_addr: int # already-computed peer rx slot PA + dst_endpoint: IpcqEndpoint # routing target (sip/cube/pe) + buffer_kind + nbytes: int + handle_id: str # completion notify back to sender PE_IPCQ + # Optional shape/dtype carried for op_log + MemoryStore convenience. + shape: tuple[int, ...] = () + dtype: str = "f16" + # In-flight data snapshot (sender PE_DMA captures this at send time). + data: Any = None + + # ── Piggyback metadata (D9) ── + sender_seq: int = 0 # monotonic; receiver updates peer_head_cache + src_sip: int = 0 + src_cube: int = 0 + src_pe: int = 0 + src_direction: str = "E" # sender-side direction; receiver maps to its own + + data_op: bool = True + + +# ── D12: IpcqMetaArrival (PE_DMA → PE_IPCQ, intra-PE wire) ────────── + + +@dataclass +class IpcqMetaArrival: + """Posted by receiving PE_DMA into the destination PE's PE_IPCQ inbox + in the same SimPy step as the MemoryStore.write (D9, I6 MUST). + + The receiver PE_IPCQ uses ``token.sender_seq`` to update its + peer_head_cache for the corresponding direction. + """ + + token: IpcqDmaToken + + +# ── D12: IpcqCreditMetadata (PE_IPCQ → peer PE_IPCQ, fast path) ───── + + +@dataclass(frozen=True) +class IpcqCreditMetadata: + """Credit return — recv-side → send-side fast path (D9). + + Sent by ``PeIpcqComponent._delayed_credit_send`` after a + bottleneck-BW based latency, putting the metadata directly into + the peer's pre-wired credit store (no fabric routing). + """ + + consumer_seq: int # my_tail at recv side (new tail value) + src_sip: int # which peer is sending the credit + src_cube: int + src_pe: int + src_direction: str # sender-side direction (peer maps to its own) + + +# ── Request wrapper (PE_CPU → PE_IPCQ) ─────────────────────────────── + + +@dataclass +class IpcqRequest: + """Wrapper carrying an IpcqSendCmd or IpcqRecvCmd plus a SimPy completion + event. 
Posted by PE_CPU into PE_IPCQ's inbox; PE_IPCQ calls + ``done.succeed()`` when the request is fully processed. + + For recv requests, the result (slot address, direction, dtype, shape) + is written into ``result_data`` so the caller can read it after wait. + """ + + command: "IpcqSendCmd | IpcqRecvCmd" + done: "simpy.Event" + result_data: dict[str, Any] = field(default_factory=dict) + + +# ── RecvFuture (kernel ↔ runner handshake for tl.recv_async / tl.wait) ─ + + +@dataclass +class RecvFuture: + """Opaque future returned by ``tl.recv_async``. + + The KernelRunner attaches a SimPy event and the IpcqRequest in the + background; ``tl.wait(future)`` switches back to the runner which + yields on the event and resolves the result into a TensorHandle. + """ + + cmd: "IpcqRecvCmd" + request: Any = None # IpcqRequest (set by runner) + event: Any = None # simpy.Event (set by runner) + resolved: bool = False + result: Any = None # cached TensorHandle after wait() diff --git a/src/kernbench/common/pe_commands.py b/src/kernbench/common/pe_commands.py index ed0dc8b..e70c367 100644 --- a/src/kernbench/common/pe_commands.py +++ b/src/kernbench/common/pe_commands.py @@ -33,6 +33,7 @@ class TensorHandle: dtype: str nbytes: int # total byte size data: object = None # reserved for validate mode + space: str = "tcm" # MemoryStore space ("tcm" | "hbm" | "sram") @dataclass(frozen=True) diff --git a/src/kernbench/components/builtin/pe_cpu.py b/src/kernbench/components/builtin/pe_cpu.py index 455b7de..d5f615a 100644 --- a/src/kernbench/components/builtin/pe_cpu.py +++ b/src/kernbench/components/builtin/pe_cpu.py @@ -42,9 +42,30 @@ class PeCpuComponent(ComponentBase): self._cube_idx = int(parts[1].replace("cube", "")) except (IndexError, ValueError): self._cube_idx = 0 - # num_cubes from spec (for tl.program_id(axis=1)) + # num_cubes from spec (for tl.program_id(axis=1) — ADR-0022) spec = ctx.spec if ctx else {} - self._num_cubes = spec.get("system", {}).get("sips", 
{}).get("cubes_per_sip", 1) + cube_mesh = spec.get("sip", {}).get("cube_mesh", {}) + if cube_mesh: + self._num_cubes = int(cube_mesh.get("w", 1)) * int(cube_mesh.get("h", 1)) + else: + self._num_cubes = ( + spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 1) + ) + # PE-local scratch for kernel math output handles (ADR-0020 D3 + # extension; reserved portion of TCM addressed via a synthetic + # MemoryStore key, not the real PA encoder). + pe_template = spec.get("cube", {}).get("pe_template", {}) + tcm_attrs = pe_template.get("components", {}).get("pe_tcm", {}).get("attrs", {}) + scratch_mb = float(tcm_attrs.get("kernel_scratch_mb", 1)) + self._tl_scratch_size = int(scratch_mb * (1 << 20)) + # PE-unique base address — high bit pattern to avoid collision with + # IPCQ ring buffers (which use bit 60). + self._tl_scratch_base = ( + (1 << 61) + | (self._sip_idx << 40) + | (self._cube_idx << 32) + | (self._pe_idx << 24) + ) def _find_shard(self, shards: tuple) -> Any: """Find shard matching this PE's (sip, cube, pe). 
def _handle_ipcq_outbound(self, env: simpy.Environment, token: Any) -> Generator:
    """Forward IpcqDmaToken from local PE_IPCQ through the fabric to peer
    PE_DMA. ADR-0023 D8 (vc_comm channel).

    Steps, in order:
      1. Snapshot the source data into ``token.data`` (only if the token
         does not already carry data and a memory store is available).
      2. Record the copy in the op log, if an op logger is attached.
      3. Route to ``sip{S}.cube{C}.pe{P}.pe_dma`` of the destination
         endpoint and put the wrapped Transaction on the first hop.

    The token is dropped without any signal to the caller when there is
    no ctx, when route lookup raises, when the path has fewer than two
    nodes, or when the first hop is not one of this component's out_ports.

    NOTE(review): a dropped token is never delivered, so the peer never
    sees this sequence number — confirm whether that can stall a blocked
    recv on the peer and whether these cases should be reported.
    """
    if self.ctx is None:
        return  # nothing to do
    peer = token.dst_endpoint
    peer_pe_dma = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}.pe_dma"

    # Snapshot the source data at send time (D9 in-flight semantics).
    # Without this, the receiver could read stale or future data if the
    # sender mutates src_addr between send issue and DMA arrival.
    store = getattr(self.ctx, "memory_store", None)
    if store is not None and token.data is None:
        try:
            snap = store.read(
                token.src_space, token.src_addr,
                shape=token.shape, dtype=token.dtype,
            )
            # Copy so later mutations to src_addr don't affect the snapshot.
            token.data = snap.copy() if hasattr(snap, "copy") else snap
        except Exception:
            # Best effort: with no snapshot the inbound side falls back to
            # reading src_addr itself when the token arrives.
            token.data = None

    # Record the IPCQ copy in op_log at OUTBOUND time. ADR-0020 D6:
    # Phase 2 replays the copy in t_start order; using outbound time
    # (rather than inbound) ensures the copy executes before any later
    # local op at the sender that might overwrite token.src_addr (e.g.
    # a tl.store after a recv).
    if self._op_logger is not None:
        try:
            self._op_logger.record_copy(
                t_start=float(env.now), t_end=float(env.now),
                component_id=self.node.id,
                src_space=token.src_space, src_addr=token.src_addr,
                dst_space=peer.buffer_kind,
                dst_addr=token.dst_addr,
                shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
            )
        except Exception:
            # Logging failures must not block the data path.
            pass

    try:
        path = self.ctx.router.find_path(self._pe_prefix, peer_pe_dma)
    except Exception:
        # No route: the token is dropped (see NOTE(review) in the docstring).
        return
    drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)

    # sub_done is handed to the Transaction but never awaited here.
    sub_done = env.event()
    sub_txn = Transaction(
        request=token, path=path, step=0,
        nbytes=token.nbytes, done=sub_done, drain_ns=drain_ns,
    )
    if len(path) > 1:
        next_hop = path[1]
        if next_hop in self.out_ports:
            yield self.out_ports[next_hop].put(sub_txn.advance())
        else:
            return
    # Note: don't wait on sub_done here — fire-and-forget for vc_comm.
def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
    """At destination PE_DMA: atomically write data and forward metadata.

    I6 (MUST): no SimPy yield between MemoryStore.write and the
    IpcqMetaArrival put into PE_IPCQ.

    ``txn.request`` is the IpcqDmaToken. After the write and the metadata
    put, ``txn.done`` is succeeded unless it has already been triggered.

    NOTE(review): a failing read/write is swallowed and the arrival is
    still posted, so the local PE_IPCQ may be told about a slot whose
    data was never written — confirm this best-effort behaviour is wanted.
    """
    from kernbench.common.ipcq_types import IpcqMetaArrival

    token = txn.request

    # ── ATOMIC: do not introduce yield between these two operations ──
    # 1. Move data via MemoryStore (single-hop DMA write).
    #    Prefer the in-flight snapshot stashed by the sender PE_DMA;
    #    fall back to a fresh read of src_addr if no snapshot is present
    #    (e.g. control-only token).
    store = getattr(self.ctx, "memory_store", None) if self.ctx else None
    if store is not None:
        try:
            data = token.data
            if data is None:
                data = store.read(
                    token.src_space, token.src_addr,
                    shape=token.shape, dtype=token.dtype,
                )
            store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)
        except Exception:
            pass

    # 2. Forward IpcqMetaArrival to local PE_IPCQ
    #    put() is issued in the same step as the write above; the yield
    #    only waits for that put event. If no pe_ipcq port is wired, the
    #    metadata is not forwarded at all.
    ipcq_id = f"{self._pe_prefix}.pe_ipcq"
    if ipcq_id in self.out_ports:
        yield self.out_ports[ipcq_id].put(IpcqMetaArrival(token=token))
    # ─────────────────────────────────────────────────────────────────

    if not txn.done.triggered:
        txn.done.succeed()
+ +Responsibilities: + - Hold per-direction queue pair state (my_head, my_tail, + peer_head_cache, peer_tail_cache, ring buffer addresses) + - Process IpcqInitMsg from backend to install neighbor table + - Handle IpcqRequest(IpcqSendCmd) from PE_CPU: + compute peer slot address, check backpressure, forward + IpcqDmaToken to PE_DMA (vc_comm) + - Handle IpcqRequest(IpcqRecvCmd) from PE_CPU: + wait for data arrival, return slot address (or copy to dst), + send fast-path credit return + - Handle IpcqMetaArrival from PE_DMA: update peer_head_cache, wake recv + - Handle IpcqCreditMetadata via own credit_inbox: update peer_tail_cache, + wake send + +PE_IPCQ does NOT move data — it forwards IpcqDmaToken to PE_DMA which +performs the actual fabric DMA. + +Credit return uses a fast path: PE_IPCQ creates a SimPy process with a +bottleneck-BW based latency, then puts IpcqCreditMetadata directly into +the peer's pre-wired credit_store. +""" +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.common.ipcq_types import ( + IpcqCreditMetadata, + IpcqDmaToken, + IpcqInvalidDirection, + IpcqMetaArrival, + IpcqRecvCmd, + IpcqRequest, + IpcqSendCmd, +) +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.runtime_api.kernel import IpcqInitMsg + from kernbench.topology.types import Node + + +_DIR_ORDER: tuple[str, ...] = ("N", "S", "E", "W", "parent", "child_left", "child_right") + + +class PeIpcqComponent(ComponentBase): + """PE_IPCQ: ring buffer pointer + neighbor management for CCL. + + Owned by one PE; talks to PE_DMA via out_ports[] and + receives credit return metadata via the public ``credit_inbox`` + SimPy Store (wired by backend at IpcqInitMsg installation time). 
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
    """Create an idle PE_IPCQ.

    Parses this PE's (sip, cube, pe) coordinates out of ``node.id`` and
    sets up empty queue-pair, waiter and round-robin state. Neighbor
    state is populated later by ``_install_neighbors``; the credit inbox
    is created in ``start()`` because it needs the SimPy environment.
    """
    super().__init__(node, ctx)

    # D14 F2 strict shape/dtype checking; disabled unless the node opts in.
    self._strict: bool = bool(node.attrs.get("strict_validation", False))
    # Per-direction list of arrived tokens, consumed front-first by recv.
    self._arrived_tokens: dict[str, list] = {}

    # "sip0.cube0.pe0.pe_ipcq" -> "sip0.cube0.pe0"
    self._pe_prefix: str = node.id.rsplit(".", 1)[0]
    segments = self._pe_prefix.split(".")
    # Each coordinate independently falls back to 0 when its segment is
    # missing or is not "<tag><int>".
    for position, tag, attr_name in (
        (0, "sip", "_self_sip"),
        (1, "cube", "_self_cube"),
        (2, "pe", "_self_pe"),
    ):
        try:
            index = int(segments[position].replace(tag, ""))
        except (IndexError, ValueError):
            index = 0
        setattr(self, attr_name, index)

    self._dma_node_id = f"{self._pe_prefix}.pe_dma"

    # Installed neighbor state, keyed by direction.
    self._queue_pairs: dict[str, dict[str, Any]] = {}
    self._installed = False
    self._buffer_kind: str = "tcm"
    self._backpressure_mode: str = "sleep"
    self._credit_size_bytes: int = 16

    # Events parked by blocked recv (per direction / any direction) and
    # by sends blocked on a full ring (per direction).
    self._recv_waiters: dict[str, list[simpy.Event]] = {}
    self._any_recv_waiters: list[simpy.Event] = []
    self._send_waiters: dict[str, list[simpy.Event]] = {}

    # Round-robin scan order and cursor over the installed directions.
    self._rr_dirs: list[str] = []
    self._rr_cursor: int = 0

    # Created lazily in start() once an environment exists.
    self._credit_inbox: simpy.Store | None = None
Used by D9 fast path.""" + assert self._credit_inbox is not None, "PE_IPCQ not started yet" + return self._credit_inbox + + @property + def queue_pairs(self) -> dict[str, dict[str, Any]]: + """Test/debug accessor.""" + return self._queue_pairs + + # ── Lifecycle ── + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + yield env.timeout(0) + + def start(self, env: simpy.Environment) -> None: + # Create credit_inbox even if there are no in_ports yet + if self._credit_inbox is None: + self._credit_inbox = simpy.Store(env) + # If no in_ports were wired (e.g. unit test), still spin up workers + if not self.in_ports: + self._inbox = simpy.Store(env) + super().start(env) + env.process(self._credit_worker(env)) + + # ── Worker (override of ComponentBase._worker) ── + + def _worker(self, env: simpy.Environment) -> Generator: + from kernbench.runtime_api.kernel import IpcqInitMsg + + while True: + msg: Any = yield self._inbox.get() + + # IpcqInitMsg may arrive wrapped in a transaction (with .request) + # or directly. 
def _handle_send(
    self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqSendCmd,
) -> Generator:
    """Process one ``tl.send``: claim a peer ring slot and hand a token to PE_DMA.

    Blocks (by parking an event in ``_send_waiters``) while the peer ring
    is full, i.e. while ``my_head - peer_tail_cache >= peer.n_slots``.
    Once a slot is free, builds an IpcqDmaToken addressed to
    ``peer.rx_base_pa + slot_idx * peer.slot_size``, puts it on the PE_DMA
    out port and succeeds ``req.done``.

    Raises:
        IpcqInvalidDirection: if ``cmd.direction`` has no installed
            queue pair on this PE.
    """
    if cmd.direction not in self._queue_pairs:
        raise IpcqInvalidDirection(
            f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
        )
    qp = self._queue_pairs[cmd.direction]
    peer = qp["peer"]

    # Backpressure: wait while ring full
    while (qp["my_head"] - qp["peer_tail_cache"]) >= peer.n_slots:
        wait_event = env.event()
        self._send_waiters[cmd.direction].append(wait_event)
        yield wait_event

    # Resolve the port first so a missing port fails before any state
    # is modified (same as before the fix).
    dma_port = self.out_ports[self._dma_node_id]

    # Reserve the sequence number BEFORE yielding. Each request runs in
    # its own SimPy process, so if my_head were bumped only after the
    # put below, a second send on the same direction could run during
    # that yield, pass the backpressure check against the stale head,
    # and reuse the same slot and sender_seq.
    seq = qp["my_head"]
    qp["my_head"] = seq + 1

    # Compute peer slot address
    slot_idx = seq % peer.n_slots
    dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size

    # NOTE(review): cmd.data_op is not copied onto the token, so the
    # token keeps its default data_op=True — confirm whether the flag
    # should be propagated.
    token = IpcqDmaToken(
        src_addr=cmd.src_addr,
        src_space=cmd.src_space,
        dst_addr=dst_pa,
        dst_endpoint=peer,
        nbytes=cmd.nbytes,
        handle_id=cmd.handle_id,
        shape=cmd.shape,
        dtype=cmd.dtype,
        sender_seq=seq,
        src_sip=self._self_sip,
        src_cube=self._self_cube,
        src_pe=self._self_pe,
        src_direction=cmd.direction,
    )

    # Forward to PE_DMA (vc_comm)
    yield dma_port.put(token)

    # Diagnostics trace (D14)
    from kernbench.ccl import diagnostics
    if diagnostics.trace_enabled():
        diagnostics.log_send(
            t_ns=float(env.now), sender=self._pe_prefix,
            direction=cmd.direction, nbytes=cmd.nbytes,
            sender_seq=seq,
        )
    if not req.done.triggered:
        req.done.succeed()
shape/dtype/nbytes. + arrived = self._arrived_tokens.get(direction, []) + if arrived: + front = arrived.pop(0) + if self._strict: + expected_nbytes = self._nbytes_for(cmd.shape, cmd.dtype) + if front.dtype != cmd.dtype: + raise ValueError( + f"PE_IPCQ {self._pe_prefix} recv strict: dtype mismatch — " + f"sender={front.dtype} recv={cmd.dtype}" + ) + if front.shape != cmd.shape: + raise ValueError( + f"PE_IPCQ {self._pe_prefix} recv strict: shape mismatch — " + f"sender={front.shape} recv={cmd.shape}" + ) + if front.nbytes != expected_nbytes: + raise ValueError( + f"PE_IPCQ {self._pe_prefix} recv strict: nbytes mismatch — " + f"sender={front.nbytes} recv={expected_nbytes}" + ) + + req.result_data["src_space"] = self._buffer_kind + req.result_data["src_addr"] = slot_addr + req.result_data["direction"] = direction + req.result_data["dtype"] = cmd.dtype + req.result_data["shape"] = cmd.shape + req.result_data["nbytes"] = self._nbytes_for(cmd.shape, cmd.dtype) + + # copy_to_dst mode: rebind the result handle to (dst_space, dst_addr). + # When op_log is disabled, we also do the actual data move now; + # when op_log is enabled, Phase 2 replays the slot→dst copy from + # the op_log entry below so we don't pollute the slot in Phase 1. + if cmd.recv_mode == "copy_to_dst" and self.ctx is not None: + req.result_data["src_space"] = cmd.dst_space + req.result_data["src_addr"] = cmd.dst_addr + store = getattr(self.ctx, "memory_store", None) + if store is not None and self._op_logger is None: + try: + data = store.read(self._buffer_kind, slot_addr, shape=cmd.shape, dtype=cmd.dtype) + store.write(cmd.dst_space, cmd.dst_addr, data) + except Exception: + pass + if self._op_logger is not None: + # Record slot → dst copy for Phase 2 replay (ADR-0023 D9.5). 
+ try: + self._op_logger.record_copy( + t_start=float(env.now), t_end=float(env.now), + component_id=self.node.id, + src_space=self._buffer_kind, src_addr=slot_addr, + dst_space=cmd.dst_space, dst_addr=cmd.dst_addr, + shape=cmd.shape, dtype=cmd.dtype, + nbytes=self._nbytes_for(cmd.shape, cmd.dtype), + ) + except Exception: + pass + + qp["my_tail"] += 1 + + # Diagnostics trace (D14) + from kernbench.ccl import diagnostics + if diagnostics.trace_enabled(): + diagnostics.log_recv( + t_ns=float(env.now), receiver=self._pe_prefix, + direction=direction, + nbytes=req.result_data.get("nbytes", 0), + ) + + # Fast path credit return — bottleneck BW based latency + env.process( + self._delayed_credit_send(env, direction, qp["peer_credit_store"], qp["my_tail"]) + ) + + if not req.done.triggered: + req.done.succeed() + + def _wait_any_direction(self, env: simpy.Environment) -> Generator: + """Round-robin scan over installed directions; wait until at least one + has data. Returns the chosen direction (str).""" + if not self._rr_dirs: + raise IpcqInvalidDirection( + f"PE {self._pe_prefix}: no neighbors installed" + ) + while True: + n = len(self._rr_dirs) + for i in range(n): + idx = (self._rr_cursor + i) % n + d = self._rr_dirs[idx] + qp = self._queue_pairs[d] + if qp["peer_head_cache"] > qp["my_tail"]: + self._rr_cursor = (idx + 1) % n + return d + # Nothing available — wait until any arrival + wait_event = env.event() + self._any_recv_waiters.append(wait_event) + yield wait_event + + # ── Metadata arrival from PE_DMA (D9) ── + + def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None: + token = msg.token + sender_key = (token.src_sip, token.src_cube, token.src_pe) + for d, qp in self._queue_pairs.items(): + p = qp["peer"] + if (p.sip, p.cube, p.pe) == sender_key: + qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1) + # Track arrived token for strict-mode peek + self._arrived_tokens.setdefault(d, []).append(token) + # Wake any blocked recv on this 
direction + waiters = self._recv_waiters.get(d, []) + self._recv_waiters[d] = [] + for ev in waiters: + if not ev.triggered: + ev.succeed() + # Wake any-direction waiters + any_waiters = self._any_recv_waiters + self._any_recv_waiters = [] + for ev in any_waiters: + if not ev.triggered: + ev.succeed() + return + # Unknown sender — silently drop (could log) + + # ── Credit return (fast path) ── + + def _credit_worker(self, env: simpy.Environment) -> Generator: + """Process IpcqCreditMetadata from credit_inbox.""" + assert self._credit_inbox is not None + while True: + credit: IpcqCreditMetadata = yield self._credit_inbox.get() + sender_key = (credit.src_sip, credit.src_cube, credit.src_pe) + for d, qp in self._queue_pairs.items(): + p = qp["peer"] + if (p.sip, p.cube, p.pe) == sender_key: + qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq) + # Wake any blocked send on this direction + waiters = self._send_waiters.get(d, []) + self._send_waiters[d] = [] + for ev in waiters: + if not ev.triggered: + ev.succeed() + break + + def _delayed_credit_send( + self, + env: simpy.Environment, + direction: str, + peer_credit_store: simpy.Store, + new_tail: int, + ) -> Generator: + """Wait bottleneck-BW latency, then put IpcqCreditMetadata into peer + credit store (D9 fast path).""" + latency_ns = self._credit_latency_ns(direction) + if latency_ns > 0: + yield env.timeout(latency_ns) + meta = IpcqCreditMetadata( + consumer_seq=new_tail, + src_sip=self._self_sip, + src_cube=self._self_cube, + src_pe=self._self_pe, + src_direction=direction, + ) + yield peer_credit_store.put(meta) + + def _credit_latency_ns(self, direction: str) -> float: + """Compute credit fast path latency = credit_size / bottleneck_bw. + + Falls back to 0 when ctx/router is unavailable (unit-test mode). 
+ """ + if self.ctx is None: + return 0.0 + qp = self._queue_pairs[direction] + peer = qp["peer"] + peer_pe_prefix = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}" + try: + path = self.ctx.router.find_path(self._pe_prefix, peer_pe_prefix) + return self.ctx.compute_drain_ns(path, self._credit_size_bytes) + except Exception: + return 0.0 + + # ── Helpers ── + + @staticmethod + def _nbytes_for(shape: tuple[int, ...], dtype: str) -> int: + from math import prod + bits = {"f16": 16, "bf16": 16, "f32": 32, "i8": 8, "i16": 16, "i32": 32}.get(dtype, 16) + return prod(shape) * (bits // 8) if shape else 0 diff --git a/src/kernbench/runtime_api/bench_runner.py b/src/kernbench/runtime_api/bench_runner.py index 4f478f7..bc80cc0 100644 --- a/src/kernbench/runtime_api/bench_runner.py +++ b/src/kernbench/runtime_api/bench_runner.py @@ -29,11 +29,10 @@ def run_bench( correlation_id: str = "bench0", completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED, ) -> BenchResult: - """ - Minimal bench runner. + """Minimal bench runner. 
- topology: compiled topology object (opaque to runtime here) - - bench_fn: callable that receives RuntimeContext and submits requests + - bench_fn: callable ``run(torch)`` receiving a RuntimeContext - device: DeviceSelector ("all" or "sip:") - engine_factory: builds sim_engine for given topology & device - completion_policy: how to determine overall completion/result @@ -48,7 +47,6 @@ def run_bench( ) bench_fn(ctx) - ctx.wait_all() collected_traces = ctx._traces or None diff --git a/src/kernbench/runtime_api/context.py b/src/kernbench/runtime_api/context.py index b522ed5..3b2afc6 100644 --- a/src/kernbench/runtime_api/context.py +++ b/src/kernbench/runtime_api/context.py @@ -9,6 +9,39 @@ from kernbench.common.types import Completion, RequestHandle, SimEngine from .types import DeviceSelector +def _world_size_from_spec(spec: dict | None) -> int: + """Derive world_size from topology spec: sips × cubes × pes_per_cube.""" + spec = spec or {} + sips = int(spec.get("system", {}).get("sips", {}).get("count", 1)) + cm = spec.get("sip", {}).get("cube_mesh", {}) + cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1)) + pl = spec.get("cube", {}).get("pe_layout", {}) + corners = pl.get("corners", []) + pe_per_corner = int(pl.get("pe_per_corner", 1)) + pes_per_cube = pe_per_corner * max(len(corners), 1) + return sips * cubes_per_sip * pes_per_cube + + +def _numpy_to_dtype_str(np_dtype) -> str: + """Map numpy dtype → kernbench dtype string used by Tensor.""" + import numpy as np + + kind_map = { + np.float16: "f16", + np.float32: "f32", + np.int8: "i8", + np.int16: "i16", + np.int32: "i32", + np.uint8: "u8", + np.uint16: "u16", + np.uint32: "u32", + } + for np_type, s in kind_map.items(): + if np.dtype(np_dtype) == np.dtype(np_type): + return s + raise ValueError(f"unsupported numpy dtype: {np_dtype!r}") + + @dataclass class RuntimeContext: engine: SimEngine @@ -23,6 +56,66 @@ class RuntimeContext: _tensor_counter: int = field(default=0, init=False) _traces: list[dict] = 
def install_ipcq(
    self,
    algorithm: str | None = None,
    ccl_yaml: str | None = None,
    world_size_override: int | None = None,
    rank_to_pe: list[tuple[int, int, int]] | None = None,
) -> dict:
    """Install IPCQ neighbor tables on all participating PEs (ADR-0023 D10).

    Loads ``ccl.yaml`` (or the path provided), resolves the chosen
    algorithm (or ``defaults.algorithm`` if None), and pushes per-PE
    IpcqInitMsg into every PE_IPCQ component via the engine.

    Args:
        algorithm: name of the algorithm in ccl.yaml (or use defaults).
        ccl_yaml: optional path to ccl.yaml.
        world_size_override: if set, replace the algorithm's world_size.
        rank_to_pe: optional list of ``(sip, cube, pe)`` tuples, passed
            through unchanged to ``kernbench.ccl.install.install_ipcq``.

    Returns the install plan dict (rank → (sip,cube,pe), neighbor table).
    The plan and the merged algorithm config are also stored on the
    context as ``_ipcq_plan`` and ``_ipcq_config``.

    Raises:
        ModuleNotFoundError: if the algorithm module exists but one of
            its own imports is missing. A missing algorithm module (or
            parent package) is tolerated and simply means no algorithm
            module is passed to the installer.
    """
    import importlib
    from kernbench.ccl.install import (
        install_ipcq as _install,
        load_ccl_config,
        resolve_algorithm_config,
    )

    cfg = load_ccl_config(ccl_yaml)
    merged = resolve_algorithm_config(cfg, algorithm)
    if world_size_override is not None:
        merged["world_size"] = world_size_override
    elif "world_size" not in merged:
        # Derive from topology.yaml when neither the algorithm entry
        # nor ``defaults`` carries ``world_size`` (matches pytorch DDP
        # where env vars determine ranks, not the ccl config file).
        merged["world_size"] = _world_size_from_spec(self.spec)

    module_name = merged["module"]
    algo_module = None
    try:
        algo_module = importlib.import_module(module_name)
    except ModuleNotFoundError as exc:
        # Only "the algorithm module itself (or a parent package) is
        # absent" is an expected condition. If some other module is
        # missing, the algorithm module was found but is broken;
        # swallowing that would silently drop its hooks, so re-raise.
        # When the exception carries no module name we cannot tell the
        # two cases apart and keep the tolerant behaviour.
        missing = exc.name
        if missing and not (
            module_name == missing or module_name.startswith(f"{missing}.")
        ):
            raise

    plan = _install(
        self.engine, self.spec, merged,
        algo_module=algo_module, rank_to_pe=rank_to_pe,
    )
    self._ipcq_plan = plan
    self._ipcq_config = merged
    return plan
+ _kernels[kernel_name] = kernel_fn # Collect tensors and scalars tensor_args: list[Tensor] = [] @@ -506,6 +616,7 @@ class RuntimeContext: # Per-SIP kernel launch: each SIP gets TensorArgs with local va_base last_handle = None + _pending_handles: list[tuple[Any, int]] = [] for sip_id in sorted(sip_set): sip_kernel_args: list = [] sip_cube_set: set[int] = set() @@ -566,10 +677,17 @@ class RuntimeContext: target_cubes=target_cubes, target_pe=target_pe, )) + # Defer wait until all SIPs are submitted (multi-SIP CCL needs + # all participating PEs to be live concurrently — waiting + # per-SIP would deadlock when ranks span SIP boundaries). + _pending_handles.append((h, sip_id)) + last_handle = h + + # Drain pending handles now that every SIP has a launch posted. + for h, sip_id in _pending_handles: self.wait(h, _meta={ "phase": "kernel", "name": kernel_name, "sip": sip_id, "target_pe": target_pe, }) - last_handle = h return last_handle diff --git a/src/kernbench/runtime_api/distributed.py b/src/kernbench/runtime_api/distributed.py new file mode 100644 index 0000000..e2a3231 --- /dev/null +++ b/src/kernbench/runtime_api/distributed.py @@ -0,0 +1,179 @@ +"""PyTorch-compatible distributed communication shim (ADR-0023 D11). + +Provides a ``torch.distributed``-like API whose public surface matches +real PyTorch so that bench code looks identical to a DDP training script. + +Only the ``ahbm`` backend is implemented. It: + +1. Reads ``ccl.yaml`` to decide which collective algorithm to run. +2. Derives world_size from the algorithm entry, the defaults section, or + from the topology spec (``system.sips.count × sip.cube_mesh × pe_layout``). +3. At ``init_process_group`` time, eagerly installs the IPCQ neighbor + table once (one-time comm setup — mirrors NCCL communicator creation). +4. On each ``all_reduce(tensor)`` call, reads per-shard metadata from the + tensor handle and dispatches ``torch.launch`` with the registered + kernel. 
class AhbmCCLBackend:
    """Ahbm CCL backend: runs the configured collective as a kernel launch.

    Construction loads ``ccl.yaml``, imports the algorithm module named by
    the resolved config, settles the group size and installs the IPCQ
    neighbor tables once through ``ctx.install_ipcq``.
    """

    def __init__(self, torch_ctx: Any) -> None:
        from kernbench.ccl.install import (
            load_ccl_config,
            resolve_algorithm_config,
        )

        self.ctx = torch_ctx
        self._cfg_all = load_ccl_config()
        self._merged = resolve_algorithm_config(self._cfg_all)
        self._algo_module = importlib.import_module(self._merged["module"])
        self._world_size = self._resolve_world_size()

        # One-time communicator setup at ``init_process_group`` time; every
        # later collective on this group reuses the installed tables.
        self.ctx.install_ipcq(
            algorithm=self._merged["algorithm"],
            world_size_override=self._world_size,
        )

    def _resolve_world_size(self) -> int:
        """Return the group size.

        Priority: the resolved algorithm entry, then ``defaults`` in
        ``ccl.yaml``, then the topology product
        ``sips * cubes_per_sip * pes_per_cube`` (missing keys count as 1).
        """
        merged = self._merged
        if "world_size" in merged:
            return int(merged["world_size"])

        fallback = self._cfg_all.get("defaults", {})
        if "world_size" in fallback:
            return int(fallback["world_size"])

        topo = self.ctx.spec or {}
        sip_count = int(topo.get("system", {}).get("sips", {}).get("count", 1))
        mesh = topo.get("sip", {}).get("cube_mesh", {})
        cube_count = int(mesh.get("w", 1)) * int(mesh.get("h", 1))
        layout = topo.get("cube", {}).get("pe_layout", {})
        corner_count = max(len(layout.get("corners", [])), 1)
        pe_count = int(layout.get("pe_per_corner", 1)) * corner_count
        return sip_count * cube_count * pe_count

    @property
    def world_size(self) -> int:
        """Group size settled at construction time."""
        return self._world_size

    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
        """Launch the configured CCL kernel on ``tensor``.

        Raises:
            NotImplementedError: if ``op`` is anything other than ``"sum"``.
            RuntimeError: if the tensor has no handle, or its shard count
                differs from the installed world size.
        """
        if op != "sum":
            raise NotImplementedError(f"all_reduce op={op!r} not supported")

        handle = tensor._handle
        if handle is None:
            raise RuntimeError(
                f"Tensor '{tensor.name}' is not deployed (call torch.zeros "
                "with a DPPolicy first)"
            )

        shards = handle.shards
        if len(shards) != self._world_size:
            raise RuntimeError(
                f"all_reduce tensor has {len(shards)} shards but the "
                f"ahbm backend was installed with world_size="
                f"{self._world_size}; adjust the tensor's DPPolicy or "
                "restart the process group"
            )

        # Element count is taken from the first shard's byte size.
        per_shard_elems = shards[0].nbytes // tensor.itemsize
        algo = self._algo_module
        extra_args = algo.kernel_args(self._world_size, per_shard_elems)
        self.ctx.launch(
            self._merged["algorithm"], algo.kernel, tensor, *extra_args,
        )

    def barrier(self) -> None:
        # Single-driver model: nothing to synchronize. The method exists so
        # ``dist.barrier()`` stays callable (pytorch-compat surface).
        return None
class DistributedContext:
    """torch.distributed-compatible facade over the ahbm backend.

    The method names mirror real PyTorch so bench code reads like a DDP
    script. ``get_rank()`` always returns 0; ``get_world_size()`` returns
    the backend's ``world_size``. Every accessor except
    ``is_initialized`` raises ``RuntimeError`` until
    ``init_process_group`` has succeeded.
    """

    def __init__(self) -> None:
        # Stays None until init_process_group() builds the backend.
        self._backend: AhbmCCLBackend | None = None

    def _ensure_initialized(self) -> None:
        """Raise ``RuntimeError`` unless a backend has been created."""
        if self._backend is not None:
            return
        raise RuntimeError(
            "Default process group has not been initialized. "
            "Call init_process_group(backend='ahbm') first."
        )

    def init_process_group(
        self,
        backend: str = "ahbm",
        world_size: int | None = None,
        rank: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Create the default process group.

        ``world_size``, ``rank`` and any extra keyword arguments are
        accepted for API parity with
        ``torch.distributed.init_process_group`` and are not used here.

        Raises:
            ValueError: if ``backend`` is not ``"ahbm"``.
            RuntimeError: if no ``_ctx_ref`` back-reference has been
                attached to this object.
        """
        if backend != "ahbm":
            raise ValueError(
                f"Unsupported backend '{backend}'. Only 'ahbm' is supported."
            )
        # ``_ctx_ref`` is attached from outside; it may be absent entirely.
        try:
            owner = self._ctx_ref
        except AttributeError:
            owner = None
        if owner is None:
            raise RuntimeError(
                "DistributedContext not bound to a RuntimeContext"
            )
        self._backend = AhbmCCLBackend(torch_ctx=owner)

    def is_initialized(self) -> bool:
        return self._backend is not None

    def get_world_size(self) -> int:
        self._ensure_initialized()
        return self._backend.world_size

    def get_rank(self) -> int:
        # Single-driver kernbench: there is only one host rank.
        self._ensure_initialized()
        return 0

    def get_backend(self) -> str:
        self._ensure_initialized()
        return "ahbm"

    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
        self._ensure_initialized()
        self._backend.all_reduce(tensor, op=op)

    def barrier(self) -> None:
        self._ensure_initialized()
        self._backend.barrier()
+ entries: tuple = () + backpressure_mode: str = "sleep" # "poll" | "sleep" + buffer_kind: str = "tcm" # "tcm" | "hbm" | "sram" + credit_size_bytes: int = 16 + msg_type: Literal["ipcq_init"] = "ipcq_init" diff --git a/src/kernbench/runtime_api/tensor.py b/src/kernbench/runtime_api/tensor.py index 7fa40c6..05f86d2 100644 --- a/src/kernbench/runtime_api/tensor.py +++ b/src/kernbench/runtime_api/tensor.py @@ -146,6 +146,11 @@ class Tensor: self._handle: TensorHandle | None = None self._ctx_ref: weakref.ref | None = None # set by RuntimeContext self._memory_store = None # set by RuntimeContext when enable_data=True + # Host-side staging buffer for torch.from_numpy() results. A tensor + # with a non-None _host_buffer is NOT deployed to any PE — it lives + # only on the host. Use `target.copy_(host_tensor)` to scatter the + # data into a deployed, sharded target tensor. + self._host_buffer: np.ndarray | None = None def __del__(self) -> None: if self._ctx_ref is None or self._handle is None: @@ -166,15 +171,85 @@ class Tensor: @property def data(self) -> np.ndarray: - """Tensor data as numpy array. Returns actual values when enable_data=True, - zeros placeholder otherwise (like an uninitialized tensor).""" - if self._memory_store is not None and self._handle is not None: - shard = self._handle.shards[0] + """Tensor data as numpy array. + + Gathers all shards into a single full-shape array. Returns actual + values when enable_data=True, zeros placeholder otherwise (like an + uninitialized tensor). Alias of ``numpy()``. + """ + return self.numpy() + + def _shard_store_addr(self, shard: TensorShard) -> int: + """MemoryStore key for a shard. + + Kernels read tensors via VA (translated to PA by PE_DMA's MMU when + a mapping exists, otherwise the addr is treated as a PA-equivalent + key). Tensor I/O therefore writes/reads at ``va_base + offset_bytes`` + when ``va_base`` is set, falling back to ``shard.pa`` for the + VA-less mode used by some legacy paths. 
+ """ + if self._handle and self._handle.va_base: + return self._handle.va_base + shard.offset_bytes + return shard.pa + + def numpy(self) -> np.ndarray: + """Return a single numpy array gathered from all shards. + + Mirrors ``torch.Tensor.numpy()``. In kernbench, sharded tensors are + gathered into a single full-shape ndarray according to each shard's + ``offset_bytes`` / ``nbytes`` range. + """ + np_dtype = _numpy_dtype(self.dtype) + # Host-side tensor (created via torch.from_numpy) has no shards. + if self._host_buffer is not None: + return self._host_buffer.copy() + if self._handle is None or self._memory_store is None: + return np.zeros(self.shape, dtype=np_dtype) + flat = np.zeros(math.prod(self.shape), dtype=np_dtype) + for shard in self._handle.shards: + start = shard.offset_bytes // self.itemsize + count = shard.nbytes // self.itemsize try: - return self._memory_store.read("hbm", shard.pa, shape=self.shape, dtype=self.dtype) + piece = self._memory_store.read( + "hbm", self._shard_store_addr(shard), + ) except KeyError: - pass - return np.zeros(self.shape, dtype=_numpy_dtype(self.dtype)) + continue + flat[start : start + count] = ( + np.asarray(piece, dtype=np_dtype).reshape(-1)[:count] + ) + return flat.reshape(self.shape) + + def copy_(self, source: "Tensor") -> "Tensor": + """In-place copy from another tensor into self. + + Mirrors ``torch.Tensor.copy_()``. If ``source`` is a host tensor + (from ``torch.from_numpy``), its ndarray is split across self's + shards using each shard's byte range. If ``source`` is a deployed + (sharded) tensor, its contents are gathered first and then + re-scattered into self's shard layout. + + Shapes must match. Returns self. 
def _execute_memory(self, op: OpRecord) -> None:
    """Replay a memory copy op in Phase 2 (ADR-0020 + ADR-0023).

    Only ``dma_write`` and ``ipcq_copy`` records are acted on; every other
    memory op (for example ``dma_read``) is a no-op here. For the two
    handled ops the payload is copied to ``(dst_space, dst_addr)``:

    - if the record carries a non-None ``snapshot``, that snapshot is
      written, so later mutations of the source addr do not leak in;
    - otherwise the payload is read from ``(src_space, src_addr)`` in the
      store, and the op is dropped when that read raises ``KeyError``.

    Records missing any of the four src/dst fields are silently skipped.
    """
    params = op.params
    if op.op_name not in ("dma_write", "ipcq_copy"):
        return

    endpoints = [
        params.get(key)
        for key in ("src_space", "src_addr", "dst_space", "dst_addr")
    ]
    if any(value is None for value in endpoints):
        # Legacy record without copy metadata.
        return
    src_space, src_addr, dst_space, dst_addr = endpoints

    payload = params.get("snapshot")
    if payload is None:
        # No record-time snapshot: fall back to whatever the store holds
        # for the source right now.
        try:
            payload = self.store.read(
                src_space, src_addr,
                shape=params.get("shape"), dtype=params.get("dtype"),
            )
        except KeyError:
            return
    self.store.write(dst_space, dst_addr, payload)
+ snapshots = p.get("input_snapshots") or [None] * len(input_addrs) + dst_space = p.get("dst_space", p.get("addr_space", "tcm")) inputs = [] - for addr, shape in zip(input_addrs, input_shapes): - inputs.append(self.store.read(space, addr, shape=shape, dtype=dtype)) + for addr, shape, space, idtype, snap in zip( + input_addrs, input_shapes, input_spaces, input_dtypes, snapshots + ): + if snap is not None: + inputs.append(snap) + else: + inputs.append(self.store.read(space, addr, shape=shape, dtype=idtype)) result = _compute_math(math_op, inputs, p.get("axis")) if result is not None: - self.store.write(space, p["dst_addr"], result) + self.store.write(dst_space, p["dst_addr"], result) def verify(self, expected: dict[tuple[str, int], np.ndarray], rtol: float = 1e-3, atol: float = 1e-3) -> dict[str, bool]: @@ -146,6 +198,14 @@ def _compute_math(op: str, inputs: list[np.ndarray], axis: int | None) -> np.nda if op == "min": return np.min(x, axis=axis, keepdims=True) + # Softmax (numerically stable) + if op == "softmax": + ax = axis if axis is not None else -1 + x_max = np.max(x, axis=ax, keepdims=True) + e = np.exp(x - x_max) + s = np.sum(e, axis=ax, keepdims=True) + return e / s + # Binary if len(inputs) >= 2: y = inputs[1] @@ -157,9 +217,18 @@ def _compute_math(op: str, inputs: list[np.ndarray], axis: int | None) -> np.nda return x * y if op == "div": return x / y + if op == "maximum": + return np.maximum(x, y) + if op == "minimum": + return np.minimum(x, y) # Ternary - if op == "where" and len(inputs) >= 3: - return np.where(inputs[0], inputs[1], inputs[2]) + if len(inputs) >= 3: + if op == "where": + return np.where(inputs[0], inputs[1], inputs[2]) + if op == "fma": + return inputs[0] * inputs[1] + inputs[2] + if op == "clamp": + return np.minimum(np.maximum(inputs[0], inputs[1]), inputs[2]) return None diff --git a/src/kernbench/sim_engine/engine.py b/src/kernbench/sim_engine/engine.py index eef7c46..0995397 100644 --- a/src/kernbench/sim_engine/engine.py +++ 
b/src/kernbench/sim_engine/engine.py @@ -51,8 +51,12 @@ class GraphEngine: if enable_data: from kernbench.sim_engine.memory_store import MemoryStore from kernbench.sim_engine.op_log import OpLogger - self._op_logger = OpLogger() self._memory_store = MemoryStore() + self._op_logger = OpLogger(memory_store=self._memory_store) + # Cursor for incremental Phase 2 replay (ADR-0020 D6). + # SimPy env.now is monotonic so newly logged records always sort + # to the tail; the cursor remains valid across waits. + self._data_cursor = 0 ctx = ComponentContext( router=self._router, @@ -147,11 +151,60 @@ class GraphEngine: self._env.process(self._process(str(handle), request, event)) return handle + def _flush_data_phase(self) -> None: + """Replay newly recorded op_log entries through DataExecutor. + + ADR-0020 D6 Phase 2: when data tracking is enabled, run DataExecutor + on records added since the last flush so that callers reading + MemoryStore between launches observe correct (compute-replayed) + tensor data. + + Cursor-based incremental replay is necessary because Phase 2 is + NOT idempotent across full re-runs: a math op writes a TCM scratch + addr, a later dma_write copies that scratch into HBM[X], and an + even-later math op may then read HBM[X]. Re-running everything + from scratch would let the second pass's first math op read the + already-overwritten HBM[X] instead of the original input. 
+ """ + if self._op_logger is None or self._memory_store is None: + return + records = self._op_logger.records # sorted by t_start (stable) + if self._data_cursor >= len(records): + return + new_records = records[self._data_cursor:] + from kernbench.sim_engine.data_executor import DataExecutor + DataExecutor(new_records, self._memory_store).run() + self._data_cursor = len(records) + def wait(self, handle: RequestHandle) -> None: key = str(handle) event = self._events[key] if not event.triggered: - self._env.run(until=event) + try: + self._env.run(until=event) + except (simpy.core.EmptySchedule, RuntimeError) as exc: + # SimPy raises EmptySchedule directly OR (in newer simpy) + # wraps it as a RuntimeError("No scheduled events left ..."). + # Either case while our event is still pending → IPCQ deadlock. + msg = str(exc) + is_deadlock = ( + isinstance(exc, simpy.core.EmptySchedule) + or "No scheduled events left" in msg + ) + if not is_deadlock: + raise + from kernbench.ccl.diagnostics import IpcqDeadlock, pointer_dump + dump = pointer_dump(self) + if dump.strip(): + raise IpcqDeadlock( + "IPCQ deadlock: simulation schedule empty while " + f"request {handle!r} is still pending.\n" + f"Pointer state:\n{dump}" + ) from None + raise + # ADR-0020: replay newly logged ops so the caller observes + # post-Phase-2 tensor state from MemoryStore. + self._flush_data_phase() def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]: return self._results[str(handle)] diff --git a/src/kernbench/sim_engine/op_log.py b/src/kernbench/sim_engine/op_log.py index bf0f5de..f0accd8 100644 --- a/src/kernbench/sim_engine/op_log.py +++ b/src/kernbench/sim_engine/op_log.py @@ -29,9 +29,13 @@ class OpLogger: Records are maintained in t_start stable ordering (insertion order). 
""" - def __init__(self) -> None: + def __init__(self, memory_store: Any | None = None) -> None: self._records: list[OpRecord] = [] self._pending: dict[int, dict[str, Any]] = {} # msg id → partial record + # Optional MemoryStore reference. When set, math op records capture + # input data snapshots at record_end time so Phase 2 replay does + # not depend on slot/scratch addrs surviving until math runs. + self._memory_store = memory_store @property def records(self) -> list[OpRecord]: @@ -53,6 +57,38 @@ class OpLogger: if pending is None: return op_kind, op_name, params = _extract_op_info(msg) + # Snapshot data at record time so Phase 2 replay sidesteps + # downstream mutations of source addrs (e.g. a tl.store that + # overwrites HBM after a load handle was sent, or a slot that + # gets reused on the next ring round). + if self._memory_store is not None: + if op_kind == "math": + snaps: list[Any] = [] + for addr, shape, space, idtype in zip( + params.get("input_addrs", []), + params.get("input_shapes", []), + params.get("input_spaces", []), + params.get("input_dtypes", []), + ): + try: + arr = self._memory_store.read( + space, addr, shape=shape, dtype=idtype, + ) + snaps.append(arr.copy() if hasattr(arr, "copy") else arr) + except Exception: + snaps.append(None) + params["input_snapshots"] = snaps + elif op_name == "dma_write": + try: + arr = self._memory_store.read( + params["src_space"], params["src_addr"], + shape=params.get("shape"), dtype=params.get("dtype"), + ) + params["snapshot"] = ( + arr.copy() if hasattr(arr, "copy") else arr + ) + except Exception: + params["snapshot"] = None self._records.append(OpRecord( t_start=pending["t_start"], t_end=t, @@ -62,6 +98,45 @@ class OpLogger: params=params, )) + def record_copy( + self, t_start: float, t_end: float, component_id: str, + src_space: str, src_addr: int, + dst_space: str, dst_addr: int, + shape: tuple[int, ...], dtype: str, nbytes: int, + ) -> None: + """Record a memory copy op for Phase 2 replay (ADR-0023 
+ ADR-0020). + + Used by PE_DMA at outbound (sender) time: the snapshot captures + the source data at the moment the send was issued, so Phase 2 + replay does not see later mutations of the source addr (e.g. a + tl.store that runs after the recv at the sender). + + For sources whose data is not yet materialized in Phase 1 (math + scratch outputs), the snapshot is None and Phase 2 falls back to + reading from MemoryStore — by which point the corresponding math + op has been replayed and the scratch addr is populated. + """ + snap = None + if self._memory_store is not None: + try: + arr = self._memory_store.read( + src_space, src_addr, shape=shape, dtype=dtype, + ) + snap = arr.copy() if hasattr(arr, "copy") else arr + except Exception: + snap = None + self._records.append(OpRecord( + t_start=t_start, t_end=t_end, + component_id=component_id, + op_kind="memory", op_name="ipcq_copy", + params={ + "src_space": src_space, "src_addr": src_addr, + "dst_space": dst_space, "dst_addr": dst_addr, + "shape": shape, "dtype": dtype, "nbytes": nbytes, + "snapshot": snap, + }, + )) + def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]: """Extract op_kind, op_name, params from a data_op message.""" @@ -76,6 +151,11 @@ def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]: } if isinstance(msg, DmaWriteCmd): return "memory", "dma_write", { + "src_space": getattr(msg.handle, "space", "tcm"), + "src_addr": msg.handle.addr, + "shape": msg.handle.shape, + "dtype": msg.handle.dtype, + "dst_space": "hbm", "dst_addr": msg.dst_addr, "nbytes": msg.nbytes, "handle_id": msg.handle.id, @@ -96,7 +176,10 @@ def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]: return "math", msg.op, { "input_addrs": [h.addr for h in msg.inputs], "input_shapes": [h.shape for h in msg.inputs], + "input_spaces": [getattr(h, "space", "tcm") for h in msg.inputs], + "input_dtypes": [h.dtype for h in msg.inputs], "dst_addr": msg.out.addr, + "dst_space": getattr(msg.out, "space", 
"tcm"), "shape_out": msg.out.shape, "dtype": msg.out.dtype, "axis": msg.axis, diff --git a/src/kernbench/topology/builder.py b/src/kernbench/topology/builder.py index ae18f3c..2337a1b 100644 --- a/src/kernbench/topology/builder.py +++ b/src/kernbench/topology/builder.py @@ -25,6 +25,7 @@ _PE_COMP_OFFSETS = { "pe_math": (0.0, 0.15), "pe_mmu": (0.15, -0.15), "pe_tcm": (0.3, 0.0), + "pe_ipcq": (-0.15, 0.15), } @@ -698,6 +699,20 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None: kind="pe_internal", )) + # PE_IPCQ edges (ADR-0023 D1, D9 D10) + ipcq_edges = [ + ("pe_cpu", "pe_ipcq", "cpu_to_ipcq_mm"), # IpcqRequest + ("pe_ipcq", "pe_dma", "ipcq_to_dma_mm"), # IpcqDmaToken outbound + ("pe_dma", "pe_ipcq", "dma_to_ipcq_mm"), # IpcqMetaArrival inbound + ] + for src_c, dst_c, mm_key in ipcq_edges: + if mm_key in pe_links: + edges.append(Edge( + src=f"{pp}.{src_c}", dst=f"{pp}.{dst_c}", + distance_mm=pe_links[mm_key], + kind="pe_internal", + )) + # ── Inter-cube / IO / system edges ────────────────────────────────── @@ -765,7 +780,13 @@ def _add_io_to_cube_edges( def _add_system_to_io_edges( edges: list[Edge], sp: str, sip_spec: dict, system: dict, ) -> None: - """Add fabric switch → IO chiplet PCIe edges.""" + """Add bidirectional fabric switch ↔ IO chiplet PCIe edges. + + Both directions are needed: + switch → pcie_ep for host→device traffic (memory writes, kernel launch) + pcie_ep → switch for device-side outbound traffic (cross-SIP IPCQ + send between PE_DMAs through the system switch). 
+ """ sw_id = "fabric.switch0" sys_link = system["links"]["io_ep_to_switch"] for inst in sip_spec["iochiplet"]["instances"]: @@ -776,6 +797,12 @@ def _add_system_to_io_edges( bw_gbs=sys_link["bw_gbs_per_ep"], kind="pcie", )) + edges.append(Edge( + src=pcie_ep_id, dst=sw_id, + distance_mm=sys_link["distance_mm"], + bw_gbs=sys_link["bw_gbs_per_ep"], + kind="pcie", + )) # ── View builders ──────────────────────────────────────────────────── @@ -1113,13 +1140,14 @@ def _build_pe_view(spec: dict) -> ViewGraph: "pe_math": (7.0, 6.5), "pe_mmu": (4.0, 1.5), "pe_tcm": (10.0, 4.0), + "pe_ipcq": (4.0, 6.5), } nodes: dict[str, Node] = {} view_edges: list[Edge] = [] for comp_name, comp_spec in pe_tmpl["components"].items(): - px, py = positions[comp_name] + px, py = positions.get(comp_name, (1.0, 1.0)) nodes[comp_name] = Node( id=comp_name, kind=comp_spec["kind"], impl=comp_spec["impl"], attrs=comp_spec["attrs"], pos_mm=(px, py), diff --git a/src/kernbench/triton_emu/kernel_runner.py b/src/kernbench/triton_emu/kernel_runner.py index 593733f..ca25ea1 100644 --- a/src/kernbench/triton_emu/kernel_runner.py +++ b/src/kernbench/triton_emu/kernel_runner.py @@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any import simpy from greenlet import greenlet +from kernbench.common.ipcq_types import IpcqRecvCmd, IpcqRequest, IpcqSendCmd, RecvFuture from kernbench.common.pe_commands import ( CompletionHandle, CompositeCmd, @@ -51,6 +52,9 @@ class KernelRunner: out_ports: dict[str, simpy.Store], store: MemoryStore | None = None, num_cubes: int = 1, + ipcq_id: str | None = None, + scratch_base: int = 0, + scratch_size: int = 1 << 20, ) -> None: self._pe_prefix = pe_prefix self._pe_idx = pe_idx @@ -61,6 +65,13 @@ class KernelRunner: self._out_ports = out_ports self._store = store self._parent: greenlet | None = None + # Optional IPCQ port (ADR-0023). If None, IPCQ commands raise. 
+ self._ipcq_id = ipcq_id or f"{pe_prefix}.pe_ipcq" + # PE-local scratch for compute output TensorHandles (ADR-0020 D3 + # extension). The TLContext allocates from this pool when math/dot + # ops produce a result that may later be used as a send/store source. + self._scratch_base = scratch_base + self._scratch_size = scratch_size def run( self, @@ -89,7 +100,10 @@ class KernelRunner: num_cubes=self._num_cubes, dispatch_cycles=0, runner=self, + scratch_base=self._scratch_base, + scratch_size=self._scratch_size, ) + self._tl = tl # exposed so switch_to_simpy can re-set on restore def _kernel_entry(): TLContext._set_active(tl) # type: ignore[attr-defined] @@ -103,13 +117,20 @@ class KernelRunner: pending: dict[str, simpy.Event] = {} composite_results: list[dict] = [] + # Helper: set our tl as active just before resuming the kernel. + # Multiple PE kernel runners share the same thread-local; without + # this, another runner's kernel may have left a different context. + def _switch_kernel(*args): + TLContext._set_active(tl) # type: ignore[attr-defined] + return g.switch(*args) + # Start kernel — first switch returns first command (or None if kernel is done) - cmd = g.switch() + cmd = _switch_kernel() while cmd is not None: if isinstance(cmd, PeCpuOverheadCmd): yield env.timeout(cmd.cycles) - cmd = g.switch() + cmd = _switch_kernel() elif isinstance(cmd, WaitCmd): if cmd.handle is not None: @@ -120,7 +141,7 @@ class KernelRunner: for evt in pending.values(): yield evt pending.clear() - cmd = g.switch() + cmd = _switch_kernel() elif isinstance(cmd, DmaReadCmd): # Dispatch DMA through SimPy components @@ -141,10 +162,12 @@ class KernelRunner: ) except KeyError: pass - cmd = g.switch(data) + cmd = _switch_kernel(data) elif isinstance(cmd, DmaWriteCmd): - # Write to MemoryStore first (visibility = issue, ADR-0020 D3) + # Write to MemoryStore first (visibility = issue, ADR-0020 D3). + # When data is None (e.g. 
timing-only TensorHandle math result), + # this is a no-op; Phase 2 dma_write replay handles those. if self._store is not None and cmd.handle.data is not None: self._store.write("hbm", cmd.dst_addr, cmd.handle.data) @@ -154,7 +177,7 @@ class KernelRunner: ) yield self._out_ports[self._scheduler_id].put(pe_txn) yield done_evt - cmd = g.switch() + cmd = _switch_kernel() elif isinstance(cmd, CompositeCmd): # Non-blocking composite @@ -165,7 +188,7 @@ class KernelRunner: composite_results.append(pe_txn.result_data) yield self._out_ports[self._scheduler_id].put(pe_txn) pending[cmd.completion.id] = done_evt - cmd = g.switch() + cmd = _switch_kernel() elif isinstance(cmd, (GemmCmd, MathCmd)): # Blocking compute command @@ -175,7 +198,90 @@ class KernelRunner: ) yield self._out_ports[self._scheduler_id].put(pe_txn) yield done_evt - cmd = g.switch() + cmd = _switch_kernel() + + elif isinstance(cmd, IpcqSendCmd): + # Forward IpcqRequest to PE_IPCQ, wait for done + if self._ipcq_id not in self._out_ports: + raise RuntimeError( + f"PE_IPCQ port {self._ipcq_id!r} not wired to runner" + ) + done_evt = env.event() + req = IpcqRequest(command=cmd, done=done_evt) + yield self._out_ports[self._ipcq_id].put(req) + yield done_evt + cmd = _switch_kernel() + + elif isinstance(cmd, IpcqRecvCmd): + if self._ipcq_id not in self._out_ports: + raise RuntimeError( + f"PE_IPCQ port {self._ipcq_id!r} not wired to runner" + ) + done_evt = env.event() + req = IpcqRequest(command=cmd, done=done_evt) + yield self._out_ports[self._ipcq_id].put(req) + yield done_evt + # Read actual data from MemoryStore at the slot address + data = None + src_space = req.result_data.get("src_space", "tcm") + src_addr = req.result_data.get("src_addr", 0) + if self._store is not None: + try: + data = self._store.read( + src_space, src_addr, + shape=cmd.shape, dtype=cmd.dtype, + ) + except KeyError: + pass + # Build result dict for tl.recv to wrap in TensorHandle + result = { + "data": data, + "src_space": src_space, + 
"src_addr": src_addr, + "direction": req.result_data.get("direction", cmd.direction), + "dtype": cmd.dtype, + "shape": cmd.shape, + "nbytes": req.result_data.get("nbytes", 0), + } + cmd = _switch_kernel(result) + + elif isinstance(cmd, tuple) and len(cmd) == 2 and cmd[0] == "recv_async": + # Non-blocking recv: post the IpcqRequest now, store the + # event in the future, return None to kernel. + future: RecvFuture = cmd[1] + done_evt = env.event() + req = IpcqRequest(command=future.cmd, done=done_evt) + future.request = req + future.event = done_evt + yield self._out_ports[self._ipcq_id].put(req) + cmd = _switch_kernel(None) + + elif isinstance(cmd, tuple) and len(cmd) == 2 and cmd[0] == "recv_wait": + future = cmd[1] + if not future.event.triggered: + yield future.event + req = future.request + src_space = req.result_data.get("src_space", "tcm") + src_addr = req.result_data.get("src_addr", 0) + data = None + if self._store is not None: + try: + data = self._store.read( + src_space, src_addr, + shape=future.cmd.shape, dtype=future.cmd.dtype, + ) + except KeyError: + pass + result = { + "data": data, + "src_space": src_space, + "src_addr": src_addr, + "direction": req.result_data.get("direction", future.cmd.direction), + "dtype": future.cmd.dtype, + "shape": future.cmd.shape, + "nbytes": req.result_data.get("nbytes", 0), + } + cmd = _switch_kernel(result) else: # Unknown command — pass through as blocking @@ -185,7 +291,7 @@ class KernelRunner: ) yield self._out_ports[self._scheduler_id].put(pe_txn) yield done_evt - cmd = g.switch() + cmd = _switch_kernel() # Wait remaining pending composites for evt in pending.values(): diff --git a/src/kernbench/triton_emu/tl_context.py b/src/kernbench/triton_emu/tl_context.py index 3498a84..22e43d6 100644 --- a/src/kernbench/triton_emu/tl_context.py +++ b/src/kernbench/triton_emu/tl_context.py @@ -17,6 +17,7 @@ from __future__ import annotations import math from typing import Literal +from kernbench.common.ipcq_types import 
IpcqRecvCmd, IpcqSendCmd, RecvFuture from kernbench.common.pe_commands import ( CompletionHandle, CompositeCmd, @@ -55,6 +56,8 @@ class TLContext: runner: Any = None, cube_id: int = 0, num_cubes: int = 1, + scratch_base: int = 0, + scratch_size: int = 1 << 20, # 1 MiB per kernel invocation ) -> None: self._pe_id = pe_id self._num_programs = num_programs @@ -65,6 +68,33 @@ class TLContext: self._handle_counter = 0 self._completion_counter = 0 self._runner = runner # KernelRunner for greenlet mode (ADR-0020 D3) + # PE-local scratch allocator for math/compute output handles. + # Each binary/unary/reduction op auto-allocates a unique addr from + # this pool so the resulting TensorHandle can be the source of a + # later tl.send / tl.store. Cursor resets on every kernel invocation. + self._scratch_base = scratch_base + self._scratch_size = scratch_size + self._scratch_cursor = 0 + + def _scratch_alloc(self, nbytes: int) -> int: + """Allocate a unique scratch address for an output TensorHandle. + + Returns 0 if no scratch base was configured (e.g. command-list mode); + in that case the resulting handle has addr=0 and cannot be used as a + send/store source. Greenlet/runner mode always supplies a base. 
+ """ + if self._scratch_base == 0: + return 0 + # 16-byte alignment + aligned = (nbytes + 15) & ~15 + addr = self._scratch_base + self._scratch_cursor + self._scratch_cursor += aligned + if self._scratch_cursor > self._scratch_size: + raise RuntimeError( + f"TLContext scratch overflow: requested {nbytes}B, " + f"used {self._scratch_cursor}/{self._scratch_size}B" + ) + return addr @property def commands(self) -> list[PeCommand]: @@ -93,11 +123,30 @@ class TLContext: def _make_handle( self, addr: int, shape: tuple[int, ...], dtype: str, + space: str = "tcm", ) -> TensorHandle: return TensorHandle( id=self._next_handle_id(), addr=addr, shape=shape, dtype=dtype, nbytes=self._nbytes(shape, dtype), + space=space, + ) + + def _make_compute_out( + self, shape: tuple[int, ...], dtype: str, + ) -> TensorHandle: + """Allocate an output TensorHandle in PE-local scratch (TCM space). + + Used by math/compute ops so the result has a real address that can + be the source of a later send/store. The data field stays None in + Phase 1 — Phase 2 DataExecutor fills the actual ndarray. + """ + nbytes = self._nbytes(shape, dtype) + addr = self._scratch_alloc(nbytes) + return TensorHandle( + id=self._next_handle_id(), + addr=addr, shape=shape, dtype=dtype, + nbytes=nbytes, space="tcm", ) # ── Reference (no DMA, metadata only) ──────────────────────── @@ -124,20 +173,26 @@ class TLContext: def load( self, ptr: int, shape: tuple[int, ...], dtype: str = "f16", ) -> TensorHandle: - """Load tensor from HBM to TCM. Returns TensorHandle. + """Load tensor from HBM. Returns TensorHandle pointing at HBM[ptr]. In greenlet mode: returns TensorHandle with actual numpy data. In command-list mode: returns TensorHandle with data=None. + + The returned handle's ``space`` is "hbm" so subsequent ops (math, + send, store) using this handle as a source resolve via MemoryStore + at ``(hbm, ptr)`` — which is where the load's underlying data + actually lives in Phase 2 storage. 
""" self._emit_dispatch_overhead() - handle = self._make_handle(addr=ptr, shape=shape, dtype=dtype) + handle = self._make_handle(addr=ptr, shape=shape, dtype=dtype, space="hbm") cmd = DmaReadCmd(handle=handle, src_addr=ptr, nbytes=handle.nbytes) data = self._emit(cmd) if data is not None: - # Greenlet mode: attach real data to handle + # Greenlet mode: attach real data to handle (preserve space) return TensorHandle( id=handle.id, addr=handle.addr, shape=handle.shape, dtype=handle.dtype, nbytes=handle.nbytes, data=data, + space=handle.space, ) return handle @@ -162,7 +217,7 @@ class TLContext: raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}") out_shape = (*a.shape[:-2], m, n) out_dtype = a.dtype - out = self._make_handle(addr=0, shape=out_shape, dtype=out_dtype) + out = self._make_compute_out(shape=out_shape, dtype=out_dtype) self._emit_dispatch_overhead() self._emit(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n)) return out @@ -170,7 +225,7 @@ class TLContext: # ── MATH Engine: unary (blocking) ───────────────────────────── def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle: - out = self._make_handle(addr=0, shape=x.shape, dtype=x.dtype) + out = self._make_compute_out(shape=x.shape, dtype=x.dtype) self._emit_dispatch_overhead() self._emit(MathCmd(op=op, inputs=(x,), out=out)) return out @@ -203,7 +258,7 @@ class TLContext: ) -> TensorHandle: out_shape = list(x.shape) out_shape[axis] = 1 - out = self._make_handle(addr=0, shape=tuple(out_shape), dtype=x.dtype) + out = self._make_compute_out(shape=tuple(out_shape), dtype=x.dtype) self._emit_dispatch_overhead() self._emit(MathCmd(op=op, inputs=(x,), out=out, axis=axis)) return out @@ -222,7 +277,7 @@ class TLContext: def _binary_math( self, op: str, a: TensorHandle, b: TensorHandle, ) -> TensorHandle: - out = self._make_handle(addr=0, shape=a.shape, dtype=a.dtype) + out = self._make_compute_out(shape=a.shape, dtype=a.dtype) self._emit_dispatch_overhead() self._emit(MathCmd(op=op, inputs=(a, b), 
out=out)) return out @@ -230,15 +285,67 @@ class TLContext: def where( self, cond: TensorHandle, a: TensorHandle, b: TensorHandle, ) -> TensorHandle: - out = self._make_handle(addr=0, shape=a.shape, dtype=a.dtype) + out = self._make_compute_out(shape=a.shape, dtype=a.dtype) self._emit_dispatch_overhead() self._emit(MathCmd(op="where", inputs=(cond, a, b), out=out)) return out + def maximum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle: + """Element-wise max of two tensors (real Triton: tl.maximum).""" + return self._binary_math("maximum", a, b) + + def minimum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle: + """Element-wise min of two tensors (real Triton: tl.minimum).""" + return self._binary_math("minimum", a, b) + + def fma( + self, a: TensorHandle, b: TensorHandle, c: TensorHandle, + ) -> TensorHandle: + """Fused multiply-add: a * b + c (real Triton: tl.fma).""" + out = self._make_compute_out(shape=a.shape, dtype=a.dtype) + self._emit_dispatch_overhead() + self._emit(MathCmd(op="fma", inputs=(a, b, c), out=out)) + return out + + def clamp( + self, + x: TensorHandle, + min: TensorHandle, + max: TensorHandle, + ) -> TensorHandle: + """Clamp x to [min, max] (real Triton: tl.clamp).""" + out = self._make_compute_out(shape=x.shape, dtype=x.dtype) + self._emit_dispatch_overhead() + self._emit(MathCmd(op="clamp", inputs=(x, min, max), out=out)) + return out + + def softmax(self, x: TensorHandle, axis: int = -1) -> TensorHandle: + """Numerically-stable softmax along ``axis`` (real Triton: tl.softmax). + + Implemented as a single MathCmd (op="softmax") so timing accounts + for one MATH dispatch; Phase 2 DataExecutor expands it to the + canonical (x - max) → exp → sum → div sequence. + """ + out = self._make_compute_out(shape=x.shape, dtype=x.dtype) + self._emit_dispatch_overhead() + self._emit(MathCmd(op="softmax", inputs=(x,), out=out, axis=axis)) + return out + + # ── Scalar helpers (real Triton: tl.cdiv etc.) 
──────────────── + + @staticmethod + def cdiv(a: int, b: int) -> int: + """Ceiling division: (a + b - 1) // b (real Triton: tl.cdiv). + + Used by host/kernel grid math; not a tensor op, so no MathCmd + is emitted. Mirrors triton.cdiv. + """ + return -(-int(a) // int(b)) + # ── Index / Scalar (PE_CPU, no engine) ──────────────────────── def program_id(self, axis: int = 0) -> int: - """Return program instance index. + """Return program instance index (ADR-0022). axis=0: local PE id within cube. axis=1: cube id. @@ -248,7 +355,7 @@ class TLContext: return self._pe_id def num_programs(self, axis: int = 0) -> int: - """Return total number of program instances. + """Return total number of program instances (ADR-0022). axis=0: num PEs per cube. axis=1: num cubes. @@ -284,6 +391,119 @@ class TLContext: dtype=x.dtype, nbytes=x.nbytes, data=x.data, ) + # ── IPCQ (CCL) collective primitives (ADR-0023 D4) ──────────── + + def send( + self, + dir: str, + src: TensorHandle | None = None, + *, + src_addr: int | None = None, + nbytes: int | None = None, + shape: tuple[int, ...] | None = None, + dtype: str = "f16", + space: str = "tcm", + ) -> None: + """Send tensor data to the peer in the given direction. + + Two calling forms: + tl.send(dir, handle) # use handle's metadata + tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...) + + Blocking: returns when PE_IPCQ has accepted the request and + forwarded the IpcqDmaToken to PE_DMA. Backpressure may apply. 
+ """ + if src is not None: + src_addr = src.addr + nbytes = src.nbytes + shape = src.shape + dtype = src.dtype + space = getattr(src, "space", space) + if src_addr is None or nbytes is None or shape is None: + raise ValueError("tl.send: provide either a TensorHandle or src_addr/nbytes/shape") + self._emit_dispatch_overhead() + cmd = IpcqSendCmd( + direction=dir, + src_addr=src_addr, src_space=space, + nbytes=nbytes, shape=shape, dtype=dtype, + handle_id=self._next_handle_id(), + ) + self._emit(cmd) + + def recv( + self, + dir: str | None = None, + shape: tuple[int, ...] = (), + dtype: str = "f16", + space: str = "tcm", + dst_addr: int | None = None, + dst_space: str | None = None, + ) -> TensorHandle: + """Receive tensor data from a peer. + + Args: + dir: specific direction (e.g. "W"), or None for round-robin. + shape, dtype: expected tensor metadata. + dst_addr / dst_space: if both are provided, the slot data is + copied to (dst_space, dst_addr) before the handle is + returned ("copy_to_dst" mode). Otherwise the slot address + is returned directly ("return_slot" mode). + + Returns: + TensorHandle pointing to the slot (or dst) where the data has + arrived. In greenlet/runner mode, ``handle.data`` carries the + actual ndarray; in command-list mode the handle is a placeholder. 
+ """ + self._emit_dispatch_overhead() + if dst_addr is not None and dst_space is not None: + cmd = IpcqRecvCmd( + direction=dir, + shape=shape, dtype=dtype, + handle_id=self._next_handle_id(), + recv_mode="copy_to_dst", + dst_addr=dst_addr, dst_space=dst_space, + ) + else: + cmd = IpcqRecvCmd( + direction=dir, + shape=shape, dtype=dtype, + handle_id=self._next_handle_id(), + ) + result = self._emit(cmd) + if isinstance(result, dict): + slot_addr = int(result.get("src_addr", 0)) + slot_space = str(result.get("src_space", "tcm")) + data = result.get("data") + return TensorHandle( + id=self._next_handle_id(), + addr=slot_addr, + shape=shape, + dtype=dtype, + nbytes=self._nbytes(shape, dtype), + data=data, + space=slot_space, + ) + return self._make_handle(addr=0, shape=shape, dtype=dtype) + + def recv_async( + self, + dir: str, + shape: tuple[int, ...] = (), + dtype: str = "f16", + ) -> "RecvFuture": + """Non-blocking recv. Returns a future to pass into ``tl.wait``.""" + self._emit_dispatch_overhead() + cmd = IpcqRecvCmd( + direction=dir, + shape=shape, dtype=dtype, + handle_id=self._next_handle_id(), + blocking=False, + ) + future = RecvFuture(cmd=cmd) + if self._runner is not None: + self._runner.switch_to_simpy(("recv_async", future)) + return future + # ── Composite + Control ─────────────────────────────────────── def composite( @@ -316,9 +536,40 @@ class TLContext: )) return completion - def wait(self, handle: CompletionHandle | None = None) -> None: - """Wait for a specific composite or all pending composites.""" + def wait(self, handle: "CompletionHandle | RecvFuture | None" = None) -> Any: + """Wait for a composite, a recv future, or all pending composites. + + - ``CompletionHandle`` (or None): wait for composite completion. + - ``RecvFuture``: wait for a non-blocking ``recv_async`` to finish. + Returns the resolved ``TensorHandle``. 
+ """ + if isinstance(handle, RecvFuture): + if handle.resolved: + return handle.result + if self._runner is None: + raise RuntimeError( + "tl.wait(RecvFuture) requires runner mode (greenlet)" + ) + result_dict = self._runner.switch_to_simpy(("recv_wait", handle)) + slot_addr = int(result_dict.get("src_addr", 0)) + slot_space = str(result_dict.get("src_space", "tcm")) + data = result_dict.get("data") + th = TensorHandle( + id=self._next_handle_id(), + addr=slot_addr, + shape=handle.cmd.shape, + dtype=handle.cmd.dtype, + nbytes=self._nbytes(handle.cmd.shape, handle.cmd.dtype), + data=data, + space=slot_space, + ) + handle.resolved = True + handle.result = th + return th + + # Composite path (existing behaviour) self._emit(WaitCmd(handle=handle)) + return None def cycles(self, n: int) -> None: """Declare PE_CPU scalar execution overhead (cycles).""" diff --git a/tests/test_ccl_allreduce_matrix.py b/tests/test_ccl_allreduce_matrix.py new file mode 100644 index 0000000..e19c4ba --- /dev/null +++ b/tests/test_ccl_allreduce_matrix.py @@ -0,0 +1,142 @@ +"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench. + +Each parametrized case writes a tmp ``ccl.yaml`` overlay that selects a +specific (algorithm, world_size, buffer_kind, n_elem) combination, then +runs the bench via the CLI and asserts the printed line reports all +ranks OK. + +This single test file replaces the per-variant bench tests +(test_ccl_allreduce_e2e, test_ccl_mesh_allreduce, test_ccl_tree_allreduce, +test_ccl_multicube, test_ccl_multisip). 
+""" +from __future__ import annotations + +import os +import textwrap + +import pytest + +import kernbench.cli.main as cli_main + + +CCL_YAML_TEMPLATE = textwrap.dedent("""\ + defaults: + algorithm: {algorithm} + buffer_kind: {buffer_kind} + backpressure: sleep + n_slots: 4 + slot_size: 4096 + vc_chunk_size: 256 + ipcq_credit_size_bytes: 16 + + algorithms: + {algorithm}: + module: {module} + topology: {topology} + buffer_kind: {buffer_kind} +{world_size_line}{n_elem_line} +""") + + +def _write_ccl_yaml( + tmp_path, + *, + algorithm: str, + module: str, + topology: str, + buffer_kind: str = "tcm", + world_size: int | None = None, + n_elem: int | None = None, +) -> str: + """Write a tmp ccl.yaml in tmp_path and return its directory.""" + ws_line = f" world_size: {world_size}\n" if world_size is not None else "" + nel_line = f" n_elem: {n_elem}\n" if n_elem is not None else "" + body = CCL_YAML_TEMPLATE.format( + algorithm=algorithm, + module=module, + topology=topology, + buffer_kind=buffer_kind, + world_size_line=ws_line, + n_elem_line=nel_line, + ) + yaml_path = tmp_path / "ccl.yaml" + yaml_path.write_text(body) + return str(tmp_path) + + +CASES = [ + # algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws + pytest.param( + "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce", + "ring_1d", "tcm", None, 8, 256, + id="ring_full_system_tcm", + ), + pytest.param( + "ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce", + "ring_1d", "hbm", None, 8, 256, + id="ring_full_system_hbm", + ), + pytest.param( + "ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce", + "ring_1d", "sram", None, 8, 256, + id="ring_full_system_sram", + ), + pytest.param( + "ring_allreduce_8", "kernbench.ccl.algorithms.ring_allreduce", + "ring_1d", "tcm", 8, 32, 8, + id="ring_single_cube", + ), + pytest.param( + "ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce", + "ring_1d", "tcm", 16, 16, 16, + id="ring_multi_cube", + ), + 
pytest.param( + "mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce", + "mesh_2d", "tcm", 4, 16, 4, + id="mesh_2x2", + ), + pytest.param( + "tree_allreduce_7", "kernbench.ccl.algorithms.tree_allreduce", + "tree_binary", "tcm", 7, 16, 7, + id="tree_binary_7", + ), +] + + +@pytest.mark.parametrize( + "algorithm,module,topology,buffer_kind,world_size,n_elem,expected_ws", + CASES, +) +def test_ccl_allreduce_matrix( + tmp_path, capsys, monkeypatch, + algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws, +): + """Each (algorithm × buffer × world_size) combo passes through the + unified bench and yields all ranks OK.""" + project_root = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") + ) + yaml_dir = _write_ccl_yaml( + tmp_path, + algorithm=algorithm, + module=module, + topology=topology, + buffer_kind=buffer_kind, + world_size=world_size, + n_elem=n_elem, + ) + monkeypatch.chdir(yaml_dir) + rc = cli_main.main([ + "run", + "--topology", os.path.join(project_root, "topology.yaml"), + "--bench", "ccl_allreduce", + "--verify-data", + ]) + assert rc == 0 + out = capsys.readouterr().out + assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}" + assert f"{algorithm} (ws={expected_ws}): {expected_ws} OK" in out, ( + f"expected '{algorithm} (ws={expected_ws}): {expected_ws} OK' " + f"in output:\n{out}" + ) diff --git a/tests/test_ccl_deadlock_detection.py b/tests/test_ccl_deadlock_detection.py new file mode 100644 index 0000000..9dbb133 --- /dev/null +++ b/tests/test_ccl_deadlock_detection.py @@ -0,0 +1,125 @@ +"""Tests for IPCQ deadlock detection (ADR-0023 D14 F3).""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import pytest +import simpy + +from kernbench.ccl import diagnostics +from kernbench.common.ipcq_types import ( + IpcqEndpoint, + IpcqInitEntry, + IpcqRecvCmd, + IpcqRequest, +) +from kernbench.components.builtin.pe_ipcq import PeIpcqComponent +from 
kernbench.runtime_api.kernel import IpcqInitMsg +from kernbench.topology.types import Node + + +@dataclass +class _FakeTxn: + request: Any + done: simpy.Event + result_data: dict[str, Any] = field(default_factory=dict) + + +def _make_isolated_pe_ipcq(env): + node = Node( + id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq", + impl="builtin.pe_ipcq", attrs={}, pos_mm=None, + ) + comp = PeIpcqComponent(node, ctx=None) + comp.in_ports["host"] = simpy.Store(env) + comp.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env) + comp.start(env) + + peer_credit = simpy.Store(env) + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, buffer_kind="tcm", + rx_base_pa=0x10_000, rx_base_va=0, + n_slots=4, slot_size=4096, + ) + init_msg = IpcqInitMsg( + correlation_id="t", request_id="t", + target_sips=(0,), target_cubes=(0,), target_pe=0, + entries=(IpcqInitEntry( + direction="W", peer=ep, + my_rx_base_pa=0x40_000, my_rx_base_va=0, + n_slots=4, slot_size=4096, + peer_credit_store=peer_credit, + ),), + backpressure_mode="sleep", + buffer_kind="tcm", + credit_size_bytes=16, + ) + done = env.event() + comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=done)) + env.run(until=done) + return comp + + +def test_pointer_dump_includes_blocked_state(): + """A blocked recv should still be visible in the pointer dump.""" + env = simpy.Environment() + comp = _make_isolated_pe_ipcq(env) + + # Issue a recv that will block (no data has arrived) + recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r1") + req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(req) + env.run(until=10) + assert not req.done.triggered + + # Pointer dump should show my_tail=0 and peer_head_cache=0 + # We need to use the engine API but for an isolated component, just call directly + class FakeEngine: + _components = {"sip0.cube0.pe0.pe_ipcq": comp} + + dump = diagnostics.pointer_dump(FakeEngine()) + assert "my_tail=0" in dump + assert "peer_head_cache=0" in dump + + +def 
test_deadlock_detection_recv_without_send(): + """A recv with no matching sender → SimPy schedule empties → engine + raises ``IpcqDeadlock`` with a pointer dump. + """ + from kernbench.ccl.diagnostics import IpcqDeadlock + from kernbench.policy.placement.dp import DPPolicy + from kernbench.runtime_api.bench_runner import run_bench + from kernbench.runtime_api.types import resolve_device + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import resolve_topology + + def deadlock_kernel(t_ptr, n_elem, tl): + # Every PE just receives, no sends → no one delivers → deadlock + tl.recv(dir="W", shape=(n_elem,), dtype="f16") + + topo = resolve_topology("topology.yaml") + + def run(torch): + torch.install_ipcq( + algorithm="ring_allreduce_tcm", world_size_override=8, + ) + a = torch.zeros( + (1, 8 * 8), + dtype="f16", + dp=DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, + ), + name="dl_in", + ) + torch.launch("dl", deadlock_kernel, a, 8) + + with pytest.raises(IpcqDeadlock): + run_bench( + topology=topo, bench_fn=run, + device=resolve_device("all"), + engine_factory=lambda t, d: GraphEngine( + getattr(t, "topology_obj", t), enable_data=True + ), + ) diff --git a/tests/test_ccl_diagnostics.py b/tests/test_ccl_diagnostics.py new file mode 100644 index 0000000..5d949fb --- /dev/null +++ b/tests/test_ccl_diagnostics.py @@ -0,0 +1,70 @@ +"""Tests for CCL diagnostics: trace + pointer dump (ADR-0023 D14).""" +from __future__ import annotations + +import os + +from kernbench.ccl import diagnostics + + +# ── trace toggle ───────────────────────────────────────────────────── + + +def test_trace_disabled_by_default(monkeypatch): + monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False) + diagnostics.reload_trace_setting() + assert diagnostics.trace_enabled() is False + + +def test_trace_enabled_via_env(monkeypatch): + monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1") + diagnostics.reload_trace_setting() + 
assert diagnostics.trace_enabled() is True + + +def test_trace_record_send(monkeypatch, capsys): + monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1") + diagnostics.reload_trace_setting() + diagnostics.log_send(t_ns=100.0, sender="sip0.cube0.pe0", + direction="E", nbytes=64, sender_seq=0) + out = capsys.readouterr().out + assert "send" in out + assert "sip0.cube0.pe0" in out + assert "dir=E" in out + monkeypatch.delenv("KERNBENCH_CCL_TRACE") + diagnostics.reload_trace_setting() + + +def test_trace_record_recv(monkeypatch, capsys): + monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1") + diagnostics.reload_trace_setting() + diagnostics.log_recv(t_ns=200.0, receiver="sip0.cube0.pe1", + direction="W", nbytes=64) + out = capsys.readouterr().out + assert "recv" in out + assert "sip0.cube0.pe1" in out + monkeypatch.delenv("KERNBENCH_CCL_TRACE") + diagnostics.reload_trace_setting() + + +# ── pointer dump ──────────────────────────────────────────────────── + + +def test_pointer_dump_format(): + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import resolve_topology + from kernbench.ccl.install import ( + install_ipcq, load_ccl_config, resolve_algorithm_config, + ) + + topo = resolve_topology("topology.yaml").topology_obj + engine = GraphEngine(topo, enable_data=True) + cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm") + install_ipcq(engine, topo.spec, cfg) + + dump = diagnostics.pointer_dump(engine) + # 8 ranks × 2 directions = 16 lines (plus 8 PE headers) + assert "sip0.cube0.pe0" in dump + assert "E:" in dump + assert "W:" in dump + assert "my_head=" in dump + assert "peer_tail_cache=" in dump diff --git a/tests/test_ccl_framework.py b/tests/test_ccl_framework.py new file mode 100644 index 0000000..93dfa9c --- /dev/null +++ b/tests/test_ccl_framework.py @@ -0,0 +1,62 @@ +"""Tests for the torch.distributed-compat facade (ADR-0023 D11). 
+ +These tests verify the public API surface of ``DistributedContext`` + +``AhbmCCLBackend``. End-to-end correctness of the allreduce itself is +covered by tests/test_ccl_allreduce_matrix.py. +""" +from __future__ import annotations + +from kernbench.runtime_api.distributed import AhbmCCLBackend, DistributedContext + + +def test_init_process_group_requires_ctx_ref(): + """Using DistributedContext without RuntimeContext binding should fail.""" + dist = DistributedContext() + # Not bound to a RuntimeContext → init should raise. + try: + dist.init_process_group(backend="ahbm") + assert False, "expected RuntimeError" + except RuntimeError: + pass + + +def test_init_process_group_rejects_unknown_backend(): + """Unknown backend raises ValueError (matches pytorch behavior).""" + dist = DistributedContext() + dist._ctx_ref = object() # dummy; won't be reached before the check + try: + dist.init_process_group(backend="nccl") + assert False, "expected ValueError" + except ValueError: + pass + + +def test_distributed_pytorch_compat_surface(): + """DistributedContext only exposes real torch.distributed API names.""" + # Every public attribute should either be a real pytorch name or private. + allowed = { + "init_process_group", + "is_initialized", + "get_world_size", + "get_rank", + "get_backend", + "all_reduce", + "barrier", + } + dc = DistributedContext() + for attr in dir(dc): + if attr.startswith("_"): + continue + assert attr in allowed, ( + f"DistributedContext exposes non-pytorch API: {attr!r}" + ) + + +def test_backend_class_surface(): + """AhbmCCLBackend exposes only all_reduce + barrier + world_size.""" + # Ensure we don't accidentally leak internal method names. + public = {m for m in dir(AhbmCCLBackend) if not m.startswith("_")} + # Class must at minimum expose these. 
+ assert "all_reduce" in public + assert "barrier" in public + assert "world_size" in public diff --git a/tests/test_ccl_hello_world_guide.py b/tests/test_ccl_hello_world_guide.py new file mode 100644 index 0000000..68bc017 --- /dev/null +++ b/tests/test_ccl_hello_world_guide.py @@ -0,0 +1,81 @@ +"""Validate the hello-world example from docs/ccl-author-guide.md. + +This is the simplest possible CCL kernel — each PE sends its tile E +and receives a tile from W. After running, each rank's slice should +contain the data of the previous rank. +""" +from __future__ import annotations + +import numpy as np + +from kernbench.ccl.algorithms import hello_send +from kernbench.ccl.testing import run_kernel_in_mock + + +def test_hello_send_4_ranks_mock(): + n_elem = 8 + inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)] + + outputs = run_kernel_in_mock( + kernel_fn=hello_send.kernel, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem,), + ) + + # rank r should have rank (r-1) % 4's data + for r in range(4): + prev = inputs[(r - 1) % 4] + assert np.array_equal(outputs[r], prev), f"rank {r}: got {outputs[r]}" + + +def test_hello_send_via_simpy_runner(): + """Same but through real SimPy + IPCQ.""" + from kernbench.policy.placement.dp import DPPolicy + from kernbench.runtime_api.bench_runner import run_bench + from kernbench.runtime_api.types import resolve_device + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import resolve_topology + + topo = resolve_topology("topology.yaml") + n_elem = 8 + world_size = 8 + + def run(torch): + # World size for this hello test is 8 (one cube). ccl.yaml no + # longer carries a default world_size — pass it explicitly. 
+ plan = torch.install_ipcq( + algorithm="ring_allreduce_tcm", world_size_override=world_size, + ) + a = torch.zeros( + (1, world_size * n_elem), dtype="f16", + dp=DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, + ), + name="hello_in", + ) + store = torch.engine.memory_store + base = a._handle.va_base or a._handle.shards[0].pa + nbytes = n_elem * 2 + for r in range(world_size): + store.write("hbm", base + r * nbytes, + np.full((n_elem,), float(r + 1), dtype=np.float16)) + + torch.launch("hello_send", hello_send.kernel, a, n_elem) + + # Each rank should hold the previous rank's data after the round + for r in range(world_size): + arr = store.read("hbm", base + r * nbytes, shape=(n_elem,), dtype="f16") + prev_value = float(((r - 1) % world_size) + 1) + assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}" + + result = run_bench( + topology=topo, bench_fn=run, + device=resolve_device("all"), + engine_factory=lambda t, d: GraphEngine( + getattr(t, "topology_obj", t), enable_data=True + ), + ) + assert result.completion.ok diff --git a/tests/test_ccl_helpers.py b/tests/test_ccl_helpers.py new file mode 100644 index 0000000..a15cb9c --- /dev/null +++ b/tests/test_ccl_helpers.py @@ -0,0 +1,68 @@ +"""Tests for CCL algorithm-author helpers (ADR-0023 D15).""" +from __future__ import annotations + +import pytest + +from kernbench.ccl.helpers import ( + Chunk, + chunked, + ring_step, + tree_step, +) + + +# ── chunked ────────────────────────────────────────────────────────── + + +def test_chunked_basic(): + chunks = chunked(base_addr=0x1000, n_chunks=4, n_elem=64, dtype="f16") + assert len(chunks) == 4 + # Each chunk has 16 elements (64 / 4) + assert chunks[0] == Chunk(addr=0x1000, n_elem=16, nbytes=32) + assert chunks[1] == Chunk(addr=0x1020, n_elem=16, nbytes=32) + assert chunks[2] == Chunk(addr=0x1040, n_elem=16, nbytes=32) + assert chunks[3] == Chunk(addr=0x1060, n_elem=16, nbytes=32) + + +def 
test_chunked_f32(): + chunks = chunked(base_addr=0x100, n_chunks=2, n_elem=8, dtype="f32") + assert chunks[0].nbytes == 16 # 4 elem × 4 bytes + assert chunks[1].addr == 0x100 + 16 + + +def test_chunked_uneven_raises(): + with pytest.raises(ValueError): + chunked(base_addr=0x100, n_chunks=3, n_elem=10, dtype="f16") + + +# ── ring_step ──────────────────────────────────────────────────────── + + +def test_ring_step_4_ranks(): + # Standard reduce-scatter ring step: + # at step s, rank r sends chunk (r-s) and receives chunk (r-s-1) (mod ws) + assert ring_step(rank=0, step=0, world_size=4) == (0, 3) + assert ring_step(rank=0, step=1, world_size=4) == (3, 2) + assert ring_step(rank=1, step=0, world_size=4) == (1, 0) + assert ring_step(rank=2, step=0, world_size=4) == (2, 1) + + +# ── tree_step ──────────────────────────────────────────────────────── + + +def test_tree_step_root(): + info = tree_step(rank=0, world_size=7) + assert info["parent"] is None + assert info["children"] == [1, 2] + + +def test_tree_step_internal(): + info = tree_step(rank=1, world_size=7) + assert info["parent"] == 0 + assert info["children"] == [3, 4] + + +def test_tree_step_leaf(): + info = tree_step(rank=4, world_size=7) + assert info["parent"] == 1 + assert info["children"] == [] diff --git a/tests/test_ccl_install.py b/tests/test_ccl_install.py new file mode 100644 index 0000000..738611e --- /dev/null +++ b/tests/test_ccl_install.py @@ -0,0 +1,100 @@ +"""Tests for CCL backend install (ADR-0023 D10/D11).""" +from __future__ import annotations + +from kernbench.ccl.install import ( + install_ipcq, + linear_rank_to_pe, + load_ccl_config, + resolve_algorithm_config, +) +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + + +def _engine(): + topo = resolve_topology("topology.yaml").topology_obj + return GraphEngine(topo, enable_data=True), topo + + +def test_load_ccl_config(): + cfg = load_ccl_config() + assert "defaults" in cfg + assert
"algorithms" in cfg + + +def test_resolve_algorithm_config_default(): + cfg = load_ccl_config() + merged = resolve_algorithm_config(cfg) + assert merged["algorithm"] == cfg["defaults"]["algorithm"] + # ccl.yaml no longer carries defaults.world_size — backend derives + # it from topology.yaml at install time. The field may be absent or + # a positive override (verified per-test where install_ipcq is called). + assert "world_size" not in merged or merged["world_size"] >= 1 + + +def test_resolve_algorithm_config_override(): + cfg = load_ccl_config() + merged = resolve_algorithm_config(cfg, name="ring_allreduce_hbm") + assert merged["algorithm"] == "ring_allreduce_hbm" + assert merged["buffer_kind"] == "hbm" # algo override + # defaults still apply + assert merged["n_slots"] == cfg["defaults"]["n_slots"] + + +def test_linear_rank_to_pe(): + engine, topo = _engine() + spec = topo.spec + # Cube 0 of SIP 0 + assert linear_rank_to_pe(0, spec) == (0, 0, 0) + assert linear_rank_to_pe(7, spec) == (0, 0, 7) + # Should not exceed total PE count + pes_per_sip = ( + spec["sip"]["cube_mesh"]["w"] * spec["sip"]["cube_mesh"]["h"] + * spec["cube"]["pe_layout"]["pe_per_corner"] + * len(spec["cube"]["pe_layout"]["corners"]) + ) + sips = spec["system"]["sips"]["count"] + total = sips * pes_per_sip + assert total >= 8 + + +def test_install_ipcq_neighbors_correct(): + engine, topo = _engine() + cfg = load_ccl_config() + merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm") + # Force a single-cube 8-rank install for the assertions below.
+ merged["world_size"] = 8 + plan = install_ipcq(engine, topo.spec, merged) + + assert plan["world_size"] == 8 + assert plan["buffer_kind"] == "tcm" + + # Each rank should have E and W entries + for r, nbrs in plan["neighbor_table"].items(): + assert "E" in nbrs + assert "W" in nbrs + + # Inspect installed PE_IPCQ for rank 0 + ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"] + qp_e = ipcq.queue_pairs["E"] + qp_w = ipcq.queue_pairs["W"] + assert qp_e["peer"].pe == 1 # rank 0's E neighbor is rank 1 + assert qp_w["peer"].pe == 7 # rank 0's W neighbor is rank 7 + # rx_base addresses should be unique + assert qp_e["my_rx_base_pa"] != qp_w["my_rx_base_pa"] + + +def test_install_ipcq_credit_stores_wired(): + engine, topo = _engine() + cfg = load_ccl_config() + merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm") + merged["world_size"] = 8 + install_ipcq(engine, topo.spec, merged) + + # rank 0 (pe0) sending E goes to rank 1 (pe1) + # rank 0's peer_credit_store on E direction should equal rank 1's credit_inbox + pe0 = engine._components["sip0.cube0.pe0.pe_ipcq"] + pe1 = engine._components["sip0.cube0.pe1.pe_ipcq"] + + qp_e = pe0.queue_pairs["E"] + assert qp_e["peer_credit_store"] is pe1.credit_inbox diff --git a/tests/test_ccl_mock_runtime.py b/tests/test_ccl_mock_runtime.py new file mode 100644 index 0000000..b39f7a2 --- /dev/null +++ b/tests/test_ccl_mock_runtime.py @@ -0,0 +1,83 @@ +"""Tests for the mock CCL runtime (ADR-0023 D15).""" +from __future__ import annotations + +import numpy as np + +from kernbench.ccl.algorithms import ring_allreduce +from kernbench.ccl.testing import run_kernel_in_mock + + +def test_ring_allreduce_4_ranks(): + """Run the ring all-reduce kernel under the mock runtime, no SimPy.""" + n_elem = 8 + inputs = [ + np.full((n_elem,), float(r + 1), dtype=np.float16) + for r in range(4) + ] + expected = sum(inputs) # [10, 10, ..., 10] + + outputs = run_kernel_in_mock( + kernel_fn=ring_allreduce.kernel, + world_size=4, + 
topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem, 4), + ) + + assert len(outputs) == 4 + for r in range(4): + assert np.allclose(outputs[r], expected) + + +def test_ring_allreduce_8_ranks(): + n_elem = 16 + inputs = [ + np.full((n_elem,), float(r + 1), dtype=np.float16) + for r in range(8) + ] + expected = sum(inputs) # [36, 36, ...] + + outputs = run_kernel_in_mock( + kernel_fn=ring_allreduce.kernel, + world_size=8, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem, 8), + ) + for r in range(8): + assert np.allclose(outputs[r], expected) + + +def test_ring_allreduce_random_data(): + n_elem = 32 + rng = np.random.default_rng(42) + inputs = [rng.standard_normal(n_elem).astype(np.float16) for _ in range(4)] + expected = sum(inputs) + + outputs = run_kernel_in_mock( + kernel_fn=ring_allreduce.kernel, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem, 4), + ) + for r in range(4): + assert np.allclose(outputs[r], expected, rtol=1e-2, atol=1e-2) + + +def test_mock_runtime_invalid_direction_raises(): + """A kernel that uses an unsupported direction should raise.""" + import pytest + + def bad_kernel(t_ptr, n_elem, tl): + tl.send(dir="N", src_addr=0, nbytes=2, shape=(1,), dtype="f16", space="hbm") + + inputs = [np.array([1.0], dtype=np.float16) for _ in range(2)] + with pytest.raises(Exception): + run_kernel_in_mock( + kernel_fn=bad_kernel, + world_size=2, + topology="ring_1d", + inputs=inputs, + kernel_args=(1,), + ) diff --git a/tests/test_ccl_performance.py b/tests/test_ccl_performance.py new file mode 100644 index 0000000..05ba8a4 --- /dev/null +++ b/tests/test_ccl_performance.py @@ -0,0 +1,134 @@ +"""CCL performance validation tests (ADR-0023 D13 T5). + +Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench +under different ``ccl.yaml`` algorithm choices: + + - All buffer kinds finish in non-zero simulated time. + - Latency stays under a very loose 100 ms bound for small tiles.
+ +These are sanity checks on the model itself, not on absolute numbers. +""" +from __future__ import annotations + +import importlib +import os +from contextlib import contextmanager + +import pytest + +from kernbench.runtime_api.bench_runner import run_bench +from kernbench.runtime_api.types import resolve_device +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + + +def _engine_factory(topology, device): + return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True) + + +@contextmanager +def _ccl_yaml_override(algorithm: str, world_size: int | None = None): + """Write a tmp ccl.yaml that forces a specific algorithm + world_size.""" + import tempfile + entry_extra = f"\n world_size: {world_size}" if world_size is not None else "" + body = f""" +defaults: + algorithm: {algorithm} + buffer_kind: tcm + backpressure: sleep + n_slots: 4 + slot_size: 4096 + vc_chunk_size: 256 + ipcq_credit_size_bytes: 16 + +algorithms: + ring_allreduce_tcm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: tcm + ring_allreduce_hbm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: hbm + ring_allreduce_sram: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: sram{entry_extra if algorithm.startswith("ring") else ""} + {algorithm}: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: tcm{entry_extra} +""" if world_size is not None else f""" +defaults: + algorithm: {algorithm} + buffer_kind: tcm + backpressure: sleep + n_slots: 4 + slot_size: 4096 + vc_chunk_size: 256 + ipcq_credit_size_bytes: 16 + +algorithms: + ring_allreduce_tcm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: tcm + ring_allreduce_hbm: + module: kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: hbm + ring_allreduce_sram: + module: 
kernbench.ccl.algorithms.ring_allreduce + topology: ring_1d + buffer_kind: sram +""" + with tempfile.TemporaryDirectory() as tmp: + path = os.path.join(tmp, "ccl.yaml") + with open(path, "w") as f: + f.write(body) + old_cwd = os.getcwd() + os.chdir(tmp) + try: + yield path + finally: + os.chdir(old_cwd) + + +def _run_unified(algorithm: str, world_size: int | None = None) -> float: + """Run the unified ccl_allreduce bench under a ccl.yaml override, + return simulated kernel total_ns.""" + with _ccl_yaml_override(algorithm, world_size): + topo = resolve_topology( + os.path.join(os.path.dirname(__file__), "..", "topology.yaml") + ) + bench_mod = importlib.import_module("benches.ccl_allreduce") + result = run_bench( + topology=topo, bench_fn=bench_mod.run, + device=resolve_device("all"), + engine_factory=_engine_factory, + ) + assert result.completion.ok, f"{algorithm} did not complete" + last_kernel = None + for tr in (result.traces or []): + if tr.get("phase") == "kernel": + last_kernel = tr + assert last_kernel is not None, f"{algorithm} produced no kernel trace" + return float(last_kernel.get("total_ns", 0.0)) + + +@pytest.mark.parametrize("algorithm", [ + "ring_allreduce_tcm", + "ring_allreduce_hbm", + "ring_allreduce_sram", +]) +def test_ccl_latency_positive(algorithm): + """Every buffer kind must produce a positive simulated latency.""" + ns = _run_unified(algorithm) + assert ns > 0 + + +def test_ccl_latency_under_reasonable_bound(): + """Sanity bound: ring all-reduce (tile=32 f16) should finish in well + under 1 ms simulated. 
Way overhead-dominated for small tiles.""" + ns = _run_unified("ring_allreduce_tcm") + assert ns < 100_000_000 # < 100 ms simulated — very loose bound diff --git a/tests/test_ccl_round_robin_recv.py b/tests/test_ccl_round_robin_recv.py new file mode 100644 index 0000000..8b98dfa --- /dev/null +++ b/tests/test_ccl_round_robin_recv.py @@ -0,0 +1,48 @@ +"""Test that tl.recv() (no direction) works under the mock runtime +and the SimPy PE_IPCQ component (ADR-0023 D4 weak fairness).""" +from __future__ import annotations + +import numpy as np + +from kernbench.ccl.testing import run_kernel_in_mock + + +def kernel_round_robin(t_ptr, n_elem, tl): + """Each PE sends one tile E then receives N-1 tiles via round-robin. + Uses TensorHandle math (PE_MATH) so Phase 2 produces correct HBM + contents under SimPy + op_log replay.""" + rank = tl.program_id(axis=0) + world_size = tl.num_programs(axis=0) + nbytes = n_elem * 2 + + pe_addr = t_ptr + rank * nbytes + acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16") + current = acc + + for _step in range(world_size - 1): + tl.send(dir="E", src=current) + # No direction → round-robin + recv = tl.recv(shape=(n_elem,), dtype="f16") + acc = acc + recv + current = recv # forward W's tile to E next round + + tl.store(pe_addr, acc) + + +def test_round_robin_recv_mock_runtime(): + n_elem = 8 + inputs = [ + np.full((n_elem,), float(r + 1), dtype=np.float16) + for r in range(4) + ] + expected = sum(inputs) # [10,...] 
+ + outputs = run_kernel_in_mock( + kernel_fn=kernel_round_robin, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem,), + ) + for r in range(4): + assert np.allclose(outputs[r], expected) diff --git a/tests/test_ccl_strict_mode.py b/tests/test_ccl_strict_mode.py new file mode 100644 index 0000000..d145b02 --- /dev/null +++ b/tests/test_ccl_strict_mode.py @@ -0,0 +1,140 @@ +"""Tests for IPCQ strict shape/dtype validation (ADR-0023 D14 F2).""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import pytest +import simpy + +from kernbench.common.ipcq_types import ( + IpcqDmaToken, + IpcqEndpoint, + IpcqInitEntry, + IpcqInvalidDirection, + IpcqMetaArrival, + IpcqRecvCmd, + IpcqRequest, + IpcqSendCmd, +) +from kernbench.components.builtin.pe_ipcq import PeIpcqComponent +from kernbench.runtime_api.kernel import IpcqInitMsg +from kernbench.topology.types import Node + + +# ── helpers (smaller copy of test_pe_ipcq fixtures) ──────────────── + + +@dataclass +class _FakeTxn: + request: Any + done: simpy.Event + result_data: dict[str, Any] = field(default_factory=dict) + + +def _make(env, strict: bool = True): + node = Node( + id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq", + impl="builtin.pe_ipcq", + attrs={"strict_validation": strict}, + pos_mm=None, + ) + comp = PeIpcqComponent(node, ctx=None) + comp.in_ports["host"] = simpy.Store(env) + comp.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env) + comp.start(env) + + peer_credit = simpy.Store(env) + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, buffer_kind="tcm", + rx_base_pa=0x10_000, rx_base_va=0, + n_slots=4, slot_size=4096, + ) + init_msg = IpcqInitMsg( + correlation_id="t", request_id="t", + target_sips=(0,), target_cubes=(0,), target_pe=0, + entries=(IpcqInitEntry( + direction="W", peer=ep, + my_rx_base_pa=0x40_000, my_rx_base_va=0, + n_slots=4, slot_size=4096, + peer_credit_store=peer_credit, + ),), + backpressure_mode="sleep", + 
buffer_kind="tcm", + credit_size_bytes=16, + ) + done = env.event() + comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=done)) + env.run(until=done) + return comp + + +# ── F2 tests ───────────────────────────────────────────────────────── + + +def test_strict_mode_dtype_mismatch_raises(): + env = simpy.Environment() + comp = _make(env, strict=True) + + # Pre-arrive metadata with f32 dtype + fake_token = IpcqDmaToken( + src_addr=0, src_space="tcm", + dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"], + nbytes=64, handle_id="x", + shape=(8,), dtype="f32", # mismatched + sender_seq=0, + src_sip=0, src_cube=0, src_pe=1, src_direction="E", + ) + comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token)) + env.run(until=5) + + # recv expecting f16 → should raise on strict + recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r") + req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(req) + with pytest.raises(ValueError, match="dtype"): + env.run(until=req.done) + + +def test_strict_mode_shape_mismatch_raises(): + env = simpy.Environment() + comp = _make(env, strict=True) + + fake_token = IpcqDmaToken( + src_addr=0, src_space="tcm", + dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"], + nbytes=64, handle_id="x", + shape=(16,), dtype="f16", # wrong shape + sender_seq=0, + src_sip=0, src_cube=0, src_pe=1, src_direction="E", + ) + comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token)) + env.run(until=5) + + recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r") + req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(req) + with pytest.raises(ValueError, match="shape"): + env.run(until=req.done) + + +def test_non_strict_mode_silently_accepts(): + env = simpy.Environment() + comp = _make(env, strict=False) + + fake_token = IpcqDmaToken( + src_addr=0, src_space="tcm", + dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"], + 
nbytes=64, handle_id="x", + shape=(16,), dtype="f32", # both wrong + sender_seq=0, + src_sip=0, src_cube=0, src_pe=1, src_direction="E", + ) + comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token)) + env.run(until=5) + + recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r") + req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(req) + env.run(until=req.done) + assert req.done.triggered # no exception diff --git a/tests/test_ccl_topologies.py b/tests/test_ccl_topologies.py new file mode 100644 index 0000000..42dcd43 --- /dev/null +++ b/tests/test_ccl_topologies.py @@ -0,0 +1,164 @@ +"""Tests for CCL builtin topology generators (ADR-0023 D11).""" +import pytest + +from kernbench.ccl.topologies import ( + mesh_2d, + none, + resolve_topology, + ring_1d, + ring_1d_unidir, + tree_binary, +) + + +# ── ring_1d ────────────────────────────────────────────────────────── + + +def test_ring_1d_4_ranks(): + assert ring_1d(0, 4) == {"E": 1, "W": 3} + assert ring_1d(1, 4) == {"E": 2, "W": 0} + assert ring_1d(2, 4) == {"E": 3, "W": 1} + assert ring_1d(3, 4) == {"E": 0, "W": 2} + + +def test_ring_1d_2_ranks(): + assert ring_1d(0, 2) == {"E": 1, "W": 1} + assert ring_1d(1, 2) == {"E": 0, "W": 0} + + +# ── ring_1d_unidir ─────────────────────────────────────────────────── + + +def test_ring_1d_unidir(): + assert ring_1d_unidir(0, 4) == {"E": 1} + assert ring_1d_unidir(3, 4) == {"E": 0} + + +# ── mesh_2d ────────────────────────────────────────────────────────── + + +def test_mesh_2d_2x2(): + # 2x2 mesh: + # 0 1 + # 2 3 + assert mesh_2d(0, 4) == {"N": 2, "S": 2, "E": 1, "W": 1} + assert mesh_2d(1, 4) == {"N": 3, "S": 3, "E": 0, "W": 0} + assert mesh_2d(2, 4) == {"N": 0, "S": 0, "E": 3, "W": 3} + assert mesh_2d(3, 4) == {"N": 1, "S": 1, "E": 2, "W": 2} + + +def test_mesh_2d_4x4(): + # 4x4 mesh: rank = r*4 + c + n = mesh_2d(5, 16) # r=1, c=1 + assert n["N"] == 1 # ((1-1)%4)*4 + 1 + assert n["S"] == 9 # ((1+1)%4)*4 + 1 + assert 
n["W"] == 4 # 1*4 + (1-1)%4 + assert n["E"] == 6 # 1*4 + (1+1)%4 + + +def test_mesh_2d_non_square_raises(): + with pytest.raises(ValueError): + mesh_2d(0, 5) + + +# ── tree_binary ────────────────────────────────────────────────────── + + +def test_tree_binary_root(): + n = tree_binary(0, 7) + assert "parent" not in n + assert n["child_left"] == 1 + assert n["child_right"] == 2 + + +def test_tree_binary_internal(): + n = tree_binary(1, 7) + assert n["parent"] == 0 + assert n["child_left"] == 3 + assert n["child_right"] == 4 + + +def test_tree_binary_leaf(): + n = tree_binary(6, 7) + assert n["parent"] == 2 + assert "child_left" not in n + assert "child_right" not in n + + +# ── none ───────────────────────────────────────────────────────────── + + +def test_none_returns_empty(): + assert none(0, 4) == {} + assert none(3, 7) == {} + + +# ── resolve_topology ───────────────────────────────────────────────── + + +def test_resolve_topology_builtin(): + fn = resolve_topology("ring_1d") + assert fn(0, 4) == {"E": 1, "W": 3} + + +def test_resolve_topology_unknown_raises(): + with pytest.raises(ValueError): + resolve_topology("nonsense") + + +def test_resolve_topology_with_neighbors_override_pattern_a(): + """Algorithm module with neighbors() that mutates builtin map.""" + class FakeModule: + @staticmethod + def neighbors(rank, world_size, neighbor_map): + if rank % 2 == 1: + neighbor_map.pop("W", None) + return neighbor_map + + fn = resolve_topology("ring_1d", algo_module=FakeModule) + assert fn(0, 4) == {"E": 1, "W": 3} + assert fn(1, 4) == {"E": 2} # W removed + + +def test_resolve_topology_with_neighbors_override_pattern_b(): + """Algorithm module with neighbors() that returns brand-new dict.""" + class FakeModule: + @staticmethod + def neighbors(rank, world_size, neighbor_map): + return {"E": (rank + 2) % world_size} + + fn = resolve_topology("ring_1d", algo_module=FakeModule) + assert fn(0, 4) == {"E": 2} + assert fn(3, 4) == {"E": 1} + + +def 
test_resolve_topology_with_neighbors_override_pattern_c_none(): + """Algorithm module's neighbors() returns None → builtin used as-is.""" + class FakeModule: + @staticmethod + def neighbors(rank, world_size, neighbor_map): + return None + + fn = resolve_topology("ring_1d", algo_module=FakeModule) + assert fn(0, 4) == {"E": 1, "W": 3} + + +def test_resolve_topology_none_with_neighbors_override(): + """topology=none + custom neighbors() builds from scratch.""" + class FakeModule: + @staticmethod + def neighbors(rank, world_size, neighbor_map): + assert neighbor_map == {} # builtin returned empty + return {"E": (rank + 1) % world_size} + + fn = resolve_topology("none", algo_module=FakeModule) + assert fn(0, 4) == {"E": 1} + + +def test_resolve_topology_module_without_neighbors(): + """Algorithm module without neighbors() function works normally.""" + class FakeModule: + pass # no neighbors attribute + + fn = resolve_topology("ring_1d", algo_module=FakeModule) + assert fn(0, 4) == {"E": 1, "W": 3} diff --git a/tests/test_cross_sip_routing.py b/tests/test_cross_sip_routing.py new file mode 100644 index 0000000..4d0ba86 --- /dev/null +++ b/tests/test_cross_sip_routing.py @@ -0,0 +1,73 @@ +"""Cross-SIP PE_DMA routing tests (ADR-0023, topology v2). + +Verifies that PE_DMA in one SIP can route to PE_DMA in another SIP via +the bidirectional pcie_ep ↔ fabric.switch0 path. Required for IPCQ +multi-SIP collectives. 
+""" +from __future__ import annotations + +import pytest + +from kernbench.policy.routing.router import PathRouter, RoutingError +from kernbench.topology.builder import resolve_topology + + +def _topo(): + return resolve_topology("topology.yaml").topology_obj + + +# ── New edge ──────────────────────────────────────────────────────── + + +def test_pcie_ep_to_switch_edge_exists(): + """The reverse pcie_ep → switch edge must exist for outbound traffic.""" + topo = _topo() + pairs = {(e.src, e.dst) for e in topo.edges} + assert ("sip0.io0.pcie_ep", "fabric.switch0") in pairs + assert ("sip1.io0.pcie_ep", "fabric.switch0") in pairs + + +def test_existing_switch_to_pcie_ep_still_present(): + """Host→device path must remain intact (regression).""" + topo = _topo() + pairs = {(e.src, e.dst) for e in topo.edges} + assert ("fabric.switch0", "sip0.io0.pcie_ep") in pairs + assert ("fabric.switch0", "sip1.io0.pcie_ep") in pairs + + +# ── Cross-SIP path ────────────────────────────────────────────────── + + +def test_router_finds_cross_sip_pe_dma_path(): + topo = _topo() + r = PathRouter(topo) + path = r.find_path("sip0.cube0.pe0", "sip1.cube0.pe0.pe_dma") + assert len(path) > 0 + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert path[-1] == "sip1.cube0.pe0.pe_dma" + assert "fabric.switch0" in path + + +def test_router_finds_cross_sip_far_pe_path(): + """Last cube of sip0 → first cube of sip1.""" + topo = _topo() + r = PathRouter(topo) + path = r.find_path("sip0.cube15.pe7", "sip1.cube0.pe0.pe_dma") + assert "fabric.switch0" in path + + +# ── Regression: intra-SIP routing unchanged ───────────────────────── + + +def test_router_intra_sip_path_unchanged(): + topo = _topo() + r = PathRouter(topo) + path = r.find_path("sip0.cube0.pe0", "sip0.cube0.pe1.pe_dma") + assert "fabric.switch0" not in path # should not detour through switch + + +def test_router_intra_cube_path_unchanged(): + topo = _topo() + r = PathRouter(topo) + path = r.find_path("sip0.cube0.pe0", 
"sip0.cube0.hbm_ctrl") + assert "fabric.switch0" not in path diff --git a/tests/test_data_executor.py b/tests/test_data_executor.py index 0204ba8..02bb022 100644 --- a/tests/test_data_executor.py +++ b/tests/test_data_executor.py @@ -58,6 +58,69 @@ def test_math_exp(): assert np.allclose(result, np.exp(x)) +def test_math_extra_ops(): + """Phase 2 replay of tl.maximum/minimum/fma/clamp/softmax.""" + store = MemoryStore() + a = np.array([1.0, 5.0, 3.0], dtype=np.float32) + b = np.array([4.0, 2.0, 6.0], dtype=np.float32) + c = np.array([0.5, 0.5, 0.5], dtype=np.float32) + store.write("tcm", 0x0, a) + store.write("tcm", 0x100, b) + store.write("tcm", 0x200, c) + + def _math(name, op, dst, inputs, axis=None): + return OpRecord( + t_start=float(dst), t_end=float(dst) + 1.0, + component_id="pe_math", op_kind="math", op_name=name, + params={ + "op": op, + "input_addrs": [a for a, _ in inputs], + "input_shapes": [s for _, s in inputs], + "input_spaces": ["tcm"] * len(inputs), + "input_dtypes": ["f32"] * len(inputs), + "dst_addr": dst, "dst_space": "tcm", + "shape_out": (3,), "dtype": "f32", "axis": axis, + }, + ) + + ops = [ + _math("maximum", "maximum", 0x300, [(0x0, (3,)), (0x100, (3,))]), + _math("minimum", "minimum", 0x400, [(0x0, (3,)), (0x100, (3,))]), + _math("fma", "fma", 0x500, [(0x0, (3,)), (0x100, (3,)), (0x200, (3,))]), + _math("clamp", "clamp", 0x600, [(0x0, (3,)), (0x200, (3,)), (0x100, (3,))]), + ] + DataExecutor(ops, store).run() + + assert np.array_equal(store.read("tcm", 0x300), np.maximum(a, b)) + assert np.array_equal(store.read("tcm", 0x400), np.minimum(a, b)) + assert np.array_equal(store.read("tcm", 0x500), a * b + c) + assert np.array_equal( + store.read("tcm", 0x600), np.minimum(np.maximum(a, c), b) + ) + + +def test_math_softmax(): + store = MemoryStore() + x = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]], dtype=np.float32) + store.write("tcm", 0x0, x) + + op = OpRecord( + t_start=0.0, t_end=1.0, + component_id="pe_math", op_kind="math", 
op_name="softmax", + params={ + "op": "softmax", + "input_addrs": [0x0], "input_shapes": [(2, 3)], + "input_spaces": ["tcm"], "input_dtypes": ["f32"], + "dst_addr": 0x100, "dst_space": "tcm", + "shape_out": (2, 3), "dtype": "f32", "axis": -1, + }, + ) + DataExecutor([op], store).run() + expected = np.exp(x - x.max(axis=-1, keepdims=True)) + expected /= expected.sum(axis=-1, keepdims=True) + assert np.allclose(store.read("tcm", 0x100), expected) + + def test_math_add(): store = MemoryStore() a = np.array([1.0, 2.0], dtype=np.float32) diff --git a/tests/test_ipcq_types.py b/tests/test_ipcq_types.py new file mode 100644 index 0000000..647837b --- /dev/null +++ b/tests/test_ipcq_types.py @@ -0,0 +1,169 @@ +"""Tests for IPCQ type schemas (ADR-0023 D2.5, D12, D14 F1).""" +import pytest + +from kernbench.common.ipcq_types import ( + IpcqCreditMetadata, + IpcqDmaToken, + IpcqEndpoint, + IpcqInitEntry, + IpcqInvalidDirection, + IpcqMetaArrival, + IpcqRecvCmd, + IpcqSendCmd, +) +from kernbench.runtime_api.kernel import IpcqInitMsg + + +# ── IpcqEndpoint ───────────────────────────────────────────────────── + + +def test_ipcq_endpoint_basic(): + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, + buffer_kind="tcm", + rx_base_pa=0x1000, rx_base_va=0, + n_slots=8, slot_size=4096, + ) + assert ep.sip == 0 + assert ep.buffer_kind == "tcm" + assert ep.n_slots == 8 + + +def test_ipcq_endpoint_frozen(): + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, buffer_kind="tcm", + rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096, + ) + with pytest.raises(Exception): # FrozenInstanceError + ep.sip = 99 # type: ignore + + +# ── IpcqDmaToken ───────────────────────────────────────────────────── + + +def test_ipcq_dma_token(): + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, buffer_kind="tcm", + rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096, + ) + tok = IpcqDmaToken( + src_addr=0x500, src_space="tcm", + dst_addr=0x1000, dst_endpoint=ep, + nbytes=128, handle_id="h1", + sender_seq=0, + 
src_sip=0, src_cube=0, src_pe=0, src_direction="E", + ) + assert tok.nbytes == 128 + assert tok.dst_endpoint.buffer_kind == "tcm" + assert tok.data_op is True + + +# ── IpcqCreditMetadata ─────────────────────────────────────────────── + + +def test_ipcq_credit_metadata(): + cm = IpcqCreditMetadata( + consumer_seq=3, src_sip=0, src_cube=0, src_pe=1, src_direction="W", + ) + assert cm.consumer_seq == 3 + assert cm.src_direction == "W" + + +def test_ipcq_credit_metadata_frozen(): + cm = IpcqCreditMetadata( + consumer_seq=3, src_sip=0, src_cube=0, src_pe=1, src_direction="W", + ) + with pytest.raises(Exception): + cm.consumer_seq = 99 # type: ignore + + +# ── IpcqMetaArrival ────────────────────────────────────────────────── + + +def test_ipcq_meta_arrival(): + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, buffer_kind="tcm", + rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096, + ) + tok = IpcqDmaToken( + src_addr=0x500, src_space="tcm", + dst_addr=0x1000, dst_endpoint=ep, + nbytes=128, handle_id="h1", + sender_seq=0, + src_sip=0, src_cube=0, src_pe=0, src_direction="E", + ) + ma = IpcqMetaArrival(token=tok) + assert ma.token.sender_seq == 0 + assert ma.token.src_direction == "E" + + +# ── IpcqSendCmd / IpcqRecvCmd ──────────────────────────────────────── + + +def test_ipcq_send_cmd(): + cmd = IpcqSendCmd( + direction="E", src_addr=0x100, src_space="tcm", + nbytes=64, shape=(8, 8), dtype="f16", handle_id="s1", + ) + assert cmd.direction == "E" + assert cmd.data_op is True + + +def test_ipcq_recv_cmd_default_return_slot(): + cmd = IpcqRecvCmd(direction="W", shape=(8, 8), dtype="f16", handle_id="r1") + assert cmd.recv_mode == "return_slot" + assert cmd.dst_addr == 0 + + +def test_ipcq_recv_cmd_round_robin(): + cmd = IpcqRecvCmd(direction=None, shape=(8, 8), dtype="f16", handle_id="r2") + assert cmd.direction is None + + +def test_ipcq_recv_cmd_copy_to_dst(): + cmd = IpcqRecvCmd( + direction="W", recv_mode="copy_to_dst", + dst_addr=0x2000, dst_space="hbm", + 
shape=(8, 8), dtype="f16", handle_id="r3", + ) + assert cmd.recv_mode == "copy_to_dst" + assert cmd.dst_addr == 0x2000 + + +# ── IpcqInvalidDirection ───────────────────────────────────────────── + + +def test_ipcq_invalid_direction(): + with pytest.raises(IpcqInvalidDirection): + raise IpcqInvalidDirection("direction 'X' not installed") + + +# ── IpcqInitEntry / IpcqInitMsg ────────────────────────────────────── + + +def test_ipcq_init_entry_and_msg(): + import simpy + env = simpy.Environment() + credit_store = simpy.Store(env) + + ep = IpcqEndpoint( + sip=0, cube=0, pe=1, buffer_kind="tcm", + rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096, + ) + entry = IpcqInitEntry( + direction="E", peer=ep, + my_rx_base_pa=0x2000, my_rx_base_va=0, + n_slots=8, slot_size=4096, + peer_credit_store=credit_store, + ) + msg = IpcqInitMsg( + correlation_id="c1", request_id="r1", + target_sips=(0,), target_cubes=(0,), target_pe=0, + entries=(entry,), + backpressure_mode="sleep", + buffer_kind="tcm", + credit_size_bytes=16, + ) + assert msg.entries[0].direction == "E" + assert msg.entries[0].peer.sip == 0 + assert msg.credit_size_bytes == 16 diff --git a/tests/test_pe_dma_ipcq.py b/tests/test_pe_dma_ipcq.py new file mode 100644 index 0000000..7ce30da --- /dev/null +++ b/tests/test_pe_dma_ipcq.py @@ -0,0 +1,206 @@ +"""Tests for PE_DMA IPCQ handling (ADR-0023 D8 + D9 atomic). + +PE_DMA gains two new behaviors: + 1. Outbound: when it receives an IpcqDmaToken from local PE_IPCQ, it + forwards it through the fabric (next-hop port) toward the peer + PE_DMA. + 2. Inbound: when it receives a Transaction wrapping an IpcqDmaToken, + it performs MemoryStore.write at dst_endpoint.buffer_kind/dst_addr + and forwards IpcqMetaArrival(token) to local PE_IPCQ — both in the + SAME SimPy step (I6 MUST). 
+""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +import simpy + +from kernbench.common.ipcq_types import ( + IpcqDmaToken, + IpcqEndpoint, + IpcqMetaArrival, +) +from kernbench.components.builtin.pe_dma import PeDmaComponent +from kernbench.sim_engine.memory_store import MemoryStore +from kernbench.sim_engine.transaction import Transaction +from kernbench.topology.types import Node + + +# ── Mock context ───────────────────────────────────────────────────── + + +@dataclass +class _MockResolver: + pass + + +@dataclass +class _MockRouter: + """Returns a fixed two-hop path for any (src, dst).""" + + def find_path(self, src: str, dst: str) -> list[str]: + return [src, "fake_router", dst] + + +@dataclass +class _MockCtx: + router: Any = field(default_factory=_MockRouter) + resolver: Any = field(default_factory=_MockResolver) + memory_store: Any = None + edge_map: dict = field(default_factory=dict) + spec: dict = field(default_factory=dict) + op_logger: Any = None + + def compute_drain_ns(self, path: list[str], nbytes: int) -> float: + return 0.0 + + def get_shared_resource(self, env, key, capacity=1): + return simpy.Resource(env, capacity=capacity) + + +def _make_pe_dma( + env: simpy.Environment, pe_prefix: str, store: MemoryStore | None = None, +) -> PeDmaComponent: + node = Node( + id=f"{pe_prefix}.pe_dma", + kind="pe_dma", + impl="builtin.pe_dma", + attrs={}, + pos_mm=None, + ) + ctx = _MockCtx(memory_store=store) + comp = PeDmaComponent(node, ctx=ctx) + comp.in_ports["host"] = simpy.Store(env) + comp.out_ports["fake_router"] = simpy.Store(env) + comp.out_ports[f"{pe_prefix}.pe_ipcq"] = simpy.Store(env) + comp.start(env) + return comp + + +def _make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm") -> IpcqEndpoint: + return IpcqEndpoint( + sip=sip, cube=cube, pe=pe, + buffer_kind=buffer_kind, + rx_base_pa=0x10_000, rx_base_va=0, + n_slots=4, slot_size=4096, + ) + + +# ── Outbound: 
PE_IPCQ → PE_DMA → fabric ────────────────────────────── + + +def test_outbound_forwards_token_through_fabric(): + env = simpy.Environment() + store = MemoryStore() + src_arr = np.arange(16, dtype=np.float16) + store.write("tcm", 0x500, src_arr) + + src = _make_pe_dma(env, "sip0.cube0.pe0", store=store) + + peer = _make_endpoint(pe=1) + token = IpcqDmaToken( + src_addr=0x500, src_space="tcm", + dst_addr=0x10_000, dst_endpoint=peer, + nbytes=32, handle_id="t1", + shape=(16,), dtype="f16", + sender_seq=0, + src_sip=0, src_cube=0, src_pe=0, src_direction="E", + ) + src.in_ports["host"].put(token) + env.run(until=10) + + # The token should be wrapped in a Transaction and forwarded to "fake_router" + fab = src.out_ports["fake_router"] + assert len(fab.items) == 1 + txn = fab.items[0] + assert isinstance(txn, Transaction) + assert isinstance(txn.request, IpcqDmaToken) + assert txn.request.dst_addr == 0x10_000 + + +# ── Inbound: PE_DMA → MemoryStore.write + IpcqMetaArrival forward ─── + + +def test_inbound_writes_memory_and_forwards_metadata_atomically(): + env = simpy.Environment() + store = MemoryStore() + + # Sender wrote source data to MemoryStore + src_arr = np.arange(16, dtype=np.float16) + 100 + store.write("tcm", 0x500, src_arr) + + dst = _make_pe_dma(env, "sip0.cube0.pe1", store=store) + + peer = _make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm") + token = IpcqDmaToken( + src_addr=0x500, src_space="tcm", + dst_addr=0x10_000, dst_endpoint=peer, + nbytes=32, handle_id="t1", + shape=(16,), dtype="f16", + sender_seq=0, + src_sip=0, src_cube=0, src_pe=0, src_direction="E", + ) + + # Wrap in a Transaction with this PE_DMA as the terminal + done = env.event() + txn = Transaction( + request=token, path=["fake_router", "sip0.cube0.pe1.pe_dma"], + step=1, nbytes=32, done=done, + ) + dst.in_ports["host"].put(txn) + env.run(until=done) + + # 1. 
MemoryStore should have the data at dst_addr + arrived = store.read("tcm", 0x10_000, shape=(16,), dtype="f16") + assert np.array_equal(arrived, src_arr) + + # 2. IpcqMetaArrival should be in PE_IPCQ port + ipcq_port = dst.out_ports["sip0.cube0.pe1.pe_ipcq"] + assert len(ipcq_port.items) == 1 + arrival = ipcq_port.items[0] + assert isinstance(arrival, IpcqMetaArrival) + assert arrival.token.sender_seq == 0 + assert arrival.token.src_pe == 0 + + +def test_inbound_no_yield_between_write_and_metadata_forward(): + """Soft check: when multiple inbound IPCQ tokens arrive, the order of + MemoryStore writes and IpcqMetaArrival forwards is preserved (no + interleaving from extraneous yields). + """ + env = simpy.Environment() + store = MemoryStore() + + for i in range(3): + store.write("tcm", 0x500 + i * 0x100, np.arange(8, dtype=np.float16) + i * 10) + + dst = _make_pe_dma(env, "sip0.cube0.pe1", store=store) + peer = _make_endpoint(sip=0, cube=0, pe=1) + + for i in range(3): + token = IpcqDmaToken( + src_addr=0x500 + i * 0x100, src_space="tcm", + dst_addr=0x10_000 + i * 0x100, dst_endpoint=peer, + nbytes=16, handle_id=f"t{i}", + shape=(8,), dtype="f16", + sender_seq=i, + src_sip=0, src_cube=0, src_pe=0, src_direction="E", + ) + done = env.event() + txn = Transaction( + request=token, path=["fake_router", "sip0.cube0.pe1.pe_dma"], + step=1, nbytes=16, done=done, + ) + dst.in_ports["host"].put(txn) + env.run(until=done) + + # Check ordering of arrivals + ipcq_port = dst.out_ports["sip0.cube0.pe1.pe_ipcq"] + arrivals = list(ipcq_port.items) + assert [a.token.sender_seq for a in arrivals] == [0, 1, 2] + # Memory must be in order + for i in range(3): + arr = store.read("tcm", 0x10_000 + i * 0x100, shape=(8,), dtype="f16") + assert arr[0] == i * 10 diff --git a/tests/test_pe_ipcq.py b/tests/test_pe_ipcq.py new file mode 100644 index 0000000..b339315 --- /dev/null +++ b/tests/test_pe_ipcq.py @@ -0,0 +1,317 @@ +"""Tests for PE_IPCQ component (ADR-0023 D1, D2, D9, D14). 
+ +These tests use a mock setup: PeIpcqComponent is instantiated directly, +its in_ports/out_ports are wired to plain SimPy Stores, and IpcqInitMsg +is delivered via a simple dummy transaction wrapper. PE_DMA is mocked +as a Store that we drain manually. +""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import pytest +import simpy + +from kernbench.common.ipcq_types import ( + IpcqCreditMetadata, + IpcqDmaToken, + IpcqEndpoint, + IpcqInitEntry, + IpcqInvalidDirection, + IpcqMetaArrival, + IpcqRecvCmd, + IpcqRequest, + IpcqSendCmd, +) +from kernbench.components.builtin.pe_ipcq import PeIpcqComponent +from kernbench.runtime_api.kernel import IpcqInitMsg +from kernbench.topology.types import Node + + +# ── Fakes / fixtures ───────────────────────────────────────────────── + + +@dataclass +class _FakeTxn: + request: Any + done: simpy.Event + result_data: dict[str, Any] = field(default_factory=dict) + + +def _make_pe_ipcq(env: simpy.Environment, pe_prefix: str = "sip0.cube0.pe0") -> PeIpcqComponent: + """Create a PeIpcqComponent with mocked ports. + + Returns the component with: + - in_ports["host"] for posting IpcqInitMsg / IpcqRequest + - out_ports[f"{pe_prefix}.pe_dma"] for outgoing IpcqDmaToken (drain manually) + - The component is started. + """ + node = Node( + id=f"{pe_prefix}.pe_ipcq", + kind="pe_ipcq", + impl="builtin.pe_ipcq", + attrs={}, + pos_mm=None, + ) + comp = PeIpcqComponent(node, ctx=None) + comp.in_ports["host"] = simpy.Store(env) + comp.out_ports[f"{pe_prefix}.pe_dma"] = simpy.Store(env) + comp.start(env) + return comp + + +def _install_two_neighbors(env: simpy.Environment, comp: PeIpcqComponent) -> tuple[simpy.Store, simpy.Store]: + """Install E and W neighbor entries with peer_credit_stores. + + Returns (peer_e_credit_store, peer_w_credit_store) — i.e. the stores + that the component will put credits into when it receives data.
+ """ + peer_e_credit = simpy.Store(env) + peer_w_credit = simpy.Store(env) + + ep_e = IpcqEndpoint( + sip=0, cube=0, pe=1, + buffer_kind="tcm", + rx_base_pa=0x10_000, rx_base_va=0, + n_slots=4, slot_size=4096, + ) + ep_w = IpcqEndpoint( + sip=0, cube=0, pe=2, + buffer_kind="tcm", + rx_base_pa=0x20_000, rx_base_va=0, + n_slots=4, slot_size=4096, + ) + init_msg = IpcqInitMsg( + correlation_id="t", request_id="t", + target_sips=(0,), target_cubes=(0,), target_pe=0, + entries=( + IpcqInitEntry( + direction="E", peer=ep_e, + my_rx_base_pa=0x30_000, my_rx_base_va=0, + n_slots=4, slot_size=4096, + peer_credit_store=peer_e_credit, + ), + IpcqInitEntry( + direction="W", peer=ep_w, + my_rx_base_pa=0x40_000, my_rx_base_va=0, + n_slots=4, slot_size=4096, + peer_credit_store=peer_w_credit, + ), + ), + backpressure_mode="sleep", + buffer_kind="tcm", + credit_size_bytes=16, + ) + done = env.event() + comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=done)) + env.run(until=done) + return peer_e_credit, peer_w_credit + + +# ── send: forward token to PE_DMA ──────────────────────────────────── + + +def test_send_forwards_token_to_pe_dma(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + pe_dma = comp.out_ports["sip0.cube0.pe0.pe_dma"] + + cmd = IpcqSendCmd( + direction="E", src_addr=0x500, src_space="tcm", + nbytes=128, shape=(8, 8), dtype="f16", handle_id="s1", + ) + done = env.event() + comp.in_ports["host"].put(IpcqRequest(command=cmd, done=done)) + env.run(until=done) + + # Token should be in PE_DMA's mock store + assert len(pe_dma.items) == 1 + token = pe_dma.items[0] + assert isinstance(token, IpcqDmaToken) + assert token.dst_addr == 0x10_000 # peer.rx_base_pa + 0 + assert token.nbytes == 128 + assert token.sender_seq == 0 + assert token.src_direction == "E" + + +def test_send_advances_my_head_and_slot_addresses(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + pe_dma = 
comp.out_ports["sip0.cube0.pe0.pe_dma"] + + for i in range(3): + cmd = IpcqSendCmd( + direction="E", src_addr=0x500 + i, + src_space="tcm", nbytes=64, + shape=(8,), dtype="f16", handle_id=f"s{i}", + ) + done = env.event() + comp.in_ports["host"].put(IpcqRequest(command=cmd, done=done)) + env.run(until=done) + + tokens = pe_dma.items + assert [t.sender_seq for t in tokens] == [0, 1, 2] + # slot addresses: peer.rx_base_pa (0x10_000) + i * slot_size (4096) + assert [t.dst_addr for t in tokens] == [0x10_000, 0x11_000, 0x12_000] + + +def test_send_invalid_direction_raises(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + + cmd = IpcqSendCmd( + direction="N", src_addr=0x100, src_space="tcm", + nbytes=64, shape=(8,), dtype="f16", handle_id="s_bad", + ) + done = env.event() + comp.in_ports["host"].put(IpcqRequest(command=cmd, done=done)) + + with pytest.raises(IpcqInvalidDirection): + env.run(until=done) + + +# ── recv: wait for data and return slot address ───────────────────── + + +def test_recv_waits_until_metadata_arrives(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + + recv_cmd = IpcqRecvCmd( + direction="W", shape=(8,), dtype="f16", handle_id="r1", + ) + recv_req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(recv_req) + + # Run a bit — recv should not complete yet (no data) + env.run(until=10) + assert not recv_req.done.triggered + + # Simulate metadata arrival from peer (W direction = sender pe=2) + fake_token = IpcqDmaToken( + src_addr=0, src_space="tcm", + dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"], + nbytes=64, handle_id="x", + shape=(8,), dtype="f16", + sender_seq=0, + src_sip=0, src_cube=0, src_pe=2, src_direction="E", + ) + comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token)) + env.run(until=recv_req.done) + + assert recv_req.result_data["src_addr"] == 0x40_000 # my_rx_base_pa for W + assert 
recv_req.result_data["direction"] == "W" + + +def test_recv_returns_immediately_if_data_already_present(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + + # Pre-arrive metadata + fake_token = IpcqDmaToken( + src_addr=0, src_space="tcm", + dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"], + nbytes=64, handle_id="x", + shape=(8,), dtype="f16", + sender_seq=0, + src_sip=0, src_cube=0, src_pe=2, src_direction="E", + ) + comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token)) + env.run(until=5) + + recv_cmd = IpcqRecvCmd( + direction="W", shape=(8,), dtype="f16", handle_id="r1", + ) + recv_req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(recv_req) + env.run(until=recv_req.done) + + assert recv_req.result_data["src_addr"] == 0x40_000 + + +def test_recv_round_robin_picks_arrived_direction(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + + # Pre-arrive metadata only on W direction + fake_token = IpcqDmaToken( + src_addr=0, src_space="tcm", + dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"], + nbytes=64, handle_id="x", + shape=(8,), dtype="f16", + sender_seq=0, + src_sip=0, src_cube=0, src_pe=2, src_direction="E", + ) + comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token)) + env.run(until=5) + + # recv() with no direction → round-robin + recv_cmd = IpcqRecvCmd( + direction=None, shape=(8,), dtype="f16", handle_id="r_rr", + ) + recv_req = IpcqRequest(command=recv_cmd, done=env.event()) + comp.in_ports["host"].put(recv_req) + env.run(until=recv_req.done) + + assert recv_req.result_data["direction"] == "W" + + +# ── backpressure: send blocks when full ────────────────────────────── + + +def test_send_blocks_when_peer_slot_full(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + + # n_slots = 4, so 4 sends should succeed; 5th blocks + for i in range(4): + cmd = 
IpcqSendCmd( + direction="E", src_addr=0x500, src_space="tcm", + nbytes=64, shape=(8,), dtype="f16", handle_id=f"s{i}", + ) + done = env.event() + comp.in_ports["host"].put(IpcqRequest(command=cmd, done=done)) + env.run(until=done) + + # 5th send: should not complete + cmd5 = IpcqSendCmd( + direction="E", src_addr=0x500, src_space="tcm", + nbytes=64, shape=(8,), dtype="f16", handle_id="s5", + ) + req5 = IpcqRequest(command=cmd5, done=env.event()) + comp.in_ports["host"].put(req5) + env.run(until=20) + assert not req5.done.triggered + + # Send a credit return: peer (E direction, pe=1) consumed slot 0 + credit = IpcqCreditMetadata( + consumer_seq=1, # peer consumed up to my_tail=1 + src_sip=0, src_cube=0, src_pe=1, src_direction="W", # peer's view + ) + comp.credit_inbox.put(credit) + env.run(until=req5.done) + assert req5.done.triggered + + +# ── Init test ──────────────────────────────────────────────────────── + + +def test_init_installs_neighbors(): + env = simpy.Environment() + comp = _make_pe_ipcq(env) + _install_two_neighbors(env, comp) + + assert "E" in comp._queue_pairs + assert "W" in comp._queue_pairs + assert comp._queue_pairs["E"]["peer"].pe == 1 + assert comp._queue_pairs["W"]["peer"].pe == 2 + assert comp._queue_pairs["E"]["my_head"] == 0 + assert comp._queue_pairs["E"]["peer_tail_cache"] == 0 diff --git a/tests/test_recv_copy_to_dst.py b/tests/test_recv_copy_to_dst.py new file mode 100644 index 0000000..c4388dc --- /dev/null +++ b/tests/test_recv_copy_to_dst.py @@ -0,0 +1,80 @@ +"""Tests for recv_mode='copy_to_dst' (ADR-0023 D9.5).""" +from __future__ import annotations + +import numpy as np + + +def test_recv_copy_to_dst_via_simpy_runner(): + """Run a kernel that uses tl.recv(..., dst_addr=..., dst_space=...). + Verify the data is moved to the dst location after recv. 
+ """ + import importlib + + from kernbench.policy.placement.dp import DPPolicy + from kernbench.runtime_api.bench_runner import run_bench + from kernbench.runtime_api.types import resolve_device + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import resolve_topology + from kernbench.common.pe_commands import TensorHandle + + def kernel(t_ptr, n_elem, dst_buf_addr, tl): + rank = tl.program_id(axis=0) + ws = tl.num_programs(axis=0) + nbytes = n_elem * 2 + # Each PE sends own data, then recv into a custom dst slot + current = TensorHandle( + id="loc", addr=t_ptr + rank * nbytes, + shape=(n_elem,), dtype="f16", + nbytes=nbytes, data=None, space="hbm", + ) + tl.send(dir="E", src=current) + # copy_to_dst: move into a per-rank scratch HBM addr + recv = tl.recv( + dir="W", shape=(n_elem,), dtype="f16", + dst_addr=dst_buf_addr + rank * nbytes, + dst_space="hbm", + ) + # Sanity: recv handle should now point to our dst addr + assert recv.addr == dst_buf_addr + rank * nbytes + assert recv.space == "hbm" + + topo = resolve_topology("topology.yaml") + + def run(torch): + plan = torch.install_ipcq( + algorithm="ring_allreduce_tcm", world_size_override=8, + ) + a = torch.zeros( + (1, 8 * 8), + dtype="f16", + dp=DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, + ), + name="copy_in", + ) + store = torch.engine.memory_store + base = a._handle.va_base or a._handle.shards[0].pa + nbytes = 8 * 2 + for r in range(8): + store.write("hbm", base + r * nbytes, + np.full((8,), float(r + 1), dtype=np.float16)) + + # Use a separate dst region (synthetic addresses) + dst_buf = 0xC0FFEE_0000 + torch.launch("ring_allreduce_tcm", kernel, a, 8, dst_buf) + + # After the kernel, dst_buf + r*16 should contain rank (r-1)%8's data + for r in range(8): + arr = store.read("hbm", dst_buf + r * nbytes, shape=(8,), dtype="f16") + expected = float(((r - 1) % 8) + 1) + assert np.allclose(arr, expected), f"rank {r}: got {arr}, 
expected {expected}" + + result = run_bench( + topology=topo, bench_fn=run, + device=resolve_device("all"), + engine_factory=lambda t, d: GraphEngine( + getattr(t, "topology_obj", t), enable_data=True + ), + ) + assert result.completion.ok diff --git a/tests/test_runtime_api_tensor.py b/tests/test_runtime_api_tensor.py new file mode 100644 index 0000000..b06eac0 --- /dev/null +++ b/tests/test_runtime_api_tensor.py @@ -0,0 +1,136 @@ +"""Tests for the pytorch-compat Tensor API extensions. + +Covers the new ``torch.from_numpy`` factory and ``Tensor.numpy``, +``Tensor.copy_`` methods used by the unified ``ccl_allreduce`` bench. +""" +from __future__ import annotations + +import numpy as np +import pytest + +from kernbench.policy.placement.dp import DPPolicy +from kernbench.runtime_api.bench_runner import run_bench +from kernbench.runtime_api.types import resolve_device +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import resolve_topology + + +def _engine_factory(topology, device): + return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True) + + +def _run_with(bench_body): + topo = resolve_topology("topology.yaml") + return run_bench( + topology=topo, + bench_fn=bench_body, + device=resolve_device("all"), + engine_factory=_engine_factory, + ) + + +# ── from_numpy ────────────────────────────────────────────────────── + + +def test_from_numpy_creates_host_tensor(): + """torch.from_numpy returns a kernbench Tensor with the array stored + in its host buffer (not deployed to any PE).""" + + def body(torch): + arr = np.arange(8, dtype=np.float16).reshape(1, 8) + h = torch.from_numpy(arr) + # Host tensor has shape/dtype matching the array. + assert h.shape == (1, 8) + assert h.dtype == "f16" + # numpy() round-trips the host buffer. + assert np.array_equal(h.numpy(), arr) + # No deploy → no real shards. + assert h._handle is None + # Submit a no-op so run_bench has at least one handle. 
+ torch.zeros((1, 8), dtype="f16", + dp=DPPolicy(sip="replicate", cube="replicate", pe="replicate", + num_sips=1, num_cubes=1, num_pes=1), + name="dummy") + + _run_with(body) + + +# ── single-PE replicated tensor ───────────────────────────────────── + + +def test_copy_and_numpy_single_pe(): + """copy_ from a numpy array, then numpy() round-trips correctly on + a single-PE (no real sharding) tensor.""" + + def body(torch): + dp = DPPolicy(sip="replicate", cube="replicate", pe="replicate", + num_sips=1, num_cubes=1, num_pes=1) + t = torch.zeros((1, 16), dtype="f16", dp=dp, name="t") + src = np.arange(16, dtype=np.float16).reshape(1, 16) + t.copy_(torch.from_numpy(src)) + gathered = t.numpy() + assert gathered.shape == (1, 16) + assert np.array_equal(gathered, src) + + _run_with(body) + + +# ── multi-PE column-wise sharding (1 cube) ────────────────────────── + + +def test_copy_and_numpy_multi_pe_column_wise(): + """copy_ splits across 8 PEs in one cube, numpy() reassembles.""" + + def body(torch): + n_pe = 8 + dp = DPPolicy(sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, num_pes=n_pe) + t = torch.zeros((1, n_pe * 4), dtype="f16", dp=dp, name="t") + src = np.arange(n_pe * 4, dtype=np.float16).reshape(1, n_pe * 4) + t.copy_(torch.from_numpy(src)) + gathered = t.numpy() + assert gathered.shape == (1, n_pe * 4) + assert np.array_equal(gathered, src) + # Sanity: there really were 8 shards. 
+ assert len(t._handle.shards) == n_pe + + _run_with(body) + + +# ── multi-cube sharding ───────────────────────────────────────────── + + +def test_copy_and_numpy_multi_cube(): + """copy_ across 2 cubes (16 PEs total), numpy() reassembles.""" + + def body(torch): + n_pe_per_cube = 8 + n_cubes = 2 + total = n_cubes * n_pe_per_cube # 16 + dp = DPPolicy(sip="replicate", cube="column_wise", pe="column_wise", + num_sips=1, num_cubes=n_cubes) + t = torch.zeros((1, total * 4), dtype="f16", dp=dp, name="t") + src = np.arange(total * 4, dtype=np.float16).reshape(1, total * 4) + t.copy_(torch.from_numpy(src)) + gathered = t.numpy() + assert np.array_equal(gathered, src) + assert len(t._handle.shards) == total + + _run_with(body) + + +# ── shape mismatch raises ─────────────────────────────────────────── + + +def test_copy_shape_mismatch_raises(): + """copy_ with mismatched shapes raises ValueError.""" + + def body(torch): + dp = DPPolicy(sip="replicate", cube="replicate", pe="replicate", + num_sips=1, num_cubes=1, num_pes=1) + t = torch.zeros((1, 8), dtype="f16", dp=dp, name="t") + src = np.zeros((1, 16), dtype=np.float16) + with pytest.raises(ValueError, match="copy_ shape mismatch"): + t.copy_(torch.from_numpy(src)) + + _run_with(body) diff --git a/tests/test_tl_ipcq_api.py b/tests/test_tl_ipcq_api.py new file mode 100644 index 0000000..f1e708f --- /dev/null +++ b/tests/test_tl_ipcq_api.py @@ -0,0 +1,95 @@ +"""Tests for tl.send / tl.recv API (ADR-0023 D4 + D9.5).""" +from __future__ import annotations + +from typing import Any + +import simpy +from greenlet import greenlet + +from kernbench.common.ipcq_types import ( + IpcqRecvCmd, + IpcqRequest, + IpcqSendCmd, +) +from kernbench.triton_emu.tl_context import TLContext + + +# ── Command-list mode (no runner) ──────────────────────────────────── + + +def test_tl_send_command_list_mode(): + tl = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0) + tl.send(dir="E", src_addr=0x500, nbytes=64, shape=(8,), dtype="f16") + cmds 
= tl.commands + sends = [c for c in cmds if isinstance(c, IpcqSendCmd)] + assert len(sends) == 1 + assert sends[0].direction == "E" + assert sends[0].src_addr == 0x500 + assert sends[0].nbytes == 64 + + +def test_tl_recv_command_list_mode(): + tl = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0) + handle = tl.recv(dir="W", shape=(8,), dtype="f16") + cmds = tl.commands + recvs = [c for c in cmds if isinstance(c, IpcqRecvCmd)] + assert len(recvs) == 1 + assert recvs[0].direction == "W" + # In command-list mode (no runner), tl.recv returns a placeholder + # TensorHandle (no actual data movement happens until SimPy) + assert handle.shape == (8,) + assert handle.dtype == "f16" + + +def test_tl_recv_round_robin_no_dir(): + tl = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0) + tl.recv(shape=(8,), dtype="f16") + cmds = tl.commands + recvs = [c for c in cmds if isinstance(c, IpcqRecvCmd)] + assert recvs[0].direction is None + + +# ── Runner mode (greenlet) ────────────────────────────────────────── + + +class _StubRunner: + """Minimal runner that auto-responds to IpcqSendCmd / IpcqRecvCmd.""" + + def __init__(self) -> None: + self.received: list[Any] = [] + + def switch_to_simpy(self, cmd: Any) -> Any: + self.received.append(cmd) + if isinstance(cmd, IpcqSendCmd): + return None + if isinstance(cmd, IpcqRecvCmd): + # Return a fake slot dict + return { + "data": None, + "src_space": "tcm", + "src_addr": 0xABCD, + "direction": cmd.direction or "E", + "dtype": cmd.dtype, + "shape": cmd.shape, + "nbytes": 16, + } + return None + + +def test_tl_send_runner_mode(): + runner = _StubRunner() + tl = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0, runner=runner) + tl.send(dir="E", src_addr=0x500, nbytes=64, shape=(8,), dtype="f16") + assert len(runner.received) == 1 + assert isinstance(runner.received[0], IpcqSendCmd) + + +def test_tl_recv_runner_mode_returns_handle_with_slot_addr(): + runner = _StubRunner() + tl = TLContext(pe_id=0, num_programs=4, 
dispatch_cycles=0, runner=runner) + h = tl.recv(dir="W", shape=(8,), dtype="f16") + assert isinstance(runner.received[0], IpcqRecvCmd) + # The returned TensorHandle's addr should reflect the slot + assert h.addr == 0xABCD + assert h.shape == (8,) + assert h.dtype == "f16" diff --git a/tests/test_tl_recv_async.py b/tests/test_tl_recv_async.py new file mode 100644 index 0000000..37aae56 --- /dev/null +++ b/tests/test_tl_recv_async.py @@ -0,0 +1,106 @@ +"""Tests for tl.recv_async + tl.wait (ADR-0023 D4).""" +from __future__ import annotations + +import numpy as np + +from kernbench.ccl.testing import run_kernel_in_mock + + +def kernel_async_recv(t_ptr, n_elem, tl): + """Each PE issues recv_async first, then send, then wait — this exercises + the non-blocking path. Uses TensorHandle math (PE_MATH) for accumulation + so Phase 2 produces correct final HBM contents.""" + rank = tl.program_id(axis=0) + world_size = tl.num_programs(axis=0) + nbytes = n_elem * 2 + + pe_addr = t_ptr + rank * nbytes + acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16") + current = acc + + for _step in range(world_size - 1): + future = tl.recv_async(dir="W", shape=(n_elem,), dtype="f16") + tl.send(dir="E", src=current) + recv = tl.wait(future) + acc = acc + recv + current = recv # forward W's tile to E next round + + tl.store(pe_addr, acc) + + +def test_recv_async_mock_runtime(): + n_elem = 8 + inputs = [ + np.full((n_elem,), float(r + 1), dtype=np.float16) + for r in range(4) + ] + expected = sum(inputs) + + outputs = run_kernel_in_mock( + kernel_fn=kernel_async_recv, + world_size=4, + topology="ring_1d", + inputs=inputs, + kernel_args=(n_elem,), + ) + for r in range(4): + assert np.allclose(outputs[r], expected) + + +def test_recv_async_simpy_runner(): + """Run the async kernel through the real SimPy stack via the + install_ipcq + launch path. 
+ """ + import importlib + + from kernbench.runtime_api.bench_runner import run_bench + from kernbench.runtime_api.types import resolve_device + from kernbench.sim_engine.engine import GraphEngine + from kernbench.topology.builder import resolve_topology + + # Re-use the standard 8-PE bench skeleton but swap in the async kernel. + topo = resolve_topology("topology.yaml") + + # Build a tiny inline bench module + import types + mod = types.ModuleType("inline_bench_async") + + from kernbench.policy.placement.dp import DPPolicy + + def run(torch): + plan = torch.install_ipcq( + algorithm="ring_allreduce_tcm", world_size_override=8, + ) + a = torch.zeros( + (1, 8 * 8), + dtype="f16", + dp=DPPolicy( + sip="replicate", cube="replicate", pe="column_wise", + num_sips=1, num_cubes=1, + ), + name="async_in", + ) + store = torch.engine.memory_store + base = a._handle.va_base or a._handle.shards[0].pa + nbytes = 8 * 2 + for r in range(8): + store.write("hbm", base + r * nbytes, + np.full((8,), float(r + 1), dtype=np.float16)) + + torch.launch("ring_allreduce_tcm", kernel_async_recv, a, 8) + + for r in range(8): + result = store.read("hbm", base + r * nbytes, shape=(8,), dtype="f16") + expected = float(sum(range(1, 9))) # 36 + assert np.allclose(result, expected, rtol=1e-2, atol=1e-2), \ + f"rank {r}: got {result}, expected {expected}" + + mod.run = run + result = run_bench( + topology=topo, bench_fn=mod.run, + device=resolve_device("all"), + engine_factory=lambda t, d: GraphEngine( + getattr(t, "topology_obj", t), enable_data=True + ), + ) + assert result.completion.ok diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py index ae849aa..0ef2348 100644 --- a/tests/test_topology_compile.py +++ b/tests/test_topology_compile.py @@ -19,16 +19,19 @@ def test_full_graph_node_count(): # + 2 SIPs x (1 IO x 23 io_nodes # + 16 cubes x (32 routers + 1 hbm_ctrl + 1 m_cpu + 1 sram # + 20 ucie (4 ports x (1 port + 4 conn)) - # + 8 PEs x 8 pe_comps)) (ADR-0021: 
+pe_fetch_store) + # + 8 PEs x 9 pe_comps)) (ADR-0023: +pe_ipcq) # IO: pcie_ep + io_cpu + noc + 4 io_ucie_ports + 4*4 io_ucie_conn = 23 - # cube: 32 + 3 + 20 + 64 = 119 - # = 1 + 2*(23 + 16*119) = 1 + 2*(23+1904) = 1 + 3854 = 3855 - assert len(g.nodes) == 3855 + # cube: 32 + 3 + 20 + 72 = 127 + # = 1 + 2*(23 + 16*127) = 1 + 2*(23+2032) = 1 + 4110 = 4111 + assert len(g.nodes) == 4111 def test_full_graph_edge_count(): g = _graph() - assert len(g.edges) == 12922 # ADR-0021: +pe_fetch_store + chaining edges + # ADR-0023: +3 IPCQ edges per PE (cpu→ipcq, ipcq→dma, dma→ipcq) + # 2 SIPs × 16 cubes × 8 PEs × 3 = 768 new edges + # Cross-SIP routing: +1 reverse pcie_ep→switch edge per SIP = +2 + assert len(g.edges) == 13692 # -- Full graph: specific nodes exist ----------------------------------------- @@ -287,7 +290,7 @@ def test_pe_view_has_all_components(): v = _graph().pe_view assert set(v.nodes.keys()) == { "pe_cpu", "pe_scheduler", "pe_dma", "pe_fetch_store", - "pe_gemm", "pe_math", "pe_mmu", "pe_tcm", + "pe_gemm", "pe_math", "pe_mmu", "pe_tcm", "pe_ipcq", } diff --git a/tests/test_topology_load.py b/tests/test_topology_load.py index 82f2859..93c2c6d 100644 --- a/tests/test_topology_load.py +++ b/tests/test_topology_load.py @@ -24,7 +24,7 @@ def test_pe_template_components(): comps = spec["cube"]["pe_template"]["components"] assert set(comps.keys()) == { "pe_cpu", "pe_scheduler", "pe_dma", "pe_fetch_store", - "pe_gemm", "pe_math", "pe_mmu", "pe_tcm", + "pe_gemm", "pe_math", "pe_mmu", "pe_tcm", "pe_ipcq", } diff --git a/tests/test_triton_emu.py b/tests/test_triton_emu.py index 77b4568..160ec4b 100644 --- a/tests/test_triton_emu.py +++ b/tests/test_triton_emu.py @@ -87,6 +87,37 @@ def test_tl_math_unary_ops(): assert ops == ["exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin"] +def test_tl_math_extra_ops(): + """tl.maximum/minimum/fma/clamp/softmax + tl.cdiv (real-Triton parity).""" + tl = _ctx() + a = tl.load(0x1000, shape=(8, 8), dtype="f16") + b = tl.load(0x2000, 
shape=(8, 8), dtype="f16") + c = tl.load(0x3000, shape=(8, 8), dtype="f16") + + tl.maximum(a, b) + tl.minimum(a, b) + tl.fma(a, b, c) + tl.clamp(a, b, c) + tl.softmax(a, axis=1) + + math_cmds = [cm for cm in tl.commands if isinstance(cm, MathCmd)] + ops = [cm.op for cm in math_cmds] + assert ops == ["maximum", "minimum", "fma", "clamp", "softmax"] + # ternary fma/clamp must record three inputs + fma_cmd = math_cmds[2] + assert len(fma_cmd.inputs) == 3 + clamp_cmd = math_cmds[3] + assert len(clamp_cmd.inputs) == 3 + # softmax records the axis + assert math_cmds[4].axis == 1 + + # cdiv is a scalar helper, not a tensor op + from kernbench.triton_emu.tl_context import TLContext + assert TLContext.cdiv(10, 3) == 4 + assert TLContext.cdiv(9, 3) == 3 + assert TLContext.cdiv(0, 4) == 0 + + # ── 5. a + b, a * b → MathCmd ──────────────────────────────────── diff --git a/topology.yaml b/topology.yaml index ad936cc..c56b385 100644 --- a/topology.yaml +++ b/topology.yaml @@ -67,7 +67,8 @@ cube: pe_math: { kind: pe_math, impl: builtin.pe_math, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } } pe_fetch_store: { kind: pe_fetch_store, impl: builtin.pe_fetch_store, attrs: { overhead_ns: 0.0 } } pe_mmu: { kind: pe_mmu, impl: builtin.pe_mmu, attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } } - pe_tcm: { kind: pe_tcm, impl: builtin.pe_tcm, attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0 } } + pe_tcm: { kind: pe_tcm, impl: builtin.pe_tcm, attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0, kernel_scratch_mb: 1 } } + pe_ipcq: { kind: pe_ipcq, impl: builtin.pe_ipcq, attrs: { overhead_ns: 0.0 } } links: pe_cpu_to_scheduler_mm: 0.5 scheduler_to_dma_mm: 0.5 @@ -88,6 +89,9 @@ cube: gemm_to_tcm_mm: 0.5 math_to_tcm_bw_gbs: 512.0 math_to_tcm_mm: 0.5 + cpu_to_ipcq_mm: 0.5 # PE_CPU → PE_IPCQ (ADR-0023) + ipcq_to_dma_mm: 0.0 # PE_IPCQ → PE_DMA token forwarding (ADR-0023) + dma_to_ipcq_mm: 0.0 # PE_DMA → PE_IPCQ metadata arrival (ADR-0023) memory_map: 
hbm_total_gb_per_cube: 48