diff --git a/benches/ccl_allreduce.py b/benches/ccl_allreduce.py
new file mode 100644
index 0000000..c12a168
--- /dev/null
+++ b/benches/ccl_allreduce.py
@@ -0,0 +1,129 @@
+"""CCL all-reduce bench — single unified entry point.
+
+Driven entirely by ``ccl.yaml`` + ``topology.yaml``:
+
+- ``defaults.algorithm`` in ``ccl.yaml`` picks which kernel to run
+  (``ring_allreduce_{tcm,hbm,sram}`` / ``mesh_allreduce_4`` /
+  ``tree_allreduce_7``).
+- ``world_size`` is derived from the algorithm entry's override or from
+  the topology spec (``sips × cubes_per_sip × pes_per_cube``).
+- The host code uses only real PyTorch ``torch.distributed`` names:
+  ``init_process_group``, ``get_world_size``, ``get_rank``, ``all_reduce``.
+
+The bench is split into ``worker(rank, world_size, torch)`` — the
+per-rank business logic, designed to look like a real PyTorch DDP
+training worker so future model benches can reuse the same skeleton —
+and ``run(torch)`` — the kernbench-specific launcher that initializes
+the process group and invokes the worker.
+"""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
+from kernbench.policy.placement.dp import DPPolicy
+
+# Per-rank tile size in elements, used when the resolved ccl.yaml config has
+# no ``n_elem``. Real PyTorch benches hardcode batch/feature dims similarly.
+DEFAULT_N_ELEM = 32
+
+
+def _derive_dp(spec: dict, world_size: int) -> DPPolicy:
+    """Pick a DPPolicy that fans the tensor across exactly ``world_size`` PEs.
+
+    Mirrors what a real PyTorch DDP user does manually with
+    ``tensor.to(f"cuda:{rank}")``: the host code chooses the placement so
+    that the collective sees the right number of participating ranks.
+    """
+    sips = int(spec["system"]["sips"]["count"])
+    cm = spec["sip"]["cube_mesh"]
+    pl = spec["cube"]["pe_layout"]
+    pes_per_cube = int(pl["pe_per_corner"]) * len(pl["corners"])
+    cubes_per_sip = int(cm["w"]) * int(cm["h"])
+    total = sips * cubes_per_sip * pes_per_cube
+    if world_size == total:
+        return DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
+    if world_size <= pes_per_cube:
+        return DPPolicy(
+            sip="replicate", cube="replicate", pe="column_wise",
+            num_sips=1, num_cubes=1, num_pes=world_size,
+        )
+    if world_size <= cubes_per_sip * pes_per_cube:
+        return DPPolicy(
+            sip="replicate", cube="column_wise", pe="column_wise",
+            num_sips=1, num_cubes=world_size // pes_per_cube,
+        )
+    return DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
+
+
+def worker(rank: int, world_size: int, torch) -> None:
+    """Per-rank business logic. Mirrors a real PyTorch DDP worker.
+
+    In real PyTorch DDP, this function runs in N separate processes,
+    each with its own ``rank``. In kernbench (single-process multi-device)
+    it is invoked once with ``rank=0`` on the single host driver; the
+    actual per-PE parallelism is handled by ``torch.launch`` fanning out
+    the kernel across all participating PEs via the tensor's DPPolicy.
+    The ``rank`` parameter is therefore always 0 today, and is kept as
+    an explicit argument for parity with real DDP workers (``if rank ==
+    0`` logging guards, future multi-host extensions).
+    """
+    cfg = resolve_algorithm_config(load_ccl_config())
+    algo_name = cfg["algorithm"]
+    n_elem = int(cfg.get("n_elem", DEFAULT_N_ELEM))
+
+    # Pick a DP that produces exactly ``world_size`` shards on this topology.
+    dp = _derive_dp(torch.spec, world_size)
+    tensor = torch.zeros(
+        (1, world_size * n_elem), dtype="f16", dp=dp, name="ccl_in",
+    )
+
+    # Initialize: CCL rank r's slice gets value (r + 1). Real PyTorch idiom:
+    #     target.copy_(torch.from_numpy(source))
+    init = np.zeros((1, world_size * n_elem), dtype=np.float16)
+    for r in range(world_size):
+        init[0, r * n_elem : (r + 1) * n_elem] = float(r + 1)
+    tensor.copy_(torch.from_numpy(init))
+
+    # The main act: one all_reduce call — the backend installs IPCQ at
+    # init_process_group time and here only dispatches the kernel.
+    torch.distributed.all_reduce(tensor, op="sum")
+
+    # Verify: each shard should hold sum(1..world_size) after all-reduce.
+    result = tensor.numpy()
+    expected = float(sum(range(1, world_size + 1)))
+    all_ok = bool(np.allclose(result, expected, rtol=1e-1, atol=1e-1))
+
+    # Print only on rank 0 — real PyTorch DDP idiom for single-source logs.
+    if rank == 0:
+        if all_ok:
+            print(f"  {algo_name} (ws={world_size}): {world_size} OK")
+        else:
+            flat = result.reshape(-1)
+            n_fail = 0
+            for r in range(world_size):
+                slice_r = flat[r * n_elem : (r + 1) * n_elem]
+                if not np.allclose(slice_r, expected, rtol=1e-1, atol=1e-1):
+                    n_fail += 1
+                    if n_fail <= 5:
+                        print(
+                            f"  [FAIL] rank {r} "
+                            f"(ws={world_size}, algo={algo_name}): "
+                            f"got mean={float(slice_r.mean()):.3f}, "
+                            f"expected={expected:.3f}"
+                        )
+            print(
+                f"  {algo_name} (ws={world_size}): "
+                f"{world_size - n_fail} OK / {n_fail} FAIL"
+            )
+
+
+def run(torch) -> None:
+    """CLI entry point: initialize the process group, invoke worker."""
+    dist = torch.distributed
+    dist.init_process_group(backend="ahbm")
+    worker(
+        rank=dist.get_rank(),
+        world_size=dist.get_world_size(),
+        torch=torch,
+    )
diff --git a/benches/loader.py b/benches/loader.py
index abc5ac7..fd6bb24 100644
--- a/benches/loader.py
+++ b/benches/loader.py
@@ -9,29 +9,32 @@ from kernbench.runtime_api.context import RuntimeContext
 BenchFn = Callable[[RuntimeContext], Any]
 
 
+def _load_module(bench_id: str):
+    bench_id = bench_id.strip()
+    if not bench_id:
+        raise ValueError("Bench id is empty.")
+    module_path = f"benches.{bench_id}"
+    try:
+        return importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        raise ValueError(
+            f"Unknown bench '{bench_id}'. Expected module {module_path}.py"
+        ) from e
+
+
 def resolve_bench(bench_id: str) -> BenchFn:
-    """
-    Resolve a bench id into a callable bench function.
+    """Resolve a bench id into its ``run(torch)`` callable.
 
     Expected layout (repo root):
         benches/<bench_id>.py
             def run(torch: RuntimeContext) -> Any
     """
-    bench_id = bench_id.strip()
-    if not bench_id:
-        raise ValueError("Bench id is empty.")
-
-    module_path = f"benches.{bench_id}"
-
-    try:
-        mod = importlib.import_module(module_path)
-    except ModuleNotFoundError as e:
-        raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e
-
+    mod = _load_module(bench_id)
     run_fn = getattr(mod, "run", None)
     if run_fn is None:
-        raise ValueError(f"Bench module {module_path} must define a 'run(torch)' function.")
+        raise ValueError(
+            f"Bench module benches.{bench_id} must define 'run(torch)'."
+        )
     if not callable(run_fn):
-        raise ValueError(f"'run' in {module_path} is not callable.")
-
+        raise ValueError(f"'run' in benches.{bench_id} is not callable.")
     return run_fn
diff --git a/ccl.yaml b/ccl.yaml
new file mode 100644
index 0000000..4bac308
--- /dev/null
+++ b/ccl.yaml
@@ -0,0 +1,80 @@
+# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
+#
+# Loaded by AhbmCCLBackend at init_process_group time.
+# defaults.algorithm chooses which kernel + topology is installed
+# into PE_IPCQ neighbor tables. Host code is unaware of these settings.
+
+defaults:
+  # Algorithm to run for this benchmark execution.
+  algorithm: ring_allreduce_tcm
+
+  # NOTE: world_size is not set here by default. AhbmCCLBackend derives it
+  # from the chosen algorithm's entry (if it sets ``world_size``) or from
+  # topology.yaml (``sips × cubes_per_sip × pes_per_cube``). This mirrors
+  # real PyTorch DDP where ranks/world_size come from env vars, not code.
+
+  # IPCQ ring buffer location.
+  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
+  #   hbm  — PE-local HBM (large, slower DMA latency)
+  #   sram — Cube-shared SRAM (medium, cube-internal contention)
+  buffer_kind: tcm
+
+  # Backpressure mode.
+  #   poll  — spin-loop polling of cached peer pointers
+  #   sleep — yield a SimPy event; wake on credit return (send) or data arrival (recv)
+  backpressure: sleep
+
+  # Ring depth: number of slots per (direction, tx|rx) buffer.
+  n_slots: 4
+
+  # Slot size in bytes (must hold one tile worth of data).
+  slot_size: 4096
+
+  # PE_DMA virtual channel chunk size (D8). First implementation does not
+  # use chunk-level interleave; this is reserved for future precision.
+  vc_chunk_size: 256
+
+  # Credit return fast path message size (D9). Used by bottleneck-BW
+  # latency calculation. 16-64 bytes typical.
+  ipcq_credit_size_bytes: 16
+
+algorithms:
+  # ── ring all-reduce, buffer in PE_TCM ──
+  # Defaults to topology-derived world_size (full system, 256 ranks).
+  # Use a smaller tile size at high rank counts so f16 sums stay within
+  # the verification tolerance and op_log replay scales.
+  ring_allreduce_tcm:
+    module: kernbench.ccl.algorithms.ring_allreduce
+    topology: ring_1d
+    buffer_kind: tcm
+    n_elem: 8
+
+  # ── ring all-reduce, buffer in PE-local HBM ──
+  ring_allreduce_hbm:
+    module: kernbench.ccl.algorithms.ring_allreduce
+    topology: ring_1d
+    buffer_kind: hbm
+    n_elem: 8
+
+  # ── ring all-reduce, buffer in cube SRAM ──
+  ring_allreduce_sram:
+    module: kernbench.ccl.algorithms.ring_allreduce
+    topology: ring_1d
+    buffer_kind: sram
+    n_elem: 8
+
+  # ── 2D mesh all-reduce: perfect square only (2×2 = 4 PEs) ──
+  mesh_allreduce_4:
+    module: kernbench.ccl.algorithms.mesh_allreduce
+    topology: mesh_2d
+    buffer_kind: tcm
+    world_size: 4
+    n_elem: 16
+
+  # ── tree all-reduce (binary, 7 PEs) ──
+  tree_allreduce_7:
+    module: kernbench.ccl.algorithms.tree_allreduce
+    topology: tree_binary
+    buffer_kind: tcm
+    world_size: 7
+    n_elem: 16
diff --git a/components.yaml b/components.yaml
index f3ab639..75db756 100644
--- a/components.yaml
+++ b/components.yaml
@@ -51,5 +51,6 @@ components:
   builtin.pe_fetch_store: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
   builtin.pe_mmu:         kernbench.components.builtin.pe_mmu:PeMmuComponent
   builtin.pe_tcm:         kernbench.components.builtin.pe_tcm:PeTcmComponent
+  builtin.pe_ipcq:        kernbench.components.builtin.pe_ipcq:PeIpcqComponent
 
   # Custom — add your implementations here
diff --git a/docs/adr/ADR-0023-ipcq-pe-collective.en.md b/docs/adr/ADR-0023-ipcq-pe-collective.en.md
new file mode 100644
index 0000000..c97f20e
--- /dev/null
+++ b/docs/adr/ADR-0023-ipcq-pe-collective.en.md
@@ -0,0 +1,866 @@
+# ADR-0023: PE-level IPCQ — Inter-PE Collective Communication
+
+## Status
+
+Proposed
+
+## Context
+
+### Goal
+
+Add the infrastructure that lets CCL (Collective Communication Library)
+kernels run **inside** a PE. The host just launches a kernel on each
+SIP; the actual synchronization and data movement happen **inside the
+PE kernel via an IPCQ (Inter-Process Communication Queue)**.
+
+This mirrors how NCCL performs NVLink communication inside a GPU
+kernel, or how Cerebras / Tenstorrent expose core-local communication
+queues. Host-level collectives (`dist.all_reduce`) are deferred to
+**future work**; this ADR focuses solely on the kernel-side collective
+infrastructure.
+
+### Current state
+
+- ADR-0021 PE pipeline refactor: each PE is decomposed into components
+  (PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH,
+  PE_TCM, PE_MMU).
+- No direct PE-to-PE channel exists today. All data movement goes
+  through PE_DMA → cube_noc / UCIe / PCIE → HBM.
+- A pre-ADR host CCL skeleton exists (`dist.init_process_group(backend="ahbm")`,
+  `_run_ccl_bench` running per-rank greenlets concurrently). The
+  collective itself is a stub.
+
+### Problems to solve
+
+1. PE-to-PE direct data movement (writing into a peer's memory).
+2. Synchronization — the sender must check that the receiver has space
+   in its buffer (backpressure).
+3. Resource contention between compute traffic and communication
+   traffic (Head-of-Line blocking).
+4. The host must be able to construct logical neighbor topologies
+   (ring / mesh / tree) per algorithm.
+
+---
+
+## Decision
+
+### D1. Add a new `PE_IPCQ` component
+
+A new component `PE_IPCQ` is added inside each PE. It follows the same
+pattern as PE_GEMM / PE_MATH — modeling a sub-block of the PE as a
+distinct component.
+
+```
+PE
+├── PE_CPU
+├── PE_SCHEDULER
+├── PE_DMA
+├── PE_IPCQ          ← new
+├── PE_FETCH_STORE
+├── PE_GEMM
+├── PE_MATH
+├── PE_TCM
+├── PE_MMU
+```
+
+**Role separation** (control plane vs. data plane):
+
+- **PE_IPCQ (control plane)**: ring-buffer address arithmetic, head /
+  tail pointer management, peer pointer caches, backpressure, 4-direction
+  neighbor mapping.
+- **PE_DMA (data plane)**: actually moves data through cube_noc / UCIe
+  / PCIE into the peer's memory.
+
+PE_IPCQ does **not** move data itself — it delegates to PE_DMA.
+
+### D2. Ring buffer model
+
+Each PE owns 4 directions (N/S/E/W) × {tx, rx} = 8 ring buffers.
+
+```python
+@dataclass
+class IpcqQueuePair:
+    direction: Direction          # N/S/E/W
+    peer: IpcqEndpoint            # set by host at init time (D2.5)
+    tx_buffer_base: int           # outgoing data base addr (in our memory)
+    rx_buffer_base: int           # incoming data base addr (in our memory)
+    slot_size: int                # 1 tile per slot
+    n_slots: int                  # ring depth
+    my_head: int                  # next slot we will write/send into
+    my_tail: int                  # next slot we will read/recv from
+    peer_head_cache: int          # peer's last-seen head (updated via D9 piggyback)
+    peer_tail_cache: int          # peer's last-seen tail (updated via D9 fast-path credit)
+```
+
+**Canonical field names**: throughout this ADR the four names above
+(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`) are used
+consistently. Synonyms (`peer_head_local`, `peer_head`, `peer_tail`,
+etc.) are not used.
+
+| Field | Owner | Updated when |
+|-------|-------|--------------|
+| `my_head` | local PE_IPCQ | immediately after `tl.send` (send tracking) |
+| `my_tail` | local PE_IPCQ | immediately after `tl.recv` (recv tracking) |
+| `peer_head_cache` | local PE_IPCQ | on `IpcqMetaArrival` (D9 piggyback) |
+| `peer_tail_cache` | local PE_IPCQ | on `IpcqCreditMetadata` (D9 fast path) |
+
+**Slot unit**: fixed-size, one slot holds one full tile (no descriptor
+indirection). Full data embedded in the slot. See D5.
+
+### D2.5. `IpcqEndpoint` schema
+
+`IpcqQueuePair.peer` carries everything the sender needs to compute the
+peer's rx slot address:
+
+```python
+@dataclass(frozen=True)
+class IpcqEndpoint:
+    sip: int
+    cube: int
+    pe: int
+    buffer_kind: str             # "tcm" | "hbm" | "sram"
+    rx_base_pa: int              # peer rx_buffer base PA (PhysAddr.encode())
+    rx_base_va: int              # peer rx_buffer base VA (optional, MMU mode)
+    n_slots: int                 # peer ring depth (for wrap-around)
+    slot_size: int               # peer slot size (for offset)
+```
+
+Address computation:
+
+```python
+slot_idx = self.my_head % peer.n_slots
+dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
+```
+
+PE_IPCQ passes `dst_pa` to PE_DMA inside an `IpcqDmaToken`. PE_DMA
+(vc_comm) routes the data to `dst_pa` through the fabric.
+
+**Endpoint construction order**: at backend init (D10), the IPCQ
+buffers for **every PE** are allocated first (so each rank knows the
+others' PA), then the per-rank neighbor tables are built and pushed to
+PE_IPCQ via `IpcqInitMsg`.
+
+### D3. Four-direction mapping ≡ logical ProcessGroup
+
+The PE views four directions (N/S/E/W) as logical ports. Real peer
+addresses are configured by the host CCL init, per the chosen
+algorithm. The PE kernel never knows the topology, only directions.
+
+```python
+# 1D ring
+for rank in range(world_size):
+    ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size])
+    ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size])
+
+# 2D mesh
+for r in range(R):
+    for c in range(C):
+        ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c))
+        ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c))
+        ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C))
+        ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C))
+```
+
+The PE code does not need to know where `tl.send(dir="E", ...)` actually
+ends up.
+
+### D4. PE kernel API
+
+```python
+# Send (blocking; may stall on backpressure)
+tl.send(dir: str, src=TensorHandle)
+tl.send(dir: str, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)
+
+# Recv (blocking)
+recv = tl.recv(dir: str, shape=..., dtype=...)
+recv = tl.recv(shape=..., dtype=...)        # round-robin across 4 directions
+
+# Recv (non-blocking)
+fut  = tl.recv_async(dir: str, shape=..., dtype=...)
+recv = tl.wait(fut)
+```
+
+`tl.recv()` (no direction) keeps a `last_polled_dir` cursor and on each
+call rotates through directions, returning the first available slot.
+Empty in all 4 directions → wait.
+
+**Fairness is weak**: the rotating start mitigates simple bias, but if
+one direction always wins the race the others can starve. Algorithms
+that need strict fairness must call `tl.recv(dir=...)` explicitly.
+
+### D5. Single-hop DMA write + full-data slot model
+
+Data moves from sender memory into the receiver's ring slot in **one
+DMA transfer**. Key properties:
+
+- **Single-hop**: the sender already knows the peer rx slot address and
+  fires one fabric DMA into it.
+- **No CPU memcpy**: the CPU never copies data.
+- **No intermediate staging**: neither side keeps a separate staging
+  buffer (sender uses the source addr directly; receiver gets the data
+  in its ring slot directly).
+
+(Strictly speaking the fabric DMA write does happen, so this is not
+literally "no data movement" — it's the same property NCCL labels
+"zero-copy", meaning no CPU memcpy and no staging copy.)
+
+```
+PE A: tl.send(E, src_addr, nbytes)
+  1. IPCQ computes the peer rx slot address:
+       dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size
+  2. Backpressure: proceed only if my_head - peer_tail_cache < peer.n_slots
+                   (otherwise the peer ring is full → sleep / poll, see D7)
+  3. Submit DMA on PE_DMA(vc_comm): src_addr → peer dst_addr, nbytes
+  4. my_head += 1
+
+PE B: data = tl.recv(W)
+  1. Look at rx_buffer[my_tail % n_slots]
+  2. Wait for the data to arrive (D7 backpressure mode)
+  3. Return the slot address to the kernel (or fetch into register file)
+  4. my_tail += 1
+  5. Issue a credit-return fast path (D9): after the bottleneck-BW
+     latency the peer A's peer_tail_cache is updated.
+```
+
+The slot holds the full tile. The receiver only reads its own
+rx_buffer; it never reads back into A's memory. The sender knows the
+peer rx slot address and DMAs directly into it (single-hop).
+
+The PE's own PE_TCM read/write does not go through DMA (PE_TCM is local
+to the PE).
+
+### D6. Buffer placement — three-way benchmark
+
+The host CCL init picks the IPCQ ring-buffer location:
+
+```python
+ipcq_init(
+    backend="ahbm",
+    buffer_kind="tcm" | "hbm" | "sram",
+    n_slots=8,
+    slot_size=4096,
+)
+```
+
+| Location | Trait | Trade-off |
+|----------|-------|-----------|
+| **PE_TCM** | Attached to the PE; fast | Small; competes with PE-internal resources |
+| **PE-local HBM** | Large; via DMA | Higher latency |
+| **Cube SRAM** | Mid-size; cube-shared | Cube-internal contention |
+
+All three locations run the same kernel code; only the init differs.
+
+### D7. Backpressure — two-mode benchmark
+
+How the sender waits when the peer's slots are full, and how the
+receiver waits when data has not yet arrived:
+
+| Mode | Behavior | Model |
+|------|----------|-------|
+| **poll** | Periodically re-check the cached peer pointer | Spin loop |
+| **sleep** | Yield a SimPy event; wake on a peer-trigger | Interrupt-like |
+
+```python
+ipcq_init(backpressure="poll" | "sleep", ...)
+```
+
+Both modes are implemented so latency / throughput trade-offs can be
+benchmarked.
+
+### D8. PE_DMA virtual channels
+
+Extend PE_DMA from a single queue into a **two-channel virtual-channel**
+model.
+
+```
+PE_DMA
+├── vc_compute: tile load / store / writeback for GEMM and Math
+└── vc_comm:    IPCQ send data
+```
+
+Each VC has an independent state machine:
+
+- One channel stalling does not block the other.
+- The same physical link (cube_noc, UCIe, …) is shared, but link BW is
+  split between channels.
+
+**Chunk-level interleave**:
+
+- Large GEMM tile DMAs do not lock the link end-to-end.
+- Progress happens in chunks (e.g. 256 B); each chunk shares link BW
+  with the other VC's pending chunks.
+- Chunk size is an init parameter (smaller = fairer, larger = more
+  efficient).
+
+Net effect:
+
+- HoL blocking is eliminated (an IPCQ send can interleave with a long
+  compute DMA).
+- Compute / comm overlap is natural (NVIDIA copy-engine + compute-SM
+  pattern).
+- Matches the NoC-virtual-channel pattern used in real HW.
+
+**First-implementation accuracy limit (intentional)**: this ADR's
+first cut uses **deterministic chunk-level interleave + weighted
+round-robin arbitration** (default 50 / 50, exposed in `ccl.yaml`).
+This is a first-order approximation and is simpler than real HW
+dynamic-contention / credit-based arbiters. Functional correctness is
+unaffected, but heavy-contention scenarios may report slightly
+optimistic latency vs. real HW. A separate ADR can add a NoC arbiter
+component later if more precision is needed.
+
+#### Token routing
+
+- Compute tokens (`TileToken`) — go through the existing
+  PE_FETCH_STORE → PE_DMA chain.
+- Communication tokens (`IpcqDmaToken`, new) — PE_IPCQ → PE_DMA
+  self-routing.
+- PE_DMA picks the channel by token type.
+
+```python
+class PeDmaComponent:
+    def _process(self, env, token):
+        if isinstance(token, IpcqDmaToken):
+            yield from self._vc_comm_process(env, token)
+        else:
+            yield from self._vc_compute_process(env, token)
+```
+
+### D9. Pointer synchronization — DMA payload piggyback
+
+Real HW (NVLink, UCIe, etc.) piggybacks metadata onto DMA payloads so
+pointers update along with the data. This simulation adopts the same
+model: **no separate control channel** — metadata travels with the
+data.
+
+The big benefits:
+
+- **Automatic ordering**: data and metadata move on the same token, so
+  data is visible **before** the head_cache update. No race.
+- **HW fidelity**: matches NVLink / UCIe piggybacked headers.
+- **Component simplification**: no separate `IpcqPtrUpdate` event type.
+
+#### Send flow (head update via piggyback)
+
+```
+PE A: tl.send(E, src_addr, nbytes)
+  1. PE_IPCQ checks backpressure (using peer_tail_cache)
+  2. PE_IPCQ creates an IpcqDmaToken:
+       - data body (src_addr → peer dst_addr)
+       - piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction)
+  3. Hand the token to PE_DMA(vc_comm)
+  4. PE A increments my_head (send tracking)
+
+[fabric DMA: latency elapses]
+
+PE B's PE_DMA receives the token
+  5. Writes data into dst_addr (B's rx slot) via MemoryStore.write
+  6. Forwards token metadata to PE B's PE_IPCQ (PE-internal wire, ~1 cycle)
+
+PE B's PE_IPCQ receives the metadata
+  7. Updates peer_head_cache (= A's head)
+  8. Wakes any pending recv on that direction
+```
+
+**Steps 5 and 6 must execute in the same SimPy step** — DMA completion
+makes data and metadata atomically visible.
+
+#### Recv flow (credit return — fast path with bottleneck-BW latency)
+
+When the receiver frees a slot, the sender must learn about it
+(backpressure release). Unlike data, the credit return does **not**
+travel through general vc_comm fabric — it uses a **separate fast
+path**, an abstraction of the NVLink / UCIe credit-return wire.
+
+**Latency** is computed from the **bottleneck BW on the path**, not a
+magic constant:
+
+```
+credit_size_bytes = 16  (ccl.yaml: ipcq_credit_size_bytes)
+path = router.find_path(self_pe, peer_pe)
+latency = compute_drain_ns(path, credit_size_bytes)
+        = credit_size_bytes / bottleneck_bw_on_path
+```
+
+That gives us:
+
+- **Topology-proportional approximation**: an in-cube credit return is
+  automatically faster than a cross-SIP credit return.
+- **No magic constants**: no arbitrary `ipcq_ctrl_latency_ns`.
+- **No deadlock risk**: unlike piggyback, B can issue credit even when
+  it has no data to send back.
+- **Reuses existing utility**: `ComponentContext.compute_drain_ns`.
+
+#### Component coupling — SimPy Store channel
+
+PE B's PE_IPCQ does not call PE A's PE_IPCQ directly. Instead, at init
+time, **a SimPy Store is wired between the two** (a per-direction
+fast-path channel) and credit metadata is `put` into that store.
+
+```python
+class PeIpcqComponent:
+    def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns):
+        yield env.timeout(latency_ns)
+        yield peer_credit_store.put(IpcqCreditMetadata(seq=my_tail, ...))
+```
+
+Backend init wires both directions of the fast-path channel as part of
+fan-out (see `IpcqInitMsg` in D12).
+
+#### Credit-return fast path limitations
+
+- `credit_size_bytes` is an estimate (typically 16–64 bytes).
+- The fast path is **excluded from vc_comm BW contention** (separate
+  wire). Real HW credit-return wires are very lightweight, so this is a
+  reasonable first approximation.
+- A follow-up ADR can: model the credit fast path as a separate link
+  (BW limit + contention), or switch to piggyback (`credit_return_mode:
+  piggyback`).
+
+#### PE_DMA's added responsibility
+
+When `vc_comm` receives a token, PE_DMA processes it as the following
+**atomic** sequence. **No SimPy yield is allowed between the two steps**
+(invariant I6):
+
+```python
+def _on_vc_comm_recv(self, env, token):
+    # ── ATOMIC: no yield between these two operations ──
+    # 1. Write the payload into the destination rx slot
+    data = self._memory_store.read(token.src_space, token.src_addr,
+                                   shape=..., dtype=...)
+    self._memory_store.write(token.dst_endpoint.buffer_kind,
+                             token.dst_addr, data)
+    # 2. Forward metadata to the local PE_IPCQ
+    yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token))
+    # ───────────────────────────────────────────────────
+```
+
+The final `put` is yieldable but uses an unbounded internal store, so
+it completes in a single step. That `put` is the closing call of the
+atomic block; nothing may be inserted between the `write` and it.
+
+### D9.5. ADR-0020 (2-pass) integration
+
+`tl.send` / `tl.recv` integrate with ADR-0020's two-pass model. Phase
+1 simulates timing **and** moves data via MemoryStore; Phase 2 enables
+op-log-based correctness verification.
+
+#### Phase 1 (timing + data)
+
+D9 models head and tail updates with two different mechanisms:
+
+- **Send-side (head update)** — DMA payload piggyback. Data write and
+  metadata forward happen in the same SimPy step → automatic atomic
+  visibility.
+- **Recv-side (tail credit return)** — fast-path SimPy Store channel
+  with bottleneck-BW latency, then `peer_tail_cache` update.
+
+Together they preserve ring-buffer pointer consistency.
+
+The op-log records `op_kind="ipcq"` entries for sends (with
+`src/dst/space/addr/nbytes/dir/dtype/shape/sender_seq`) and recvs (with
+`recv_mode/src/dst/space/addr/nbytes/dir/dtype/shape/consumer_seq`).
+Two recv modes:
+
+- **`return_slot`** (default): the slot address is returned to the
+  kernel. Zero-copy.
+- **`copy_to_dst`**: when the kernel passes `dst_addr` + `dst_space`,
+  PE_IPCQ copies the slot data into the user dst.
+
+#### Phase 2 (op_log replay)
+
+When `DataExecutor` encounters an `op_kind="ipcq"` record:
+
+- **send**: idempotent `src → dst` ndarray write.
+- **recv (`return_slot`)**: no-op (the slot already holds the data).
+- **recv (`copy_to_dst`)**: idempotent `slot → dst_addr` copy.
+
+IPCQ ops are pure data movement — Phase 2 has nothing extra to compute.
+The downstream GEMM / Math ops in `DataExecutor` will consume the data
+and naturally validate correctness.
+
+### D10. Host CCL init keeps the PyTorch shape
+
+The host code looks just like real PyTorch DDP. `init_process_group`
+creates the backend object; it does **not** receive IPCQ knobs
+(neighbor topology, buffer_kind, backpressure …).
+
+```python
+# benches/ccl_allreduce.py — same shape as real PyTorch
+def worker(rank, world_size, torch):
+    dist = torch.distributed
+    dist.init_process_group(backend="ahbm")  # reads ccl.yaml + topology
+    tensor = torch.zeros((1, world_size * N_ELEM), dtype="f16", dp=...)
+    tensor.copy_(torch.from_numpy(init))
+    dist.all_reduce(tensor, op="sum")
+```
+
+The IPCQ configuration is decided by the backend at
+`init_process_group` time: it loads `ccl.yaml`, picks the algorithm,
+and pushes IPCQ neighbor tables to every participating PE_IPCQ. The
+host code never has to know about IPCQ.
+
+A bench runs one algorithm, chosen via `ccl.yaml`'s `defaults.algorithm`.
+Switching algorithms is purely a `ccl.yaml` change — no host edits
+required.
+
+#### Init flow (eager)
+
+1. `init_process_group(backend="ahbm")` is called.
+2. Backend loads `ccl.yaml` → resolves `defaults.algorithm`.
+3. Pulls topology + buffer_kind + backpressure + slot config from
+   `algorithms[<algo>]`, falling back to `defaults` for unset keys.
+4. **Immediately** installs neighbor tables on every PE_IPCQ
+   (sideband or fabric `IpcqInitMsg`).
+5. Subsequent `torch.launch(kernel_name, ...)` calls behave normally —
+   PE_IPCQ is already prepared whether the kernel is a CCL kernel or
+   not.
+
+### D11. CCL config file (`ccl.yaml`)
+
+IPCQ config and algorithm metadata live in a separate YAML file,
+following the same pattern as `components.yaml` and `topology.yaml`.
+
+A single benchmark execution runs one algorithm
+(`defaults.algorithm`). Switching algorithms means editing
+`defaults.algorithm` only.
+
+```yaml
+defaults:
+  algorithm: ring_allreduce_tcm
+  buffer_kind: tcm                # tcm | hbm | sram
+  backpressure: sleep             # poll | sleep
+  n_slots: 8
+  slot_size: 4096
+  vc_chunk_size: 256
+  ipcq_credit_size_bytes: 16
+
+algorithms:
+  ring_allreduce_tcm:
+    module: kernbench.ccl.algorithms.ring_allreduce
+    topology: ring_1d             # builtin name or "custom"
+    buffer_kind: tcm
+    n_elem: 8                     # optional, per-algorithm tile width
+
+  tree_allreduce_7:
+    module: kernbench.ccl.algorithms.tree_allreduce
+    topology: tree_binary
+    buffer_kind: tcm
+    world_size: 7                 # algorithm-level override
+    n_elem: 16
+
+  custom_mesh:
+    module: kernbench.ccl.algorithms.custom_mesh
+    topology: custom              # the module supplies its own neighbors()
+```
+
+`world_size` is **not set in `defaults`** by default. The backend resolves
+it via: `algorithm-level override > defaults override > topology spec`. The
+last fallback (`sips × cubes_per_sip × pes_per_cube`) mirrors real DDP
+where `WORLD_SIZE` comes from env vars rather than config files.
+
+#### Algorithm module structure
+
+Each algorithm module exports two hooks — `kernel` (required) and
+`neighbors` (optional) — plus a `kernel_args` helper that the
+backend uses to populate positional kernel arguments at `all_reduce`
+time:
+
+```python
+# src/kernbench/ccl/algorithms/ring_allreduce.py
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    return (n_elem, world_size)
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    """Required — the PE kernel.
+
+    IPCQ is already installed by the backend before this is called.
+    The kernel only uses the four-direction send / recv API.
+    """
+    ...
+
+
+def neighbors(rank, world_size, neighbor_map):
+    """Optional — override the builtin topology's neighbor map.
+
+    Returns a new dict, the modified-in-place dict, or None to keep the
+    builtin map.
+    """
+    return None
+```
+
+#### `neighbors` override patterns
+
+- **Pattern A — tweak a builtin**: drop a direction for some ranks, etc.
+- **Pattern B — replace entirely**: ignore `neighbor_map` and return a
+  brand-new dict.
+- **Pattern C — keep builtin**: omit `neighbors` or return None.
+
+#### Builtin topologies
+
+| topology | direction set |
+|----------|---------------|
+| `ring_1d` | E, W |
+| `ring_1d_unidir` | E only |
+| `mesh_2d` | N, S, E, W |
+| `tree_binary` | parent, child_left, child_right |
+| `none` | (empty) — algorithm must supply `neighbors()` |
+
+#### Adding a new algorithm
+
+1. Write `kernel` and `kernel_args` in
+   `src/kernbench/ccl/algorithms/<algo>.py`.
+2. Add an entry in `ccl.yaml`'s `algorithms` section.
+3. (Optional) provide `neighbors()` for custom topology.
+4. Set `defaults.algorithm` to the new algorithm.
+
+The host bench (`benches/ccl_allreduce.py`) does not change.
+
+### D12. Message / token schema
+
+The new message types added by this ADR. They live in
+`src/kernbench/common/pe_commands.py` and
+`src/kernbench/runtime_api/kernel.py`.
+
+#### `IpcqInitMsg` (sideband, fan-out at init)
+
+The backend pushes neighbor tables to every PE_IPCQ. Structure mirrors
+`MmuMapMsg` (`target_sips`, `target_cubes`, `target_pe`, `entries`).
+Each `IpcqInitEntry` has `direction`, `peer: IpcqEndpoint`,
+`my_rx_base_pa/va`, `n_slots`, `slot_size`, plus a `peer_credit_store`
+field — a `simpy.Store` instance pre-wired so the sender PE_IPCQ can
+push `IpcqCreditMetadata` directly into the receiver's input queue.
+
+#### `IpcqSendCmd` (PE_CPU → PE_IPCQ)
+
+Carries `direction`, source addr/space, nbytes, shape, dtype, and a
+handle id. `data_op=True` so it lands in the op_log.
+
+#### `IpcqRecvCmd` (PE_CPU → PE_IPCQ)
+
+Carries `direction` (or None for round-robin), `recv_mode`
+(`return_slot` / `copy_to_dst`), optional `dst_addr/dst_space`, shape,
+dtype, blocking flag.
+
+#### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm channel)
+
+Per D9 piggyback: the token carries the data (`src/dst/space/nbytes`)
+plus the head metadata (`sender_seq`, `src_sip/cube/pe`,
+`src_direction`). PE_DMA picks the channel by token type
+(`IpcqDmaToken → vc_comm`, `TileToken → vc_compute`).
+
+The receiver's PE_DMA, on token arrival, performs the I6 atomic
+sequence: write data into MemoryStore, then forward `IpcqMetaArrival`
+to the local PE_IPCQ.
+
+#### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path)
+
+Carries `consumer_seq` (= my_tail), source PE coords, and source
+direction. Travels through the dedicated SimPy Store channel rather
+than `vc_comm`. Latency = `credit_size_bytes / bottleneck_bw_on_path`.
+
+There is **no `IpcqPtrUpdate` event** — head updates flow via D9
+piggyback, tail updates via the D9 fast-path channel.
+
+### D13. Test strategy
+
+Following the ADR-0021 D8 pattern.
+
+#### T1. Unit tests (component-level)
+
+- **PE_IPCQ** (`tests/test_pe_ipcq.py`): send without backpressure
+  immediately forwards a token; full peer slot triggers backpressure
+  (poll / sleep modes); recv waits, wakes on `IpcqMetaArrival`;
+  round-robin recv weak fairness; bad direction → `IpcqInvalidDirection`.
+- **PE_DMA virtual channels** (`tests/test_pe_dma_vc.py`): `vc_compute`
+  / `vc_comm` independent progress, chunk interleave, BW split.
+- **Builtin topology** (`tests/test_ccl_topologies.py`): ring_1d /
+  mesh_2d / tree_binary correctness, mesh_2d non-square →
+  `ValueError`, custom resolver returns the module's `neighbors`.
+
+#### T2. Integration tests (E2E send/recv)
+
+- **`tests/test_ipcq_e2e.py`**: 2-rank ring, 4-rank ring (bidirectional
+  no-deadlock), 4×4 mesh.
+- **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`): greenlet mode
+  records `ipcq` ops in op_log; DataExecutor produces correct
+  `out.data`.
+
+#### T3. Backend init (`tests/test_ccl_backend_ipcq.py`)
+
+`ccl.yaml` load, builtin topology → `IpcqInitMsg` fan-out, endpoint PA
+consistency, per-`buffer_kind` allocation.
+
+#### T4. Regression
+
+All existing tests pass; ADR-0020 op_log / DataExecutor unaffected for
+non-CCL benches.
+
+#### T5. Performance / overhead
+
+Single send/recv pair latency = (DMA latency) + (IPCQ overhead).
+Should be close to a regular PE_DMA write of the same nbytes (IPCQ
+overhead < 100 ns).
+
+### D14. Invariants and failure modes
+
+#### Invariants
+
+I1. **Slot lifecycle exactly-once**: one send → exactly one recv.
+I2. **Pointer monotonicity**: `my_head` / `my_tail` are monotonically
+   non-decreasing; `sender_seq` is strictly increasing.
+I3. **Endpoint consistency**: if rank A's `direction=E` peer is rank
+   B, then rank B's reverse-direction peer must be rank A. Verified at
+   init.
+I4. **`buffer_kind` consistency**: all PEs in a process group share
+   the same `buffer_kind` (no mixed mode in the first cut).
+I5. **op_log ordering**: send → DMA complete → recv possible. The
+   t_start order in op_log respects this causality.
+I6. **Atomic data + metadata visibility (MUST)**: at the receiver
+   side, data write (`MemoryStore.write`) and metadata forward
+   (`peer_head_cache` update) **must execute in the same SimPy step**.
+   No yield is allowed between the two operations in PE_DMA's vc_comm
+   handler. Code review must reject any `yield` (or `yield from`)
+   inserted between them — it would create a race where head_cache
+   becomes visible in a different step than the data.
+I7. **MemoryStore slot existence ↔ pointer**: as a consequence of I6,
+   the step in which `peer_head_cache > my_tail` becomes truthy is the
+   same step in which the slot data is observable.
+
+#### Failure modes (runtime errors)
+
+F1. **Bad direction**: `tl.send(dir="X")` for an uninstalled direction
+   → `IpcqInvalidDirection`, simulation aborts.
+F2. **Type mismatch**: dtype/shape/nbytes disagreement between matched
+   send and recv. Not validated by default; opt-in strict mode catches
+   it (`strict_validation: true` in the PE_IPCQ node's attrs).
+F3. **Deadlock detection (timeout-based)**: the simulator empties its
+   schedule while a send/recv is still pending → engine raises
+   `IpcqDeadlock` and embeds a pointer dump.
+F4. **Backend init failure**: missing `defaults.algorithm`, missing
+   `algorithms[name]`, module import failure, topology validation
+   failure (I3, I4) — all raised at `init_process_group` time.
+F5. **Slot full + infinite backpressure**: the peer never recvs.
+   Surfaces as F3 timeout.
+
+#### Diagnostics
+
+- **CCL trace**: `KERNBENCH_CCL_TRACE=1` logs each send/recv as
+  `(rank, t, dir, nbytes)`.
+- **Pointer dump**: `kernbench.ccl.diagnostics.pointer_dump(engine)`
+  prints every PE_IPCQ ring buffer's `my_head`, `my_tail`,
+  `peer_head_cache`, `peer_tail_cache`.
+- **Deadlock dump**: on hang the engine includes the pointer dump in
+  the `IpcqDeadlock` exception message.
+
+### D15. Algorithm-author cheat sheet
+
+Full step-by-step lives in
+[`docs/ccl-author-guide.en.md`](../ccl-author-guide.en.md). The
+shortest version:
+
+| Things you touch | Things you don't |
+|------------------|-------------------|
+| `src/kernbench/ccl/algorithms/<your_algo>.py` (`kernel`, `kernel_args`, optional `neighbors`) | `benches/ccl_allreduce.py` host code |
+| One entry in `ccl.yaml` + optionally `defaults.algorithm` | `src/kernbench/ccl/` framework |
+| (Optional) `tests/test_<your_algo>.py` mock test | PE_IPCQ component, AhbmCCLBackend |
+
+5-step flow: write the kernel → register in `ccl.yaml` → optional
+`neighbors` override → optional mock unit test → SimPy validation via
+`kernbench run --bench ccl_allreduce --verify-data`.
+
+Common mistakes: using a direction that wasn't installed, sends
+without matching recvs (deadlock), dtype/shape disagreement, assuming
+fairness from `tl.recv()` round-robin, confusing
+`tl.num_programs(axis)` with the CCL group size.
+
+---
+
+## Non-goals
+
+- **Host collective**: a model where `dist.all_reduce` itself moves
+  data on the host side is out of scope. This ADR only covers
+  communication that happens inside the PE kernel.
+- **All-reduce algorithms**: ring / tree / etc. live in algorithm
+  modules and can be added without amending this ADR.
+- **Reliability / error handling**: link faults, send/recv failure
+  recovery, etc. are out of scope.
+- **NoC arbiter precision**: dynamic VC contention is left for a future
+  ADR (see D8).
+
+---
+
+## Open questions
+
+- **VC arbitration accuracy** — the first cut uses deterministic
+  chunk interleave + weighted round-robin; heavy contention may report
+  optimistic latency. A NoC arbiter component can be added later.
+- **Credit return BW model** — the fast path is currently outside the
+  fabric BW contention model. Can be modeled as a separate link or
+  switched to piggyback (`credit_return_mode: piggyback`).
+- **Ring buffer slot allocation metadata** — whether the host pushes
+  IPCQ buffer metadata via sideband or via a fabric message similar to
+  `MmuMapMsg` is open.
+- **VC BW split default** — 50/50 vs. weighted (e.g. 80/20). Exposed in
+  `ccl.yaml`; default value TBD.
+- **Direction count** — 4 (N/S/E/W) is fixed in the first cut; 6
+  (with Up/Down for 3D) or N (variable) is future work.
+- **Multi-tile aggregation primitives** — whether
+  `tl.recv_all` or similar is needed for fan-in.
+- **Round-robin recv fairness** — current weak fairness can starve;
+  strict fairness counter is future work.
+- **Deadlock detection precision** — currently timeout-based; a
+  realtime wait-for graph would enable deterministic detection.
+
+---
+
+## Consequences
+
+### Positive
+
+- PE-to-PE direct communication enables CCL kernels to be written.
+- Host stays minimal (just `launch`), synchronization happens inside
+  the PE → strong compute / comm overlap.
+- VCs eliminate HoL blocking → collective latency is not blocked by
+  compute traffic.
+- Buffer placement and backpressure mode are init-time parameters →
+  easy to benchmark.
+- Four-direction logical neighbors → host is free to map
+  ring/mesh/tree algorithms.
+
+### Negative
+
+- One new component (PE_IPCQ) and a redesigned PE_DMA (VCs).
+- IPCQ memory cost = 8 rings × `slot_size` × `n_slots` per PE.
+- VC arbitration is a first-order approximation; heavy contention
+  scenarios may report slightly optimistic latency vs real HW (D8).
+- Chunk-level interleave makes PE_DMA implementation more complex.
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `topology.yaml` | Add `pe_ipcq` to `pe_template`, plus the IPCQ ↔ DMA / CPU / TCM edges. |
+| `components.yaml` | Register `pe_ipcq_v1`. |
+| `src/kernbench/topology/builder.py` | Wire the IPCQ chain into PE-internal edges. |
+| `src/kernbench/components/builtin/pe_ipcq.py` | New. |
+| `src/kernbench/components/builtin/pe_dma.py` | Add VCs, handle `IpcqDmaToken`. |
+| `src/kernbench/common/pe_commands.py` | `IpcqSendCmd`, `IpcqRecvCmd`, `IpcqDmaToken`. |
+| `src/kernbench/triton_emu/tl_context.py` | `tl.send` / `tl.recv` API. |
+| `src/kernbench/runtime_api/distributed.py` | Eager IPCQ install in `AhbmCCLBackend.__init__`. |
+| `src/kernbench/runtime_api/kernel.py` | `IpcqInitMsg` definition. |
+| `src/kernbench/ccl/__init__.py` | New CCL package. |
+| `src/kernbench/ccl/topologies.py` | Builtin topology generators + `resolve_topology()`. |
+| `src/kernbench/ccl/helpers.py` | Algorithm-author helpers (`chunked`, `ring_step`, `tree_step`). |
+| `src/kernbench/ccl/testing.py` | Mock CCL runtime (`run_kernel_in_mock`). |
+| `src/kernbench/ccl/algorithms/*.py` | Algorithm modules (kernel + `kernel_args` + optional `neighbors`). |
+| `ccl.yaml` | Algorithm metadata + IPCQ defaults. |
+| `tests/test_pe_ipcq.py` | PE_IPCQ unit tests. |
+| `tests/test_pe_dma_vc.py` | PE_DMA VC tests. |
+| `tests/test_ipcq_e2e.py` | end-to-end send/recv tests. |
+| `tests/test_ccl_topologies.py` | Builtin topology generator tests. |
+| `tests/test_ccl_allreduce_matrix.py` | Unified bench × algorithm matrix. |
diff --git a/docs/adr/ADR-0023-ipcq-pe-collective.md b/docs/adr/ADR-0023-ipcq-pe-collective.md
new file mode 100644
index 0000000..77369dd
--- /dev/null
+++ b/docs/adr/ADR-0023-ipcq-pe-collective.md
@@ -0,0 +1,1220 @@
+# ADR-0023: PE-level IPCQ — Inter-PE Collective Communication
+
+## Status
+
+Proposed
+
+## Context
+
+### 목표
+
+CCL (Collective Communication Library) 커널을 PE 안에서 실행할 수 있도록
+PE 간 데이터 교환 인프라를 추가한다. 호스트는 그저 각 SIP에 커널을 launch만 하고,
+실제 동기화와 데이터 이동은 **PE 커널 안에서 IPCQ(Inter-Process Communication
+Queue)를 통해** 일어난다.
+
+이는 NCCL이 GPU 커널 안에서 NVLink 통신을 수행하는 모델, 또는 Cerebras/Tenstorrent의
+core-local 통신 큐와 유사하다. 호스트 레벨 collective(`dist.all_reduce`)는
+**미래 작업**으로 미루고, 본 ADR은 커널 collective 인프라에만 집중한다.
+
+### 현재 상태
+
+- ADR-0021 PE 파이프라인 리팩토링: PE 내부가 컴포넌트 단위로 분리됨
+  (PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH, PE_TCM, PE_MMU)
+- PE 간 직접 통신 채널 없음. 모든 데이터 이동은 PE_DMA → cube_noc/UCIe/PCIE → HBM 경로
+- 호스트 CCL skeleton (ADR 없음, ad-hoc 구현): `dist.init_process_group(backend="ahbm")`,
+  `_run_ccl_bench`가 rank별 greenlet로 동시 실행. collective는 stub 상태.
+
+### 풀어야 할 문제
+
+1. PE 간 직접 데이터 이동 (peer's memory에 write)
+2. 동기화 — 송신 측이 수신 측 buffer 공간을 확인해야 함 (backpressure)
+3. compute traffic과 communication traffic의 자원 경쟁 (Head-of-Line blocking)
+4. 호스트가 알고리즘에 따라 (ring/mesh/tree) 논리적 neighbor 토폴로지를 구성할 수 있어야 함
+
+---
+
+## Decision
+
+### D1. PE_IPCQ 컴포넌트 신규 추가
+
+PE 안에 새 컴포넌트 `PE_IPCQ`를 추가한다. PE_GEMM/PE_MATH가 PE_CPU의
+sub-block을 별도 컴포넌트로 모델링하는 것과 동일한 패턴이다.
+
+```
+PE
+├── PE_CPU
+├── PE_SCHEDULER
+├── PE_DMA
+├── PE_IPCQ          ← 신규
+├── PE_FETCH_STORE
+├── PE_GEMM
+├── PE_MATH
+├── PE_TCM
+├── PE_MMU
+```
+
+**역할 분리** (control plane vs data plane):
+- **PE_IPCQ (control plane)**: ring buffer 주소 계산, head/tail pointer 관리,
+  peer pointer 캐시, backpressure 결정, 4-방향 neighbor 매핑
+- **PE_DMA (data plane)**: 실제 데이터를 cube_noc/UCIe/PCIE 경유로 peer 메모리에 전송
+
+PE_IPCQ는 데이터 이동을 직접 수행하지 않고 PE_DMA에 위임한다.
+
+### D2. Ring Buffer 모델
+
+각 PE는 4-방향(N/S/E/W) × {tx, rx} = 총 8개의 ring buffer를 가진다.
+
+```python
+@dataclass
+class IpcqQueuePair:
+    direction: Direction          # N/S/E/W
+    peer: IpcqEndpoint            # init 시 호스트가 설정 (D2.5)
+    tx_buffer_base: int           # 내가 보낼 데이터의 base addr (자기 메모리)
+    rx_buffer_base: int           # 내가 받을 데이터의 base addr (자기 메모리)
+    slot_size: int                # tile 단위
+    n_slots: int                  # ring depth
+    my_head: int                  # 내 send 위치 (다음에 쓸 tx/peer slot)
+    my_tail: int                  # 내 recv 위치 (다음에 읽을 rx slot)
+    peer_head_cache: int          # 캐시: peer가 마지막으로 보낸 head 위치 (D9 piggyback으로 갱신)
+    peer_tail_cache: int          # 캐시: peer가 마지막으로 소비한 tail 위치 (D9 fast path credit으로 갱신)
+```
+
+**필드명 규약 (canonical)**: 본 ADR 전체에서 다음 4개 이름을 일관되게 사용한다.
+
+| 필드 | 소유자 | 갱신 시점 |
+|------|--------|----------|
+| `my_head` | 자기 PE_IPCQ | tl.send 호출 후 즉시 (송신 추적용) |
+| `my_tail` | 자기 PE_IPCQ | tl.recv 호출 후 즉시 (수신 추적용) |
+| `peer_head_cache` | 자기 PE_IPCQ | IpcqMetaArrival 도착 시 (D9 piggyback) |
+| `peer_tail_cache` | 자기 PE_IPCQ | IpcqCreditMetadata 도착 시 (D9 fast path) |
+
+다른 표현(`peer_head_local`, `peer_head`, `peer_tail` 등)은 사용하지 않는다.
+
+**Slot 단위**: fixed-size, 한 slot이 한 tile 데이터를 통째로 담는다.
+descriptor 모델이 아니라 **full data embedding** 모델 (D5에서 상세).
+
+### D2.5. PeAddress / IpcqEndpoint 스키마
+
+`IpcqQueuePair.peer`가 가져야 할 정보를 명시한다. 송신 측 PE_IPCQ가
+peer rx slot에 직접 DMA write하려면 다음을 모두 알아야 한다.
+
+```python
+@dataclass(frozen=True)
+class IpcqEndpoint:
+    """송신 측이 peer's rx_buffer 주소를 계산하기 위해 필요한 모든 정보."""
+    sip: int                     # 목적지 SIP
+    cube: int                    # 목적지 cube
+    pe: int                      # 목적지 PE (cube 내 local index)
+    buffer_kind: str             # "tcm" | "hbm" | "sram" — 어느 메모리 공간
+    rx_base_pa: int              # peer rx_buffer base의 PA (PhysAddr.encode())
+    rx_base_va: int              # peer rx_buffer base의 VA (선택, MMU 사용 시)
+    n_slots: int                 # peer ring depth (경계 wrap-around 계산용)
+    slot_size: int               # peer slot 크기 (offset 계산용)
+```
+
+`IpcqQueuePair`의 `peer` 필드는 이 `IpcqEndpoint` 객체를 들고 있다.
+주소 계산은:
+
+```python
+slot_idx = self.my_head % peer.n_slots
+dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
+```
+
+PE_IPCQ는 이 dst_pa를 `IpcqDmaToken`의 dst_addr로 PE_DMA에 전달한다.
+PE_DMA(vc_comm)는 fabric 라우팅(cube_noc/UCIe/PCIE)을 통해 dst_pa로 데이터를 전송한다.
+
+**Endpoint 생성 시점**: backend init (D10)에서 모든 PE의 IPCQ buffer를
+allocator로 할당받고, 각 rank의 neighbor table을 만들 때 peer rank의
+endpoint 정보를 install한다. 즉 install 순서는:
+
+1. **모든 rank의 IPCQ buffer 할당** (각 PE의 buffer_kind 메모리 공간에서)
+2. **rank별 endpoint table 구성** (자신의 4-방향 peer가 어느 sip/cube/pe/pa를 갖는지)
+3. **PE_IPCQ에 install** (`IpcqInitMsg` via fabric or sideband)
+
+이 순서는 모든 rank가 서로의 PA를 알아야 하므로, 단계 1을 모든 rank에 대해
+먼저 끝낸 후 단계 2-3을 진행한다.
+
+### D3. 4-방향 매핑 = 논리적 ProcessGroup
+
+PE는 4방향(N/S/E/W)을 logical port로 본다. 실제 peer 주소는 호스트 CCL init이
+알고리즘에 따라 설정한다. PE 커널은 토폴로지를 알지 못하고 방향만 사용한다.
+
+```python
+# 호스트 init 예시 — 1D ring
+for rank in range(world_size):
+    ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size])
+    ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size])
+
+# 호스트 init 예시 — 2D mesh
+for r in range(R):
+    for c in range(C):
+        ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c))
+        ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c))
+        ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C))
+        ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C))
+```
+
+PE 코드 입장에서 `tl.send(dir="E", ...)`가 어디로 가는지는 알 필요가 없다.
+
+### D4. PE 커널 API
+
+```python
+# Send (blocking, backpressure 발생 가능)
+tl.send(dir: str, src_addr: int, nbytes: int) -> None
+
+# Recv (blocking)
+data = tl.recv(dir: str)            # 특정 방향에서 수신
+data = tl.recv()                    # 4방향 round-robin, 도착한 첫 tile 반환
+
+# Recv (non-blocking)
+handle = tl.recv_async(dir: str)
+data = tl.wait(handle)
+```
+
+`tl.recv()` (방향 미지정)는 IPCQ가 last_polled_dir 인덱스를 들고 있다가
+다음 호출 시 그 다음 방향부터 검사하면서 데이터 있는 첫 슬롯을 반환한다.
+4방향 모두 비어있으면 wait.
+
+**Fairness는 weak fairness**: polling 시작 방향을 회전시켜 단순 편향을
+완화하지만, 한 방향에 데이터가 항상 먼저 도착하면 다른 방향이 starvation될
+수 있다. strict fairness가 필요한 알고리즘은 `tl.recv(dir=...)`로 방향을
+명시해야 한다. (Open Questions 참조)
+
+### D5. Single-hop DMA Write + Full-data Slot 모델
+
+데이터는 송신 측 메모리에서 수신 측 ring slot으로 **단일 DMA 전송**으로
+이동한다. 핵심 속성:
+
+- **Single-hop**: 송신 측 IPCQ가 peer rx slot 주소를 직접 알고 있어 한 번의
+  fabric DMA로 데이터가 도착한다.
+- **No CPU memcpy**: CPU가 데이터를 복사하지 않는다.
+- **No intermediate staging**: 송신/수신 어느 쪽에도 별도 staging buffer가
+  없다 (송신은 자기 source 주소에서 직접, 수신은 자기 ring slot으로 직접).
+
+(엄밀히 말하면 fabric DMA write 자체는 발생하므로 "data movement가 전혀 없다"는
+의미는 아니다. NCCL의 "zero-copy"가 가리키는 것 — CPU memcpy / staging copy
+부재 — 과 동일한 속성이다.)
+
+데이터 이동 모델:
+
+```
+PE A: tl.send(E, src_addr, nbytes)
+  1. IPCQ가 peer rx slot 주소 계산
+       dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size
+  2. backpressure: my_head - peer_tail_cache < peer.n_slots ?
+                   (꽉 찼으면 sleep/poll)
+  3. PE_DMA(vc_comm)에 DMA 요청 → src_addr에서 peer의 dst_addr로 nbytes 전송
+  4. my_head += 1
+
+PE B: data = tl.recv(W)
+  1. 내 rx_buffer[my_tail % n_slots] 위치 확인
+  2. 데이터 도착 대기 (D7 backpressure 모드)
+  3. 그 주소를 PE 커널에 반환 (또는 fetch unit으로 register file에 로드)
+  4. my_tail += 1
+  5. credit return fast path 발행 (D9) — bottleneck-BW latency 후
+     peer A의 peer_tail_cache 갱신
+```
+
+**핵심**: Slot에 데이터가 통째로 들어간다. PE B의 recv는 자기 rx_buffer만
+읽으면 되고, A의 메모리를 read하지 않는다. 송신 측 IPCQ가 peer rx slot
+주소를 알고 있으므로 직접 그 주소로 DMA write한다 (single-hop).
+
+본인의 PE_TCM read/write는 DMA를 거치지 않는다 (PE에 직접 붙어있음).
+slot이 본인 TCM에 있으면 직접 접근, 아니면 PE_DMA 경유.
+
+### D6. Buffer 위치 — 3-way benchmark
+
+호스트 CCL init이 IPCQ ring buffer의 메모리 위치를 결정한다:
+
+```python
+ipcq_init(
+    backend="ahbm",
+    buffer_kind="tcm" | "hbm" | "sram",
+    n_slots=8,
+    slot_size=4096,
+)
+```
+
+| 위치 | 특징 | trade-off |
+|------|------|-----------|
+| **PE_TCM** | PE에 직접 붙음, 빠름 | 작음, PE 내부 자원과 경쟁 |
+| **PE-local HBM** | 큼, DMA 경유 | latency 큼 |
+| **Cube SRAM** | 중간 크기, cube-shared | cube 내 PE 간 contention |
+
+세 위치 모두 동일 코드로 동작하며 init만 다르다. 벤치마크로 비교 가능.
+
+**규칙**: peer가 read/write할 때는 DMA 경유. 본인이 자기 PE_TCM 읽기/쓰기는
+DMA 없음.
+
+### D7. Backpressure — 2-mode benchmark
+
+송신 측이 peer slot full을 감지했을 때, 또는 수신 측이 데이터 미도착을
+감지했을 때 어떻게 대기하는가:
+
+| 모드 | 동작 | 모델 |
+|------|------|------|
+| **poll** | 캐시된 peer pointer를 주기적으로 재확인. cache update event를 폴링 | spin loop |
+| **sleep** | SimPy event를 yield하고 sleep, peer가 update event를 trigger하면 wake | interrupt-like |
+
+```python
+ipcq_init(backpressure="poll" | "sleep", ...)
+```
+
+두 모드 모두 구현하여 latency/throughput trade-off를 벤치마크할 수 있다.
+
+### D8. PE_DMA Virtual Channel
+
+PE_DMA를 단일 큐에서 **2-channel virtual channel** 모델로 확장한다.
+
+```
+PE_DMA
+├── vc_compute: GEMM/MATH의 tile load/store/writeback
+└── vc_comm:    IPCQ의 send 데이터
+```
+
+각 VC는 독립적인 state machine을 가진다:
+- 한 채널이 stall되어도 다른 채널은 진행
+- 동일 link(cube_noc, UCIe 등)는 공유하지만, link BW는 두 채널이 분할 사용
+
+**Chunk 단위 인터리브**:
+- 큰 GEMM tile DMA가 한 번에 link를 점유하지 않음
+- chunk_size 단위로 진행 (예: 256B), 매 chunk마다 다른 VC와 link BW 공유
+- chunk_size는 init 파라미터 (작을수록 fair, 클수록 효율)
+
+이로써:
+- HoL blocking 해소 (compute DMA 진행 중에도 IPCQ send 끼어들 수 있음)
+- compute/comm overlap 자연스러움 (NVIDIA copy engine + compute SM 패턴)
+- HW 모델 정합 (NoC virtual channel은 실제 HW 기법)
+
+**첫 구현의 정확도 한계 (intentional)**:
+
+본 ADR의 첫 구현은 **deterministic chunk-level interleave + weighted
+round-robin arbitration** (default 50/50, ccl.yaml에 노출)을 채택한다.
+이는 first-order approximation이며, 실제 HW의 dynamic contention/credit-based
+arbitration보다는 단순화된 모델이다.
+
+| 모델링 항목 | 첫 구현 | 향후 확장 가능 |
+|------------|---------|---------------|
+| VC 간 BW 분할 | 정적 weight | dynamic contention 기반 |
+| Chunk 단위 인터리브 | deterministic round-robin | priority/QoS 기반 |
+| Cross-VC dependency | 없음 (독립) | NoC arbiter component 추가 |
+
+이 단순화는 functional correctness에는 영향이 없으며, latency 측정에서
+heavy contention 시나리오는 실제보다 약간 optimistic한 결과를 낼 수 있다.
+정밀화가 필요하면 별도 ADR로 NoC arbiter를 도입한다.
+
+#### Token routing
+
+- compute용 token (TileToken): 기존 PE_FETCH_STORE → PE_DMA 체이닝 그대로
+- comm용 token (IpcqDmaToken, 신규): PE_IPCQ → PE_DMA로 self-routing
+- PE_DMA가 token 종류로 채널 결정
+
+```python
+class PeDmaComponent:
+    def _process(self, env, token):
+        if isinstance(token, IpcqDmaToken):
+            yield from self._vc_comm_process(env, token)
+        else:
+            yield from self._vc_compute_process(env, token)
+```
+
+### D9. Pointer 동기화 — DMA payload piggyback
+
+실제 하드웨어(NVLink, UCIe 등)는 DMA 메시지의 payload에 메타데이터를
+piggyback하여 송수신과 함께 pointer를 갱신한다. 본 시뮬레이션도 같은 모델을
+채택하여 **별도의 control 채널 없이** 메타데이터가 data와 함께 도착하도록 한다.
+
+이 모델의 핵심 이점:
+
+- **자동 ordering**: 메타데이터가 data와 동일 token으로 이동하므로 data가
+  먼저 visible해진 다음에야 head_cache가 갱신된다. 별도 ordering invariant
+  없이 race condition이 원천 차단된다.
+- **HW 정합**: 실제 NVLink/UCIe의 piggybacked header 모델과 일치
+- **컴포넌트 단순화**: 별도 IpcqPtrUpdate event 종류가 필요 없음
+
+#### Send 흐름 (head 측 piggyback)
+
+```
+PE A: tl.send(E, src_addr, nbytes)
+  1. PE_IPCQ가 backpressure 체크 (peer_tail_cache 기준)
+  2. PE_IPCQ가 IpcqDmaToken 생성:
+     - data 본체 (src_addr → peer dst_addr)
+     - piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction)
+  3. PE_DMA(vc_comm)에 token put
+  4. PE A는 자기 my_head++ (송신 추적용)
+
+[fabric DMA: latency 만큼 진행]
+
+PE B의 PE_DMA가 token 수신
+  5. data를 dst_addr (B의 rx slot)에 MemoryStore.write
+  6. token의 metadata를 PE B의 PE_IPCQ로 forward (PE 내부 wire, ~1 cycle)
+
+PE B의 PE_IPCQ가 metadata 수신
+  7. peer_head_cache 갱신 (= A의 head 위치)
+  8. 대기 중인 recv (해당 direction)가 있으면 wake
+```
+
+여기서 핵심은 **5와 6은 같은 SimPy step**이라는 것이다 — DMA 완료와 동시에
+data와 metadata가 atomic하게 visible해진다.
+
+#### Recv 흐름 (credit return — fast path with bottleneck-BW latency)
+
+수신측이 slot을 비우면 송신측은 그 사실을 알아야 한다 (backpressure 해제).
+data 경로의 piggyback 모델과 달리, credit return은 일반 vc_comm fabric을
+거치지 않고 **별도 fast path**로 처리한다. 이는 실제 HW의 NVLink/UCIe
+credit return fast path를 추상화한 것이다.
+
+**Latency 계산**: magic constant가 아니라 **라우팅 경로의 bottleneck BW**
+기준으로 산출한다.
+
+```
+credit_size_bytes = 16  (ccl.yaml: ipcq_credit_size_bytes)
+path = router.find_path(self_pe, peer_pe)
+latency = compute_drain_ns(path, credit_size_bytes)
+        = credit_size_bytes / bottleneck_bw_on_path
+```
+
+이로써:
+- **토폴로지 비례 approximation**: cube 내 credit return과 cross-SIP credit이
+  자동으로 다른 latency를 가짐 (정확한 값은 아니지만 magic constant보다 의미 있음)
+- **Magic constant 없음**: 별도 `ipcq_ctrl_latency_ns` 같은 임의 값 불필요
+- **Deadlock 위험 없음**: piggyback과 달리 B가 A에게 보낼 데이터가 없어도
+  credit이 자동 발행됨
+- **기존 utility 재사용**: `ComponentContext.compute_drain_ns` 그대로 사용
+
+```
+PE B: tl.recv(W) → 데이터 가져감 → my_tail++
+
+PE B의 PE_IPCQ:
+  1. router로 PE A까지 path 계산
+  2. compute_drain_ns(path, credit_size_bytes) = latency_ns
+  3. env.process(self._delayed_credit_send(env, peer_credit_store, my_tail, latency_ns))
+
+[fast path: latency_ns 만큼 timeout, fabric vc 미사용]
+
+PE A의 PE_IPCQ가 자기 credit_store에서 IpcqCreditMetadata 수신:
+  4. peer_tail_cache 갱신
+  5. 대기 중인 send (해당 direction)가 있으면 wake
+```
+
+#### Component 결합도 — SimPy Store 채널
+
+PE B의 PE_IPCQ가 PE A의 PE_IPCQ를 직접 호출하지 않는다. 대신 **init 시점에
+양쪽 PE_IPCQ 사이에 SimPy Store를 한 번 wire**해두고 (양방향 fast path 채널),
+credit metadata는 그 store로 put한다.
+
+```python
+class PeIpcqComponent:
+    def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns):
+        yield env.timeout(latency_ns)
+        yield peer_credit_store.put(IpcqCreditMetadata(consumer_seq=my_tail, ...))
+```
+
+backend init에서 IpcqInitMsg fan-out 시 양방향 fast path channel을 함께
+설치한다 (D12 IpcqInitMsg에 명시).
+
+#### Credit return fast path의 한계
+
+- `credit_size_bytes`는 estimate. 보통 16-64 bytes로 충분하며, 실제 HW의
+  credit return wire 크기를 모방한 값.
+- fast path는 일반 vc_comm BW contention 모델에서 **제외**된다 (별도 채널).
+  실제 HW의 credit return wire는 매우 lightweight이므로 1차 근사로 합리적.
+- 정밀화가 필요하면 후속 ADR에서:
+  - credit fast path를 별도 link로 모델링 (BW limit + contention)
+  - 또는 piggyback 모드로 변경 가능 (`credit_return_mode: piggyback`)
+
+#### PE_DMA의 책임 추가
+
+PE_DMA(vc_comm)는 token 수신 시 다음 atomic 시퀀스로 처리한다.
+**두 동작 사이에 SimPy yield를 두어서는 안 된다** (I6 MUST 규칙 참조):
+
+```python
+def _on_vc_comm_recv(self, env, token):
+    # ── ATOMIC: 두 동작 사이에 yield 금지 ──
+    # 1. data를 dst_addr에 write (dst의 메모리 공간은 token.dst_endpoint.buffer_kind)
+    data = self._memory_store.read(token.src_space, token.src_addr,
+                                    shape=..., dtype=...)
+    self._memory_store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)
+    # 2. token의 metadata를 자기 PE의 IPCQ로 forward
+    yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token))
+    # ─────────────────────────────────────
+```
+
+`out_ports[ipcq_id].put`은 SimPy Store의 yield-able 호출이지만, PE 내부
+wire로 capacity가 unbounded인 store를 사용하므로 즉시 완료된다 (실질적으로
+single-step). 이 최종 put이 atomic 구간의 끝이며, 그 이전에 다른 yield가
+삽입되면 안 된다.
+
+#### Backpressure latency 정확도
+
+backpressure 해제까지 걸리는 시간:
+
+- **데이터 send 측 latency** = full fabric DMA (data + piggyback metadata 함께)
+- **Credit return 측 latency** = fast path with bottleneck-BW
+  (`credit_size_bytes / bottleneck_bw_on_path`)
+
+| 시나리오 | 모델링된 latency | 실제 HW와의 관계 |
+|---------|----------------|----------------|
+| Cube 내 (fast link) | 작음 (bottleneck = cube_noc BW) | topology-aware approximation |
+| Cross-cube (UCIe) | 중간 (bottleneck = UCIe BW) | topology-aware approximation |
+| Cross-SIP (PCIE) | 큼 (bottleneck = PCIE BW) | topology-aware approximation |
+
+별도 magic latency 파라미터 없이 토폴로지에 비례한 first-order
+approximation이 자동으로 반영된다. 실제 HW와 정확히 일치하지는 않지만
+(credit fast path는 contention 모델에서 제외, credit_size_bytes는 estimate),
+magic constant 모델보다 훨씬 의미 있는 비교 가능. 정밀화는 후속 ADR로
+넘긴다.
+
+### D9.5. ADR-0020 (2-Pass) 통합
+
+`tl.send/recv`는 ADR-0020의 2-pass 모델과 통합되어야 한다. Phase 1은
+타이밍과 실제 데이터 이동(MemoryStore) 모두 모델링하고, Phase 2는 op_log
+기반 정합성 검증을 가능케 한다.
+
+#### Phase 1 (타이밍 + 데이터 이동)
+
+D9는 head 갱신과 tail 갱신을 다른 메커니즘으로 모델링한다:
+
+- **Send-side (head update)** — DMA payload piggyback. data write와 metadata
+  forward가 동일 SimPy step에 일어나므로 자동으로 atomic visibility 보장.
+- **Recv-side (tail credit return)** — fast path SimPy Store 채널.
+  bottleneck-BW 기반 latency 후 peer_tail_cache 갱신.
+
+두 메커니즘을 합쳐서 전체 ring buffer pointer 일관성을 유지한다.
+
+**send 시**:
+
+1. PE_IPCQ가 backpressure 체크 (peer_tail_cache 기준)
+2. PE_IPCQ가 IpcqDmaToken 생성 (data + piggyback metadata) → PE_DMA(vc_comm)에 put
+3. PE_DMA가 fabric DMA 시뮬레이션 (latency 진행)
+4. **DMA 완료와 동일한 SimPy step에 atomic 시퀀스**:
+   - **MemoryStore.write(buffer_kind, dst_pa, data)** — single-hop DMA write
+   - 수신측 PE_IPCQ에 metadata forward → peer_head_cache 갱신 → 대기 recv wake
+5. **op_log 기록**: `OpRecord(op_kind="ipcq", op_name="send", params={src_space, src_addr, dst_space, dst_addr, nbytes, dir, dtype, shape, sender_seq})`
+   - `dst_space`는 `token.dst_endpoint.buffer_kind`에서 derive된 값이다
+     (별도 token 필드가 아니다). dst_addr은 `token.dst_addr`.
+
+**recv 시**:
+
+1. PE_IPCQ가 (peer_head_cache > my_tail) AND (MemoryStore.has(slot_addr)) 조건 대기
+   (D9 piggyback 모델에서는 두 조건이 같은 step에 truthy가 되지만, defensive check)
+2. 조건 만족 시: `slot_addr = my_rx_base + slot_idx * slot_size`
+3. **두 가지 모드** (`recv_mode`로 op_log에 기록):
+   - **`return_slot`** (default): slot_addr을 그대로 PE 커널에 반환.
+     데이터 복사 없음. 커널이 slot 메모리를 직접 사용한다.
+   - **`copy_to_dst`**: 호출 시 dst_addr이 지정된 경우. slot 데이터를 읽어서
+     dst_addr에 write. `data = memory_store.read(...)`; `memory_store.write(dst_space, dst_addr, data)`
+4. PE_IPCQ가 my_tail++, fast path credit return을 발행 (D9 — vc_comm
+   fabric을 거치지 않고 별도 SimPy Store 채널로 bottleneck-BW latency 후
+   peer 측 peer_tail_cache 갱신)
+5. **op_log 기록**: `OpRecord(op_kind="ipcq", op_name="recv", params={recv_mode, src_space, src_addr, dst_space, dst_addr, nbytes, dir, dtype, shape, consumer_seq})`
+   - `recv_mode="return_slot"`: src_space/src_addr가 slot 위치, dst_addr=None
+   - `recv_mode="copy_to_dst"`: src_space/src_addr가 slot 위치, dst_space/dst_addr가 사용자 지정 위치
+
+#### Phase 2 (op_log replay)
+
+DataExecutor가 `op_kind="ipcq"` 레코드를 만나면:
+
+- **send**: src → dst (peer rx slot)로 ndarray를 idempotent하게 write
+- **recv (`recv_mode="return_slot"`)**: no-op. slot 데이터는 Phase 1에서
+  이미 적절한 위치에 있으며, 커널이 해당 slot 메모리를 직접 사용함.
+- **recv (`recv_mode="copy_to_dst"`)**: slot → dst_addr로 ndarray를 idempotent
+  하게 copy
+
+본질적으로 IPCQ는 **데이터 이동**만 하므로 Phase 2가 추가로 계산할 것은 없다.
+DataExecutor의 GEMM/Math가 그 데이터를 사용하면 자동으로 정합성이 검증된다.
+
+```python
+class DataExecutor:
+    def _execute_op(self, op):
+        if op.op_kind == "ipcq":
+            self._execute_ipcq(op)
+        elif op.op_kind == "memory":
+            ...
+        elif op.op_kind == "gemm":
+            ...
+
+    def _execute_ipcq(self, op):
+        """IPCQ ops are data movement; Phase 1 already wrote to MemoryStore."""
+        p = op.params
+        if op.op_name == "send":
+            data = self.store.read(p["src_space"], p["src_addr"],
+                                   shape=p["shape"], dtype=p["dtype"])
+            self.store.write(p["dst_space"], p["dst_addr"], data)
+        elif op.op_name == "recv":
+            if p.get("recv_mode") == "copy_to_dst":
+                data = self.store.read(p["src_space"], p["src_addr"],
+                                       shape=p["shape"], dtype=p["dtype"])
+                self.store.write(p["dst_space"], p["dst_addr"], data)
+            # recv_mode == "return_slot": no-op (data already in slot)
+```
+
+#### `--verify-data` 흐름 (CCL 커널)
+
+```
+1. kernbench run --bench ccl_allreduce --verify-data
+2. backend init → IPCQ buffers 할당, neighbor table install
+3. 모든 rank greenlet 동시 실행
+4. 각 PE 커널이 tl.send/recv → MemoryStore에 데이터 누적
+5. 시뮬레이션 완료 후 DataExecutor.run() → ipcq op 멱등 replay (no-op)
+6. 벤치마크가 print(out) 또는 out.data 비교 → 정합성 확인
+```
+
+벤치 작성자는 `out.data`로 결과를 읽고 expected와 비교하면 된다 (ADR-0020 D7
+Tensor.data 패턴).
+
+### D10. 호스트 CCL Init은 PyTorch 패턴 그대로
+
+호스트 코드는 실제 PyTorch distributed 코드와 동일하게 유지한다.
+`init_process_group`은 backend 객체만 만들고, IPCQ 설정 (neighbor topology,
+buffer_kind, backpressure 등)은 받지 않는다.
+
+```python
+# benches/ccl_allreduce.py — 실제 PyTorch와 동일한 호스트 코드
+def run_rank(rank, world_size, torch):
+    dist = torch.distributed
+    dist.init_process_group(backend="ahbm", world_size=world_size, rank=rank)
+
+    tensor = torch.zeros((M, K), dtype="f16", dp=...)
+
+    from kernbench.ccl.algorithms import ring_allreduce
+    torch.launch("ring_allreduce", ring_allreduce.kernel, tensor, rank, world_size)
+```
+
+IPCQ 설정은 backend가 **init_process_group 시점에** `ccl.yaml`을 읽고 즉시
+PE_IPCQ neighbor table을 install한다. 호스트 코드는 IPCQ를 인지할 필요가 없다.
+
+벤치마크 하나는 하나의 알고리즘을 사용하는 것을 가정하며, 사용할 알고리즘은
+`ccl.yaml`의 `defaults.algorithm` 으로 지정한다 (D11). 호스트 코드 변경 없이
+ccl.yaml만 수정하여 다른 알고리즘으로 교체할 수 있다.
+
+#### Init 흐름 (eager)
+
+1. `init_process_group(backend="ahbm")` 호출
+2. backend가 `ccl.yaml` 로드 → `defaults.algorithm` 결정
+3. `algorithms[<algo>]`에서 topology + buffer_kind + backpressure + slot/size 결정
+4. **즉시** 모든 PE의 PE_IPCQ에 neighbor table을 install (sideband 또는 fabric `IpcqInitMsg`)
+5. 이후 `torch.launch(kernel_name, ...)`는 일반 launch와 동일하게 처리
+   (CCL kernel이든 아니든 PE_IPCQ는 이미 준비됨)
+
+### D11. CCL 설정 파일 (`ccl.yaml`)
+
+IPCQ 설정과 알고리즘 metadata는 별도 YAML 파일에 둔다.
+`components.yaml`/`topology.yaml`과 같은 패턴을 유지하며, 변경 이력이 코드처럼
+추적 가능하다.
+
+벤치마크 한 번 실행은 한 알고리즘만 사용한다 (`defaults.algorithm`).
+다른 알고리즘으로 교체하려면 `ccl.yaml`의 `defaults.algorithm` 만 바꾸면 된다.
+
+```yaml
+# ccl.yaml — CCL backend (ahbm) configuration
+#
+# 이 파일은 init_process_group(backend="ahbm") 시점에 로드되며,
+# defaults.algorithm 으로 지정된 알고리즘에 따라 PE_IPCQ neighbor table을
+# install한다. 호스트 코드는 IPCQ 설정을 인지하지 않는다.
+
+defaults:
+  # 이번 벤치 실행에서 사용할 알고리즘. algorithms 섹션에 정의된 것 중 하나.
+  algorithm: ring_allreduce
+
+  # IPCQ ring buffer가 위치할 메모리.
+  #   tcm  — PE-local TCM (작지만 빠름, PE 내부 자원과 경쟁)
+  #   hbm  — PE-local HBM (큼, DMA latency 큼)
+  #   sram — Cube-shared SRAM (중간 크기, cube 내 PE 간 contention)
+  buffer_kind: tcm
+
+  # send/recv가 peer slot full / data 미도착을 만났을 때의 대기 방식.
+  #   poll  — peer pointer 캐시를 spin loop로 재확인
+  #   sleep — SimPy event yield 후 wakeup 대기 (interrupt-like)
+  backpressure: sleep
+
+  # Ring buffer depth (한 방향당 slot 개수). 클수록 in-flight 가능, 메모리 ↑
+  n_slots: 8
+
+  # Slot 하나의 크기 (bytes). 한 tile을 통째로 담을 수 있는 크기여야 함.
+  slot_size: 4096
+
+  # PE_DMA virtual channel chunk 크기 (bytes). 작을수록 fair, 클수록 효율.
+  # IPCQ traffic과 compute traffic 사이의 인터리브 granularity (D8 참조).
+  vc_chunk_size: 256
+
+  # Credit return fast path 메시지 크기 (bytes). 실제 HW의 credit return wire
+  # 크기를 모방. backend가 라우팅 경로의 bottleneck BW를 보고 latency를
+  # 계산한다 (D9 참조). 보통 16-64로 충분.
+  ipcq_credit_size_bytes: 16
+
+algorithms:
+  # ── 알고리즘 정의 ─────────────────────────────────────────────────
+  # 각 entry는 알고리즘 모듈과 그 알고리즘이 요구하는 topology를 명시한다.
+  # 알고리즘별 default override 가능 (buffer_kind, backpressure 등).
+
+  ring_allreduce:
+    # PE 커널이 정의된 모듈. `kernel(t_ptr, rank, world_size, tl)` 함수를 export.
+    module: kernbench.ccl.algorithms.ring_allreduce
+
+    # 이 알고리즘이 요구하는 neighbor topology. builtin 이름 또는 "custom".
+    #   ring_1d        — 1D 양방향 ring (E/W)
+    #   ring_1d_unidir — 1D 단방향 ring (E only)
+    #   mesh_2d        — 2D mesh (N/S/E/W)
+    #   tree_binary    — binary tree (parent/children direction)
+    #   custom         — 모듈의 neighbors(rank, world_size, neighbor_map) 함수 사용
+    topology: ring_1d
+
+  tree_allreduce:
+    module: kernbench.ccl.algorithms.tree_allreduce
+    topology: tree_binary
+    # 알고리즘별 override (이 알고리즘만 hbm 사용)
+    buffer_kind: hbm
+
+  custom_mesh:
+    module: kernbench.ccl.algorithms.custom_mesh
+    topology: custom            # 모듈이 직접 neighbors() 함수 제공
+```
+
+#### 알고리즘 모듈 구조
+
+알고리즘 모듈은 두 개의 hook을 export한다 — `kernel`은 필수, `neighbors`는 선택.
+
+```python
+# src/kernbench/ccl/algorithms/ring_allreduce.py
+
+def kernel(t_ptr, rank, world_size, tl):
+    """필수 — PE 커널.
+
+    IPCQ 설정은 backend가 ccl.yaml + neighbors() 결과로 install한 상태이다.
+    커널은 그저 4-방향 send/recv API만 사용하면 된다.
+    """
+    for step in range(world_size - 1):
+        ...
+        tl.send(dir="E", ...)
+        data = tl.recv(dir="W")
+
+
+def neighbors(rank, world_size, neighbor_map):
+    """선택 — neighbor table override hook.
+
+    backend는 ccl.yaml의 topology 필드에 따라 builtin neighbor_map을 생성한 뒤,
+    이 함수가 정의되어 있으면 호출하여 결과를 override 한다.
+
+    Args:
+        rank: 이 rank의 인덱스
+        world_size: 전체 rank 수
+        neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑
+                      예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws}
+                      mutable dict — 직접 수정 가능
+
+    Returns:
+        dict | None:
+          dict — neighbor_map을 override한 결과
+          None — override 안 함, neighbor_map 그대로 사용
+    """
+    return None  # 또는 수정 후 반환
+```
+
+#### `neighbors` override 패턴
+
+대부분의 알고리즘은 builtin topology만으로 충분하므로 `neighbors` 정의가 필요 없다.
+정의가 필요한 경우의 패턴:
+
+**Pattern A — builtin을 base로 일부만 수정**:
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # 짝수 rank만 W 사용
+    if rank % 2 == 1:
+        neighbor_map.pop("W", None)
+    return neighbor_map
+```
+
+**Pattern B — 완전히 새로 만들기 (skip-connection ring 등)**:
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # neighbor_map은 무시하고 새로 작성
+    return {"E": (rank + 2) % world_size}
+```
+
+#### Builtin topology generators
+
+`ccl.yaml`의 `topology` 필드가 다음 builtin 이름이면 backend가 알아서 처리:
+
+| topology | 설명 | direction set |
+|----------|------|---------------|
+| `ring_1d` | 1D 양방향 ring | E, W |
+| `ring_1d_unidir` | 1D 단방향 ring | E only |
+| `mesh_2d` | 2D mesh | N, S, E, W |
+| `tree_binary` | binary tree (root = rank 0) | parent, child_left, child_right |
+| `none` | 빈 매핑 — 알고리즘이 `neighbors()`로 처음부터 작성 | (없음) |
+
+`topology: none`은 builtin이 빈 dict를 반환하므로 알고리즘의 `neighbors()`가
+처음부터 매핑을 만들어야 한다.
+
+#### 알고리즘 추가 절차
+
+1. `src/kernbench/ccl/algorithms/<algo>.py`에 `kernel` 함수 작성
+2. `ccl.yaml`의 `algorithms` 섹션에 entry 추가 (`module`, `topology`)
+3. (선택) 같은 모듈에 `neighbors()` 함수 추가하여 builtin override
+4. `defaults.algorithm`을 새 알고리즘으로 설정하면 적용
+
+호스트 코드는 손대지 않는다.
+
+### D12. 메시지 / 토큰 스키마
+
+본 ADR이 추가하는 모든 메시지/토큰의 필드를 명시한다. 구현 시 이 정의를
+`src/kernbench/common/pe_commands.py`와 `src/kernbench/runtime_api/kernel.py`에
+그대로 추가한다.
+
+#### `IpcqInitMsg` (sideband, init 시 fan-out)
+
+backend가 모든 PE의 PE_IPCQ에 neighbor table을 install하기 위해 사용한다.
+구조는 `MmuMapMsg`와 유사 (target_sips, target_cubes, target_pe + entries).
+
+```python
+@dataclass(frozen=True)
+class IpcqInitEntry:
+    direction: str               # "N" | "S" | "E" | "W"
+    peer: IpcqEndpoint           # D2.5 참조
+    my_rx_base_pa: int           # 자신의 rx_buffer base
+    my_rx_base_va: int           # 선택
+    n_slots: int
+    slot_size: int
+    # Credit fast path 채널 (D9).
+    # 계약: 이 필드는 반드시 simpy.Store 인스턴스이며, IpcqCreditMetadata
+    # 객체만을 받는 receive endpoint이다 (peer's PE_IPCQ가 자기 입력 큐로
+    # 사용). 송신 측 PE_IPCQ는 _delayed_credit_send에서 이 store에 직접
+    # IpcqCreditMetadata를 put한다. 다른 객체 type을 put해서는 안 된다.
+    # backend init 시 양방향 SimPy Store가 한 번 wire되며 이후 변경 불가.
+    peer_credit_store: "simpy.Store[IpcqCreditMetadata]"
+
+@dataclass(frozen=True)
+class IpcqInitMsg:
+    correlation_id: str
+    request_id: str
+    target_sips: tuple[int, ...]
+    target_cubes: tuple[int, ...]
+    target_pe: int | tuple[int, ...] | str
+    entries: tuple[IpcqInitEntry, ...]   # 이 PE의 4-방향 entry
+    backpressure_mode: str       # "poll" | "sleep"
+    buffer_kind: str             # "tcm" | "hbm" | "sram"
+    credit_size_bytes: int       # D9 fast path latency 계산용 (default 16)
+```
+
+**Credit fast path channel wiring**: backend init이 모든 PE의 PE_IPCQ에
+양방향 fast path 채널을 한 번 설치한다. PE A의 IpcqInitEntry(direction=E)에
+PE B의 credit-receive Store reference를 넣어 송신 측이 직접 put할 수 있게
+한다 (별도 fabric routing 없음).
+
+#### `IpcqSendCmd` (PE_CPU → PE_IPCQ)
+
+```python
+@dataclass(frozen=True)
+class IpcqSendCmd:
+    direction: str               # 어느 방향으로 보낼지
+    src_addr: int                # 보낼 데이터의 원본 주소 (TCM/HBM)
+    src_space: str               # "tcm" | "hbm" | "sram"
+    nbytes: int
+    shape: tuple[int, ...]       # data shape (op_log/MemoryStore용)
+    dtype: str
+    handle_id: str               # completion 추적용
+    data_op: bool = True         # ADR-0020 op_log 기록 대상
+```
+
+#### `IpcqRecvCmd` (PE_CPU → PE_IPCQ)
+
+```python
+@dataclass(frozen=True)
+class IpcqRecvCmd:
+    direction: str | None        # None이면 round-robin (weak fairness, D4)
+    # recv_mode: 두 가지 동작 모드
+    #   "return_slot" — slot 주소를 그대로 PE 커널에 반환 (default, zero-copy)
+    #   "copy_to_dst" — slot 데이터를 dst_addr에 copy 후 반환
+    recv_mode: str = "return_slot"
+    # dst_addr / dst_space는 recv_mode="copy_to_dst"일 때만 사용됨
+    dst_addr: int = 0
+    dst_space: str = ""
+    shape: tuple[int, ...] = ()  # data shape (op_log/MemoryStore용)
+    dtype: str = ""
+    handle_id: str = ""
+    blocking: bool = True        # blocking vs non-blocking
+    data_op: bool = True
+```
+
+#### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm 채널)
+
+D9의 piggyback 모델에 따라 token이 data + head metadata를 함께 담아
+fabric을 따라 이동한다. 수신 측 PE_DMA가 도착 시점에 data를 dst_addr에
+write하고 metadata를 PE_IPCQ로 forward한다 (atomic).
+
+```python
+@dataclass
+class IpcqDmaToken:
+    # ── Data movement (single-hop DMA write) ──
+    src_addr: int                # 자기 메모리 주소
+    src_space: str
+    dst_addr: int                # peer rx slot 주소 (이미 계산됨)
+    dst_endpoint: IpcqEndpoint   # 라우팅용 (sip/cube/pe)
+    nbytes: int                  # data 크기
+    handle_id: str               # 완료 시 송신 측 PE_IPCQ로 알림
+
+    # ── Piggyback metadata (수신측 PE_IPCQ가 자동 갱신할 정보) ──
+    sender_seq: int              # 단조 증가 sequence number
+                                 # peer가 자기 head_cache로 사용
+    src_sip: int                 # 송신 측 (수신측이 어느 peer인지 식별)
+    src_cube: int
+    src_pe: int
+    src_direction: str           # 송신측 기준 방향 (수신측은 reverse 매핑으로 자기 direction 결정)
+
+    data_op: bool = True         # ADR-0020 op_log 기록 대상
+```
+
+PE_DMA는 token type으로 채널 결정 (D8): TileToken → vc_compute, IpcqDmaToken → vc_comm.
+
+**수신 측 PE_DMA의 처리** (vc_comm 도착 시):
+
+```python
+def _vc_comm_arrival(self, env, token: IpcqDmaToken):
+    # 1. data를 dst_addr에 write (data와 metadata atomic visibility)
+    if self._memory_store is not None:
+        data = self._memory_store.read(token.src_space, token.src_addr,
+                                        shape=..., dtype=...)
+        self._memory_store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)
+    # 2. metadata를 자기 PE의 IPCQ로 forward (PE 내부 wire, 같은 step)
+    yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token))
+```
+
+PE_IPCQ는 `IpcqMetaArrival`을 받아 sender_seq를 보고 peer_head_cache를 갱신한다.
+
+#### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path 채널)
+
+Credit return은 D9의 fast path 모델에 따라 vc_comm fabric을 거치지 않고
+**별도의 SimPy Store 채널**로 전달된다. backend init 시 양방향 channel이
+미리 wire되며, latency는 bottleneck-BW 기반으로 계산된다.
+
+```python
+@dataclass(frozen=True)
+class IpcqCreditMetadata:
+    """Credit return — recv 측 → send 측 fast path."""
+    consumer_seq: int            # my_tail (recv 측의 새 tail)
+    src_sip: int                 # 누가 보냈는지 (수신 측이 어느 peer credit인지 식별)
+    src_cube: int
+    src_pe: int
+    src_direction: str           # 송신 측 기준 방향 (수신 측은 reverse 매핑)
+```
+
+**전송 흐름**:
+
+```python
+class PeIpcqComponent:
+    def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns):
+        yield env.timeout(latency_ns)
+        yield peer_credit_store.put(IpcqCreditMetadata(
+            consumer_seq=my_tail, src_sip=..., src_cube=..., src_pe=...,
+            src_direction=...,
+        ))
+```
+
+`latency_ns`는 D9에 정의된 대로:
+
+```python
+path = self.ctx.router.find_path(self_pe_prefix, peer_pe_prefix)
+latency_ns = self.ctx.compute_drain_ns(path, credit_size_bytes)
+```
+
+**별도의 IpcqPtrUpdate 이벤트는 없다** — head 갱신은 D9 piggyback 모델로,
+tail 갱신은 D9 fast path SimPy Store 채널로 처리된다.
+
+### D13. 테스트 전략
+
+ADR-0021의 D8 패턴을 따라 단위/통합/regression 테스트를 명시한다.
+
+#### T1. 단위 테스트 (component-level)
+
+- **PE_IPCQ 단위** (`tests/test_pe_ipcq.py`):
+  - send: backpressure 미발생 시 즉시 PE_DMA로 token forward
+  - send: peer slot full → backpressure (poll/sleep 모드별)
+  - send: peer credit return (IpcqCreditMetadata) 도착 후 backpressure 해제
+  - recv: 데이터 도착 시 즉시 반환
+  - recv: 데이터 미도착 → wait → IpcqMetaArrival (D9 piggyback) 수신 시 wake
+  - recv (round-robin): 4-방향 중 도착한 첫 데이터 반환 (weak fairness)
+  - 잘못된 방향 → IpcqInvalidDirection 예외
+
+- **PE_DMA virtual channel** (`tests/test_pe_dma_vc.py`):
+  - vc_compute / vc_comm 독립 진행 (한 채널 stall 시 다른 채널 진행)
+  - chunk-level 인터리브 verification
+  - link BW 분할 (50/50 또는 weighted)
+
+- **builtin topology** (`tests/test_ccl_topologies.py`):
+  - ring_1d/mesh_2d/tree_binary 각각 (rank, world_size) → neighbor dict 정합성
+  - mesh_2d non-square → ValueError
+  - resolve_topology(custom, module) → module.neighbors 반환
+
+#### T2. 통합 테스트 (E2E send/recv)
+
+- **`tests/test_ipcq_e2e.py`**:
+  - 2-rank ring: rank 0 send(E) → rank 1 recv(W) → 데이터 정합성
+  - 4-rank ring: 양방향 send/recv 동시 진행, deadlock 없음
+  - mesh_2d 4×4: N/S/E/W 4방향 동시 send/recv
+
+- **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`):
+  - greenlet 모드 + IPCQ → op_log에 ipcq 레코드 생성 검증
+  - DataExecutor가 ipcq op 처리 후 결과 정합성 (`out.data` 확인)
+
+#### T3. Backend init 테스트 (`tests/test_ccl_backend_ipcq.py`)
+
+- ccl.yaml 로드 → `defaults.algorithm` 추출
+- builtin topology → IpcqInitMsg fan-out
+- IpcqEndpoint의 PA가 모든 PE에서 일관 (rank A의 peer E의 rx_base_pa = rank A+1의 자기 rx_base_pa)
+- buffer_kind 별 메모리 할당 (tcm/hbm/sram)
+
+#### T4. Regression
+
+- 기존 401 tests 전부 PASS
+- ADR-0020 통합으로 인한 op_log/DataExecutor 영향 없음 (CCL 미사용 벤치)
+
+#### T5. 성능 / overhead
+
+- 단일 send/recv pair latency = (DMA latency) + (IPCQ overhead)
+- 비교: 같은 nbytes의 일반 PE_DMA write와 거의 동일해야 함 (IPCQ overhead < 100 ns)
+
+### D14. Invariants & Failure Modes
+
+CCL 인프라에서 흔히 발생하는 hang/오류 상황을 명시하고, 대응 방식을 정의한다.
+
+#### Invariants (시뮬레이션이 보장해야 하는 것)
+
+I1. **Slot lifecycle exactly-once**: 한 send → 정확히 한 recv. 중복 send나
+   중복 recv는 sequence 오류로 간주.
+
+I2. **Pointer monotonicity**: my_head, my_tail은 단조 증가 (감소 없음).
+   sender_seq는 송신 측에서 단조 증가, 수신 측 cache 갱신도 단조 증가.
+
+I3. **Endpoint consistency**: rank A의 IpcqEndpoint(direction=E)의 peer가
+   rank B라면, rank B의 IpcqEndpoint(reverse(E))의 peer는 rank A여야 함.
+   backend init 시 검증.
+
+I4. **buffer_kind consistency**: 한 ProcessGroup 내 모든 PE의 buffer_kind는
+   동일 (mixed kind는 supported 안 함, 첫 구현). 검증 실패 시 init 에러.
+
+I5. **op_log ordering**: send → DMA 완료 → recv 가능. op_log의 t_start
+   순서가 이 인과관계를 위배하지 않음.
+
+I6. **Atomic data + metadata visibility (MUST)**: 본 ADR의 correctness 핵심
+   조건이다. 수신 측에서 data write (MemoryStore.write)와 metadata forward
+   (peer_head_cache 갱신)는 동일한 SimPy step에 일어나야 한다. control이
+   data를 앞지를 수 없다.
+
+   **구현 규칙 (MUST)**:
+   - PE_DMA의 vc_comm token 도착 처리(`_vc_comm_arrival`)는 다음 두 동작
+     사이에 **어떤 SimPy yield도 두어서는 안 된다**:
+     1. `MemoryStore.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)`
+     2. PE_IPCQ에 `IpcqMetaArrival` forward
+   - 두 동작은 동일 SimPy event callback 내에서 연속 실행되어야 한다.
+   - 코드 리뷰에서 이 사이에 `yield` (또는 `yield from`)을 추가하는 것은
+     correctness 위반으로 reject한다.
+
+   이 규칙을 위반하면 다른 SimPy process가 끼어들어 head_cache가 data
+   visibility보다 먼저 또는 늦게 보이는 race condition이 발생한다.
+
+I7. **MemoryStore slot existence ↔ pointer**: I6의 결과로,
+   `peer_head_cache > my_tail`이 truthy가 되는 step과 `MemoryStore.has(slot_addr)`
+   이 truthy가 되는 step이 동일하다. recv는 두 조건을 모두 체크하지만 (defensive),
+   단일 조건만 체크해도 정확하다.
+
+#### Failure Modes (런타임 에러)
+
+F1. **잘못된 direction**:
+   - PE 커널이 `tl.send(dir="X")` 호출 → install 안 된 direction
+   - PE_IPCQ가 즉시 `IpcqInvalidDirection` 예외 raise
+   - SimPy 시뮬레이션 즉시 abort, 사용자에게 명확한 에러
+
+F2. **타입 mismatch**:
+   - send와 recv의 dtype/shape/nbytes가 일치하지 않음
+   - 첫 구현은 검증 안 함 (dtype/shape는 hint), 향후 strict mode로 추가
+
+F3. **Deadlock detection (timeout 기반)**:
+   - send: peer_tail_cache가 갱신 안 되고 영원히 wait
+   - recv: peer_head_cache 갱신 안 되고 영원히 wait
+   - 시뮬레이션 timeout (default 10ms simulated time) 초과 시 abort
+   - 디버그를 위해 각 PE의 last send/recv 위치, blocking 상태 dump
+
+F4. **Backend init 실패**:
+   - ccl.yaml에 `defaults.algorithm` 누락
+   - `algorithms[name]` 정의 누락
+   - 알고리즘 모듈 import 실패
+   - topology 검증 실패 (I3, I4)
+   → 모두 `init_process_group` 시점에 즉시 에러
+
+F5. **Slot full + 무한 backpressure**:
+   - peer가 영원히 안 받음
+   - F3과 같이 timeout으로 처리
+   - 디버그: 막힌 PE의 my_head, peer_tail_cache 출력
+
+#### 진단 도구 (구현 단계에서 추가)
+
+- **CCL trace**: 각 send/recv를 (rank, t, dir, nbytes) 형태로 로깅
+- **Pointer dump**: 시뮬레이션 종료 시 또는 hang 시 모든 PE의 IPCQ pointer 상태 출력
+- **Deadlock graph**: hang 발생 시 wait-for 그래프 출력 (어느 PE가 어떤 PE를 기다리는지)
+
+### D15. 알고리즘 작성자 가이드 (요약)
+
+본 섹션은 알고리즘 작성자가 한 화면으로 시작점을 잡을 수 있도록 한다.
+자세한 step-by-step 가이드는 [docs/ccl-author-guide.md](../ccl-author-guide.md) 참조.
+
+#### 만지는 것 / 만지지 않는 것
+
+| 만지는 것 | 만지지 않는 것 |
+|----------|---------------|
+| `src/kernbench/ccl/algorithms/<your_algo>.py` (kernel + 선택적 neighbors) | `benches/ccl_allreduce.py` 호스트 코드 |
+| `ccl.yaml` 의 한 entry 추가 + `defaults.algorithm` | `src/kernbench/ccl/` 프레임워크 |
+| (선택) `tests/test_<your_algo>.py` 단위 테스트 | `src/kernbench/components/builtin/pe_ipcq.py` 컴포넌트 |
+| | `src/kernbench/runtime_api/distributed.py` backend |
+
+#### 알고리즘 모듈 인터페이스 contract
+
+```python
+# src/kernbench/ccl/algorithms/<your_algo>.py
+
+def kernel(*args, tl) -> None:
+    """필수. PE 커널.
+
+    Args (positional): tensor pointers, rank, world_size, 알고리즘 파라미터
+    Args (keyword): tl — TLContext (자동 주입)
+
+    사용 가능한 IPCQ API:
+        tl.send(dir, src_addr, nbytes)         # blocking, backpressure 시 wait
+        tl.recv(dir)                            # 특정 방향에서 blocking recv
+        tl.recv()                               # 4방향 round-robin
+        tl.recv_async(dir) → handle             # non-blocking
+        tl.wait(handle)                         # non-blocking 완료 대기
+
+    기존 API도 그대로 사용:
+        tl.load / tl.store / tl.composite / tl.program_id 등
+    """
+    ...
+
+def neighbors(rank, world_size, neighbor_map) -> dict | None:
+    """선택. ccl.yaml의 builtin topology가 만든 neighbor_map을 override.
+
+    None 반환 → builtin 그대로 사용
+    dict 반환 → 그 dict로 override (builtin을 base로 수정 가능)
+    """
+    return None
+```
+
+#### 5-step 흐름
+
+1. **kernel 함수 작성** — `src/kernbench/ccl/algorithms/<algo>.py` 신규 파일
+2. **ccl.yaml 등록** — `algorithms.<name>` entry + `defaults.algorithm` 변경
+3. **(선택) neighbors override** — builtin topology를 base로 수정이 필요할 때
+4. **단위 테스트** — `kernbench.ccl.testing.run_kernel_in_mock` (SimPy 없이 빠름)
+5. **시뮬 검증** — `kernbench run --bench ccl_allreduce --verify-data`
+
+호스트 코드 (`benches/ccl_allreduce.py`)는 손대지 않는다.
+
+#### 사용 가능한 헬퍼 (`kernbench.ccl.helpers`)
+
+| Helper | 설명 |
+|--------|------|
+| `chunked(addr, n_chunks, ...)` | 텐서를 n개 chunk view로 슬라이싱 |
+| `ring_step(rank, step, ws)` | ring algorithm의 step별 (send_idx, recv_idx) |
+| `tree_step(rank, level)` | binary tree의 level별 parent/child 인덱스 |
+
+#### 디버깅 도구
+
+- `KERNBENCH_CCL_TRACE=1` — send/recv trace 출력
+- 시뮬 종료 시 자동 IPCQ pointer dump
+- Deadlock 시 (10ms 시뮬 시간 초과) wait-for graph dump
+
+#### 흔한 실수
+
+1. **install 안 된 direction 사용** — ccl.yaml의 topology가 ring_1d면 N/S 사용 불가
+2. **send/recv 짝 맞지 않음** — peer 측 recv 없으면 hang (slot full backpressure)
+3. **dtype/shape 불일치** — 첫 구현은 검증 안 함, 작성자 책임
+
+자세한 step-by-step과 hello-world 예제는 `docs/ccl-author-guide.md` 참조.
+
+---
+
+## Non-goals
+
+- **호스트 collective**: `dist.all_reduce`가 데이터 이동을 직접 수행하는 모델은
+  본 ADR 범위 외. 본 ADR은 PE 커널 안에서 일어나는 통신만 다룬다.
+- **All-reduce 알고리즘**: ring/tree 등 알고리즘 자체는 별도 ADR (또는 커널
+  코드)에서 다룬다. 본 ADR은 인프라(IPCQ + VC)만 정의.
+- **Reliability/error handling**: send/recv 실패, link 장애 등은 다루지 않음.
+- **NoC arbiter 정밀 모델**: VC 간 dynamic contention은 첫 구현 범위 외 (D8).
+
+---
+
+## Open Questions
+
+- **VC arbitration 정확도**: 첫 구현은 deterministic chunk interleave +
+  weighted round-robin. heavy contention 시나리오에서 실제보다 optimistic한
+  결과가 나올 수 있음. 정밀화 필요 시 별도 NoC arbiter component 도입을 검토.
+- **Credit return fast path BW 모델**: 첫 구현은 fast path가 fabric BW
+  contention 모델에서 제외 (별도 lightweight wire 가정). 정밀화 필요 시
+  credit fast path를 별도 link로 모델링하거나, `credit_return_mode: piggyback`
+  옵션 추가.
+- **Ring buffer slot의 메모리 할당**: TCM/HBM/SRAM 어디에 두든 IPCQ가 알아야
+  할 metadata (base addr, slot_size, n_slots). init 시 호스트가 사이드밴드로
+  넣을지, fabric MmuMapMsg와 유사한 메시지로 넣을지 결정 필요.
+- **VC 간 BW 분할 default**: 균등 분할(50/50)인지, weighted(예: 80% compute,
+  20% comm)인지. ccl.yaml에 노출하되 default 값 결정 필요.
+- **Direction 개수**: 4방향(N/S/E/W) 고정인지, 6방향(+ Up/Down for 3D),
+  또는 가변 N개로 확장할지. 첫 구현은 4방향 고정.
+- **다중 channel 데이터 구조 (multi-tile aggregation)**: 한 collective에서
+  여러 tile을 fan-out 받는 경우 기존 round-robin recv로 충분한지, 별도
+  primitive(`tl.recv_all`)가 필요한지.
+- **Round-robin recv fairness**: 첫 구현은 last_polled_dir 인덱스 기반 weak
+  fairness. 한 방향에 데이터가 항상 먼저 도착하면 starvation 가능. strict
+  fairness가 필요하면 별도 fairness counter 추가.
+- **Deadlock detection 정밀화**: 첫 구현은 timeout 기반. 향후 wait-for graph
+  실시간 추적으로 deterministic deadlock detection 가능.
+
+---
+
+## Consequences
+
+### 긍정적
+
+- PE 간 직접 통신 가능 → CCL 커널 작성 가능
+- 호스트는 launch만, 동기화는 PE 안에서 → 단순한 호스트 코드, 강한
+  compute/comm overlap
+- VC를 통해 HoL blocking 제거 → collective latency가 compute traffic에
+  block되지 않음
+- Buffer 위치/backpressure 모드를 init 파라미터로 선택 가능 → 벤치마크 가능
+- 4-방향 logical neighbor → 호스트가 ring/mesh/tree 등 알고리즘 자유롭게
+  매핑
+
+### 부정적
+
+- 컴포넌트 1개 신규 추가 (PE_IPCQ), PE_DMA 재설계 (VC 추가)
+- IPCQ 메모리 (8 ring × slot_size × n_slots) 만큼 PE-local 메모리 사용
+- VC arbitration 모델이 first-order approximation이므로 heavy contention
+  시나리오에서 실제 HW보다 약간 optimistic한 latency 결과 가능 (D8 한계)
+- VC chunk-level 인터리브로 PE_DMA 구현이 더 복잡해짐
+
+---
+
+## 영향받는 파일
+
+| 파일 | 변경 |
+|------|------|
+| `topology.yaml` | pe_template에 pe_ipcq 추가, ipcq↔dma/cpu/tcm edge 추가 |
+| `components.yaml` | pe_ipcq_v1 등록 |
+| `src/kernbench/topology/builder.py` | PE 내부 edge에 ipcq 체인 추가 |
+| `src/kernbench/components/builtin/pe_ipcq.py` | 신규 |
+| `src/kernbench/components/builtin/pe_dma.py` | VC 추가, IpcqDmaToken 처리 |
+| `src/kernbench/common/pe_commands.py` | IpcqSendCmd, IpcqRecvCmd, IpcqDmaToken 정의 |
+| `src/kernbench/triton_emu/tl_context.py` | tl.send / tl.recv API |
+| `src/kernbench/runtime_api/distributed.py` | ccl.yaml 로드, init 시 IPCQ install (eager) |
+| `src/kernbench/runtime_api/kernel.py` | IpcqInitMsg (sideband) 정의 |
+| `src/kernbench/ccl/__init__.py` | 신규 — CCL 패키지 |
+| `src/kernbench/ccl/topologies.py` | 신규 — builtin topology generators (ring_1d, mesh_2d, tree_binary 등), `resolve_topology()` |
+| `src/kernbench/ccl/helpers.py` | 신규 — 알고리즘 작성 헬퍼 (chunked, ring_step 등) |
+| `src/kernbench/ccl/testing.py` | 신규 — mock CCL runtime (`run_kernel_in_mock`) |
+| `ccl.yaml` | 신규 — 알고리즘 metadata + IPCQ default 설정 |
+| `src/kernbench/ccl/algorithms/ring_allreduce.py` | 신규 — 첫 알고리즘 예제 |
+| `tests/test_pe_ipcq.py` | 신규 — PE_IPCQ 단위 테스트 |
+| `tests/test_pe_dma_vc.py` | 신규 — PE_DMA virtual channel 테스트 |
+| `tests/test_ipcq_e2e.py` | 신규 — send/recv end-to-end 테스트 |
+| `tests/test_ccl_topologies.py` | 신규 — builtin topology generator 단위 테스트 |
diff --git a/docs/ccl-author-guide.en.md b/docs/ccl-author-guide.en.md
new file mode 100644
index 0000000..e2e62f9
--- /dev/null
+++ b/docs/ccl-author-guide.en.md
@@ -0,0 +1,592 @@
+# CCL Algorithm Author Guide (English)
+
+This document is a step-by-step guide for engineers writing CCL
+(Collective Communication Library) algorithms in kernbench. The
+internal system design and component structure live in
+[ADR-0023](adr/ADR-0023-ipcq-pe-collective.md).
+
+The goal here is to clearly separate **what an algorithm author has to
+touch** from **what they can leave alone**, and to get a first
+algorithm running through the shortest possible path.
+
+---
+
+## 0. Five-minute tour
+
+| Things you touch | Location |
+|------------------|----------|
+| Algorithm module (kernel + optional `neighbors()`) | `src/kernbench/ccl/algorithms/<algo>.py` |
+| Algorithm registration | `ccl.yaml` |
+| Host bench (rank count, init, launch, verify) | `benches/<your_bench>.py` |
+| (Optional) unit test | `tests/test_<algo>.py` |
+
+| Things you do NOT touch | Location |
+|--------------------------|----------|
+| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
+| Framework (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
+| PE_IPCQ / PE_DMA components | `src/kernbench/components/builtin/` |
+| Backend implementation (`install_ipcq`) | `src/kernbench/runtime_api/distributed.py` and `src/kernbench/ccl/install.py` |
+
+Workflow:
+1. Write a `kernel` function in the algorithm module.
+2. Register an entry in `ccl.yaml`.
+3. Write a host bench using `torch.distributed.init_process_group` /
+   `torch.distributed.all_reduce` (the unified `benches/ccl_allreduce.py`
+   handles the common case).
+4. (Optional) Run the mock runtime for fast unit tests (a few ms).
+5. `kernbench run --bench <name> --verify-data` for full SimPy verification.
+
+---
+
+## 1. Hello World — the simplest send/recv
+
+Each PE sends its tile to its E neighbor once and receives a tile from
+its W neighbor once. The reference code lives in
+[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py).
+
+### Step 1: write the kernel
+
+New file `src/kernbench/ccl/algorithms/hello_send.py`:
+
+```python
+"""Hello world: send your tile to the next rank, receive from the previous one."""
+
+
+def kernel(t_ptr, n_elem, tl):
+    # Global rank is computed from program_id(0/1) (ADR-0022).
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+
+    nbytes = n_elem * 2  # f16
+    pe_addr = t_ptr + rank * nbytes
+
+    # Load our slice and send it east.
+    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    tl.send(dir="E", src=src)
+
+    # Receive from west and store directly back into our slice.
+    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+    tl.store(pe_addr, recv)
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    """Positional kernel args used by the ahbm backend (after t_ptr)."""
+    return (n_elem,)
+```
+
+Key points:
+
+- **Global rank is computed as `program_id(axis=1) * num_programs(axis=0) + program_id(axis=0)`.**
+  TL has no contractually-supported `tl.rank` / `tl.world_size`. If the
+  host needs to pass `world_size` or anything else as an algorithm
+  parameter, it goes through ordinary `torch.launch` arguments.
+- **`tl.send` takes a `TensorHandle`.** PE_IPCQ reads
+  `addr`/`space`/`shape`/`dtype`/`nbytes` from the handle to issue an
+  `IpcqDmaToken` to PE_DMA.
+- **`tl.recv` requires `shape` and `dtype`.** The returned TensorHandle
+  points at the IPCQ ring slot and can be used directly as a `dst`
+  handle (e.g. `tl.store(pe_addr, recv)`). Phase 2's `dma_write` replay
+  handles the (slot → hbm) copy, so user code never has to touch
+  `recv.data`.
+
+### Step 2: register in `ccl.yaml`
+
+```yaml
+algorithms:
+  hello_send:
+    module: kernbench.ccl.algorithms.hello_send
+    topology: ring_1d
+    buffer_kind: tcm
+    world_size: 8
+```
+
+`world_size` here is optional. If absent, `AhbmCCLBackend` derives it
+from the topology spec (`sips × cubes_per_sip × pes_per_cube`).
+
+### Step 3: write a host bench (optional — the unified bench may suffice)
+
+For most CCL benchmarks the existing `benches/ccl_allreduce.py` is
+sufficient: it reads `ccl.yaml`, picks the algorithm, sets up the
+process group, and runs the collective. If your algorithm needs custom
+host logic, write a new bench file along the same lines.
+
+The host code looks like a real PyTorch DDP worker:
+
+```python
+"""benches/ccl_hello.py"""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.policy.placement.dp import DPPolicy
+
+
+N_ELEM = 8
+
+
+def worker(rank: int, world_size: int, torch) -> None:
+    """Per-rank business logic — mirrors a real PyTorch DDP worker."""
+    dp = DPPolicy(
+        sip="replicate", cube="replicate", pe="column_wise",
+        num_sips=1, num_cubes=1, num_pes=world_size,
+    )
+    tensor = torch.zeros(
+        (1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in",
+    )
+
+    # Per-rank initialization via the real PyTorch idiom.
+    init = np.zeros((1, world_size * N_ELEM), dtype=np.float16)
+    for r in range(world_size):
+        init[0, r * N_ELEM : (r + 1) * N_ELEM] = float(r + 1)
+    tensor.copy_(torch.from_numpy(init))
+
+    # The collective itself.
+    torch.distributed.all_reduce(tensor, op="sum")
+
+    # Verify on rank 0 (real PyTorch DDP idiom).
+    if rank == 0:
+        result = tensor.numpy()
+        for r in range(world_size):
+            expected = float(((r - 1) % world_size) + 1)
+            slice_r = result[0, r * N_ELEM : (r + 1) * N_ELEM]
+            print(
+                f"  rank {r}: got {float(slice_r.mean()):.1f}, "
+                f"expected {expected:.1f}"
+            )
+
+
+def run(torch) -> None:
+    """CLI entry point. Initializes dist, dispatches to worker."""
+    dist = torch.distributed
+    dist.init_process_group(backend="ahbm")
+    worker(
+        rank=dist.get_rank(),
+        world_size=dist.get_world_size(),
+        torch=torch,
+    )
+```
+
+### Step 4: unit test (optional but strongly recommended)
+
+`tests/test_hello_send.py`:
+
+```python
+import numpy as np
+
+from kernbench.ccl.algorithms.hello_send import kernel
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def test_hello_send_4_ranks():
+    n_elem = 8
+    inputs = [
+        np.full((n_elem,), float(r + 1), dtype=np.float16)
+        for r in range(4)
+    ]
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem,),
+    )
+    # rank r should now hold rank (r-1) % 4's data.
+    for r in range(4):
+        assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
+```
+
+`run_kernel_in_mock` runs every rank concurrently in pure Python (no
+SimPy), so a unit test like this finishes in **milliseconds**. It only
+verifies algorithmic correctness — no latency, no DMA, no fabric.
+
+### Step 5: SimPy validation
+
+```bash
+kernbench run --topology topology.yaml --bench ccl_hello --verify-data
+```
+
+Phase 1 runs the SimPy simulation + MemoryStore data movement, Phase 2
+replays the op_log for correctness. The bench's `print` lines should
+show matching `got` and `expected` values for every rank.
+
+---
+
+## 2. Ring all-reduce — the second algorithm
+
+Slightly more complex. Each PE runs `world_size - 1` rounds, sending
+its current tile east and accumulating the tile received from the west.
+After all rounds, every PE holds the global sum.
+
+The reference implementation lives in
+[`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py).
+The core flow:
+
+```python
+"""Ring all-reduce."""
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+    nbytes = n_elem * 2
+    pe_addr = t_ptr + rank * nbytes
+
+    # The handle points at HBM[pe_addr]. In greenlet mode .data is
+    # populated, but the kernel never has to touch .data directly.
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    current = acc  # source for the first send
+
+    for _step in range(world_size - 1):
+        tl.send(dir="E", src=current)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        # TensorHandle operator overload → MathCmd → PE_MATH dispatch.
+        # Phase 1 only models timing; Phase 2 DataExecutor replays the
+        # actual numpy accumulation.
+        acc = acc + recv
+        current = recv  # forward the received slot to the next round
+
+    # Store the final accumulator back to HBM. Source is acc (a PE-local
+    # scratch addr); dst is HBM. The op_log dma_write entry records both
+    # ends so Phase 2 copies the math result into HBM at verify time.
+    tl.store(pe_addr, acc)
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    return (n_elem, world_size)
+```
+
+Four key points:
+
+1. **Accumulation goes through TensorHandle operators.** `acc + recv`
+   emits a `MathCmd` and dispatches it through PE_MATH — i.e. the
+   real hardware path, so the latency model stays accurate. Per
+   ADR-0020 D3, Phase 1 only simulates timing; Phase 2's `DataExecutor`
+   replays the op_log and runs the actual numpy accumulation.
+2. **Use `current = recv` to forward.** Each round must update the send
+   source to the just-received slot handle so the same data circulates
+   exactly once around the ring. Setting `current = acc` would resend
+   the cumulative sum, inflating the result.
+3. **`tl.store(pe_addr, acc)` exactly once at the end.** Do not use a
+   store→reload pattern in the middle. `acc` lives in PE-local scratch;
+   the op_log records `(src=scratch, dst=hbm)` and Phase 2 first runs
+   math (filling scratch) then copies via the dma_write snapshot.
+4. **`world_size` is passed by the host explicitly.** TL only knows the
+   topology slot count (e.g. `num_programs(axis=0)` is "PEs per cube"),
+   not the participating CCL group size. The host bench knows
+   `world_size` and forwards it as an explicit kernel argument.
+
+For registration in `ccl.yaml` and wiring through the unified bench,
+look at the existing `ring_allreduce_tcm/_hbm/_sram` entries plus
+[`benches/ccl_allreduce.py`](../benches/ccl_allreduce.py). Mock unit
+tests live in
+[`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
+and follow the `kernel_args=(n_elem, world_size)` convention.
+
+---
+
+## 3. `neighbors()` override — custom topology
+
+Most algorithms are happy with the builtin topologies (`ring_1d`,
+`mesh_2d`, `tree_binary`, `ring_1d_unidir`, `none`). If you want to
+modify a builtin or define a brand-new connectivity pattern, define a
+`neighbors()` function in your algorithm module.
+
+### Signature
+
+```python
+def neighbors(
+    rank: int, world_size: int, neighbor_map: dict[str, int],
+) -> dict[str, int] | None:
+    """Override the neighbor map produced by the builtin topology.
+
+    Args:
+        neighbor_map: the mapping the ccl.yaml ``topology`` field built.
+                      For ring_1d this is {"E": (rank+1)%ws, "W": (rank-1)%ws}.
+                      The dict is mutable — modify in place if you want.
+
+    Returns:
+        dict: the new neighbor map (or the modified-in-place dict).
+        None: do not override; use neighbor_map as-is.
+    """
+    return None
+```
+
+### Pattern A: tweak a builtin
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # Only even ranks use W; remove W from odd ranks.
+    if rank % 2 == 1:
+        neighbor_map.pop("W", None)
+    return neighbor_map
+```
+
+### Pattern B: replace entirely (skip-connection ring)
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    return {"E": (rank + 2) % world_size}
+```
+
+### Pattern C: keep builtin
+
+Either omit `neighbors` entirely or return None:
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    return None  # explicit "use the builtin"
+```
+
+---
+
+## 4. PE kernel API reference (ADR-0023 D4)
+
+### IPCQ API
+
+| API | Description | Blocking? |
+|-----|-------------|-----------|
+| `tl.send(dir, src=TensorHandle)` | Send to a peer in the given direction. | Yes (waits if peer slots are full) |
+| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | Same, keyword form. | Yes |
+| `tl.recv(dir, shape=..., dtype=...)` | Blocking recv from one direction. | Yes |
+| `tl.recv(shape=..., dtype=...)` | Round-robin recv across all four directions. | Yes |
+| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | Non-blocking recv. | No |
+| `tl.wait(future)` | Wait for a non-blocking recv future → returns the resolved TensorHandle. | Yes |
+
+### Existing TL API (ADR-0020/0022, unchanged)
+
+| API | Description |
+|-----|-------------|
+| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; in greenlet mode `.data` carries the ndarray. |
+| `tl.store(addr, handle)` | DMA write — when `handle.data` is set the runner propagates it to MemoryStore. |
+| `tl.composite(op, ...)` | Submit a GEMM/Math composite (non-blocking). |
+| `tl.program_id(axis=0)` | Local PE id within the cube. |
+| `tl.program_id(axis=1)` | Cube id (ADR-0022). |
+| `tl.num_programs(axis=0/1)` | Topology slot counts (NOT the participating-rank count). |
+
+### Two recv modes
+
+The default is `return_slot` (zero-copy): the IPCQ slot address is
+returned in `handle.addr`. To force a copy into a custom destination,
+pass `dst_addr` + `dst_space`:
+
+```python
+recv = tl.recv(
+    dir="W", shape=(8,), dtype="f16",
+    dst_addr=my_scratch_addr,
+    dst_space="hbm",
+)
+# After this call recv.addr == my_scratch_addr (copy_to_dst mode).
+```
+
+---
+
+## 5. Helpers (`kernbench.ccl.helpers`)
+
+Convenience helpers to keep algorithm code short:
+
+```python
+from kernbench.ccl.helpers import chunked, ring_step, tree_step
+```
+
+### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
+
+Split a tile of `n_elem` elements into `n_chunks` equal-size views.
+Each `Chunk` has `addr`, `n_elem`, `nbytes` fields.
+
+```python
+chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
+# chunks[0..3] are 16-element views with consecutive addresses.
+```
+
+### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
+
+Per-step chunk indices for a ring algorithm (reduce-scatter / all-gather):
+
+```python
+for step in range(world_size - 1):
+    send_idx, recv_idx = ring_step(rank, step, world_size)
+    tl.send(
+        dir="E", src_addr=chunks[send_idx].addr,
+        nbytes=chunks[send_idx].nbytes,
+        shape=(chunks[send_idx].n_elem,), dtype="f16",
+    )
+    recv = tl.recv(
+        dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16",
+    )
+    # accumulate ...
+```
+
+### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
+
+Parent / children rank ids for a binary tree:
+
+```python
+info = tree_step(rank, world_size)
+if info["parent"] is None:
+    print(f"rank {rank} is the root")
+for child in info["children"]:
+    ...
+```
+
+---
+
+## 6. Unit testing — Mock runtime
+
+`kernbench.ccl.testing.run_kernel_in_mock` runs an algorithm without
+SimPy for fast feedback.
+
+### Basic usage
+
+```python
+import numpy as np
+
+from kernbench.ccl.testing import run_kernel_in_mock
+from kernbench.ccl.algorithms.my_algo import kernel
+
+
+def test_my_algo():
+    n_elem = 16
+    inputs = [np.arange(n_elem, dtype="f16") + r for r in range(4)]
+    expected = sum(inputs)
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem, 4),  # positional args after t_ptr
+    )
+    for r in range(4):
+        assert np.allclose(outputs[r], expected, rtol=1e-3)
+```
+
+### Behavior
+
+- All ranks run their kernels concurrently as cooperative greenlets.
+- `tl.send` / `tl.recv` are serviced by in-memory FIFOs (no DMA, no
+  latency).
+- Each rank's last `store` is what the helper returns as a numpy array.
+
+### Limitations
+
+- No latency or performance numbers (it is not a simulation).
+- No PE_DMA, fabric, or BW model.
+- Correctness only.
+- One cube assumed: `program_id(axis=1)` is always 0.
+
+---
+
+## 7. Debugging
+
+### CCL trace
+
+```bash
+KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
+    --bench ccl_allreduce --verify-data
+```
+
+Per-rank send/recv events appear on stdout:
+
+```
+[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
+[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
+```
+
+### Pointer dump
+
+`kernbench.ccl.diagnostics.pointer_dump(engine)` returns a multi-line
+dump of every PE_IPCQ ring buffer's `my_head`, `my_tail`,
+`peer_head_cache`, `peer_tail_cache`. When something hangs, this shows
+which rank is stuck and on what.
+
+### Deadlock detection
+
+When the SimPy schedule empties because of unmatched send/recv pairs,
+the engine raises `IpcqDeadlock` and embeds the pointer dump in the
+message (ADR-0023 D14 F3). Wait-for-graph visualization is future
+work.
+
+---
+
+## 8. Common mistakes
+
+### 1. Using a direction that wasn't installed
+
+`topology: ring_1d` only installs E and W. Trying:
+
+```python
+tl.send(dir="N", ...)   # → IpcqInvalidDirection
+```
+
+Fix: switch to `topology: mesh_2d`, or add N/S in a `neighbors()` override.
+
+### 2. `send` without a matching `recv`
+
+```python
+def kernel(..., tl):
+    for _ in range(100):
+        tl.send(dir="E", ...)
+    # The peer never recvs → ring buffer fills → backpressure → deadlock.
+```
+
+Fix: every `send` needs a matching `recv` on the receiver side.
+Otherwise `IpcqDeadlock` is raised.
+
+### 3. dtype/shape mismatch
+
+By default mismatches are not validated. The author is responsible for
+consistency. Set `strict_validation: true` on a PE_IPCQ node's attrs to
+enable D14 F2 strict mode and catch them immediately.
+
+### 4. Assuming round-robin recv fairness
+
+`tl.recv()` (no direction) returns the first slot to arrive in
+round-robin order, but **arrival order is not predictable**. If your
+algorithm depends on a particular direction, name it explicitly:
+`tl.recv(dir="N", ...)`.
+
+### 5. Confusing `num_programs` with the CCL group size
+
+`tl.num_programs(axis=0/1)` reports topology slot counts, not the
+number of ranks participating in the collective. The host bench knows
+`world_size` and must pass it through as a kernel argument.
+
+### 6. Overwriting the send source before it's actually sent
+
+PE_DMA snapshots the source data into the IpcqDmaToken at send time,
+preserving in-flight semantics. Even so, the safest pattern is to call
+`tl.send` first and only overwrite the source buffer afterwards. If you
+overwrite it before `tl.send` makes it into the PE_DMA queue, the
+snapshot will pick up the wrong data.
+
+---
+
+## 9. Next steps
+
+- Try other topologies (`mesh_2d`, `tree_binary`).
+- Faster algorithms (recursive halving / doubling).
+- Compare `buffer_kind` (tcm/hbm/sram) and `backpressure` (poll/sleep)
+  modes for latency.
+- Larger-scale validation through the unified `ccl_allreduce` bench
+  with different `ccl.yaml` overlays.
+
+If you add a new algorithm or pattern, please send a PR.
+
+---
+
+## References
+
+- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective design.
+- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1).
+- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution.
+- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor.
+
+Existing algorithm examples:
+
+- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — simplest send/recv
+- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
+- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
+- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
diff --git a/docs/ccl-author-guide.md b/docs/ccl-author-guide.md
new file mode 100644
index 0000000..4fa7cb4
--- /dev/null
+++ b/docs/ccl-author-guide.md
@@ -0,0 +1,537 @@
+# CCL Algorithm Author Guide
+
+이 문서는 kernbench에서 CCL (Collective Communication Library) 알고리즘을
+직접 작성하는 사람을 위한 step-by-step 가이드이다. 시스템 내부 설계와
+컴포넌트 구조는 [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md)에 있다.
+
+본 가이드는 알고리즘 작성자가 **자신이 만져야 할 곳**과 **만지지 않아도 될 곳**을
+명확히 분리하고, 가장 짧은 경로로 첫 알고리즘을 동작시키는 것을 목표로 한다.
+
+---
+
+## 0. 5분 요약
+
+| 만지는 것 | 위치 |
+|----------|------|
+| 알고리즘 모듈 (kernel + 선택적 neighbors) | `src/kernbench/ccl/algorithms/<algo>.py` |
+| 알고리즘 등록 | `ccl.yaml` |
+| 호스트 bench (PE 수, 메모리 init, launch, 검증) | `benches/<your_bench>.py` |
+| (선택) 단위 테스트 | `tests/test_<algo>.py` |
+
+| 만지지 않는 것 | 위치 |
+|---------------|------|
+| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
+| 프레임워크 (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
+| PE_IPCQ / PE_DMA 컴포넌트 | `src/kernbench/components/builtin/` |
+| backend 구현 (install_ipcq) | `src/kernbench/runtime_api/distributed.py` 및 `src/kernbench/ccl/install.py` |
+
+흐름:
+1. 알고리즘 모듈에 `kernel` 작성
+2. `ccl.yaml`에 entry 등록
+3. 호스트 bench에서 `install_ipcq` + `launch`
+4. (선택) mock runtime으로 단위 테스트 (수 ms)
+5. `kernbench run --bench <name> --verify-data`로 SimPy 검증
+
+---
+
+## 1. Hello World — 가장 단순한 send/recv
+
+각 PE가 자기 데이터를 E 방향 이웃에 한 번 보내고, W 방향에서 한 번 받는
+가장 단순한 알고리즘이다. 실제 동작 코드는
+[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py)
+에 있다.
+
+### Step 1: kernel 작성
+
+새 파일 `src/kernbench/ccl/algorithms/hello_send.py`:
+
+```python
+"""Hello world: 자기 데이터를 다음 rank에 보내고 이전 rank에서 받기."""
+def kernel(t_ptr, n_elem, tl):
+    # 글로벌 rank는 program_id(0/1)에서 계산 (ADR-0022)
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+
+    nbytes = n_elem * 2  # f16
+    pe_addr = t_ptr + rank * nbytes
+
+    # 자기 슬라이스를 로드해서 E로 보낸다.
+    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    tl.send(dir="E", src=src)
+
+    # W 방향에서 받아서 그대로 자기 슬라이스에 store한다.
+    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+    tl.store(pe_addr, recv)
+```
+
+핵심 포인트:
+
+- **글로벌 rank는 `program_id(axis=0)` + `program_id(axis=1)`에서 계산.** TL에는
+  `tl.rank` / `tl.world_size` 같은 약속되지 않은 확장이 없다. 커널에
+  `world_size` 같은 알고리즘 파라미터가 필요하면 호스트가 `torch.launch`의 일반
+  인자로 전달한다.
+- **`tl.send`는 `TensorHandle`을 받는다.** 핸들의 `addr`/`space`/`shape`/`dtype`/`nbytes`를
+  PE_IPCQ가 읽어 PE_DMA에 IpcqDmaToken을 발행한다.
+- **`tl.recv`는 `shape`와 `dtype`이 필수.** 반환된 TensorHandle은 IPCQ ring slot을
+  가리키며, `tl.store(pe_addr, recv)`처럼 store의 src 핸들로 그대로 사용할 수 있다.
+  Phase 2 dma_write replay가 (slot, hbm) 복사를 수행하므로 numpy `.data`를
+  직접 만질 필요가 없다.
+
+### Step 2: ccl.yaml 등록
+
+`ccl.yaml`의 `algorithms` 섹션에 entry를 추가한다. (defaults.algorithm은 호스트
+bench가 `install_ipcq(algorithm=...)`로 명시 전달해도 되므로 꼭 바꿀 필요는 없다.)
+
+```yaml
+algorithms:
+  hello_send:
+    module: kernbench.ccl.algorithms.hello_send
+    topology: ring_1d
+    buffer_kind: tcm
+```
+
+### Step 3: 호스트 bench 작성
+
+새 파일 `benches/ccl_hello.py`:
+
+```python
+"""Hello-world ring rotation bench (각 PE가 W 이웃의 데이터를 1번 받음)."""
+import numpy as np
+
+from kernbench.ccl.algorithms import hello_send
+from kernbench.policy.placement.dp import DPPolicy
+
+ALGORITHM = "hello_send"
+N_ELEM = 8
+WORLD_SIZE = 8
+
+
+def run(torch):
+    plan = torch.install_ipcq(algorithm=ALGORITHM)
+
+    a = torch.zeros(
+        (1, WORLD_SIZE * N_ELEM), dtype="f16",
+        dp=DPPolicy(
+            sip="replicate", cube="replicate", pe="column_wise",
+            num_sips=1, num_cubes=1,
+        ),
+        name="hello_in",
+    )
+
+    store = torch.engine.memory_store
+    base = a._handle.va_base or a._handle.shards[0].pa
+    nbytes = N_ELEM * 2
+    for r in range(WORLD_SIZE):
+        store.write("hbm", base + r * nbytes,
+                    np.full((N_ELEM,), float(r + 1), dtype=np.float16))
+
+    torch.launch(ALGORITHM, hello_send.kernel, a, N_ELEM)
+
+    # rank r은 rank (r-1)%ws의 데이터를 가져야 한다.
+    for r, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
+        result = store.read("hbm", base + r * nbytes, shape=(N_ELEM,), dtype="f16")
+        prev = float(((r - 1) % WORLD_SIZE) + 1)
+        ok = np.allclose(result, prev)
+        print(f"  [{'OK ' if ok else 'FAIL'}] rank {r} got {float(result.mean()):.1f}, "
+              f"expected {prev:.1f}")
+```
+
+### Step 4: 단위 테스트 (선택, 강력 추천)
+
+`tests/test_hello_send.py`:
+
+```python
+import numpy as np
+from kernbench.ccl.algorithms.hello_send import kernel
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def test_hello_send_4_ranks():
+    n_elem = 8
+    inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)]
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem,),
+    )
+
+    # rank r은 rank (r-1) % 4의 데이터를 받아야 함
+    for r in range(4):
+        assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
+```
+
+`run_kernel_in_mock`는 SimPy 없이 순수 Python으로 모든 rank를 동시 실행하므로
+**ms 단위로 끝난다**. 알고리즘 logic 정합성만 검증.
+
+### Step 5: 시뮬 검증
+
+```bash
+kernbench run --topology topology.yaml --bench ccl_hello --verify-data
+```
+
+Phase 1에서 SimPy 시뮬레이션 + MemoryStore 데이터 이동, Phase 2에서 op_log
+정합성 replay. 호스트 bench의 `print` 검증이 모든 rank에 대해 OK여야 한다.
+
+---
+
+## 2. Ring All-Reduce — 두 번째 알고리즘
+
+조금 더 복잡한 예제. Ring all-reduce는 N-1 라운드 동안 각 PE가 자기 데이터를
+E로 보내고 W에서 받아 누적한다. 최종적으로 모든 PE가 글로벌 sum을 갖는다.
+
+실제 동작 코드는 [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py)
+참조. 핵심 흐름:
+
+```python
+"""Ring all-reduce."""
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    # rank
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+    nbytes = n_elem * 2
+    pe_addr = t_ptr + rank * nbytes
+
+    # HBM의 자기 슬라이스를 가리키는 TensorHandle. greenlet 모드에선 .data가
+    # 채워지지만 커널은 .data를 직접 만질 필요가 없다.
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    current = acc  # 첫 라운드 send 출처
+
+    for _step in range(world_size - 1):
+        tl.send(dir="E", src=current)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        # TensorHandle 연산자 오버로드 → MathCmd → PE_MATH 디스패치.
+        # Phase 1은 타이밍만, Phase 2 DataExecutor가 실제 numpy 누적을 수행한다.
+        acc = acc + recv
+        current = recv  # 다음 라운드는 직전에 받은 슬롯을 다시 forward
+
+    # 최종 누적값을 자기 슬라이스에 store. 출처는 acc(=PE-local scratch addr)
+    # 이고 dst는 HBM. op_log dma_write가 (scratch, hbm) 복사 정보를 기록하므로
+    # Phase 2가 검증 시점에 HBM[pe_addr]에 정답을 채워준다.
+    tl.store(pe_addr, acc)
+```
+
+네 가지 포인트:
+
+1. **누적은 TensorHandle 연산자**: `acc + recv`는 `MathCmd`를 emit하고
+   PE_MATH로 디스패치된다 — 실제 하드웨어 경로를 거치므로 latency 모델이
+   정확하다. ADR-0020 D3대로 Phase 1은 타이밍만 시뮬레이션하고, Phase 2
+   `DataExecutor`가 op_log를 재실행하면서 numpy 누적을 수행한다.
+2. **`current = recv`로 forward**: 매 라운드의 send 출처를 직전에 받은 슬롯
+   핸들로 갱신해야 같은 데이터가 ring을 순회하면서 누적이 한 번씩 일어난다.
+   `current = acc`로 두면 누적값이 다시 송출되어 결과가 부풀려진다.
+3. **`tl.store(pe_addr, acc)` 한 번이면 끝**: 중간에 store→reload 패턴은
+   금지다. acc는 PE-local scratch에 살고, op_log가 (src=scratch, dst=hbm)
+   메타데이터를 기록한다. Phase 2가 math를 먼저 실행해 scratch를 채운 뒤
+   dma_write 스냅샷으로 HBM에 복사한다.
+4. **`world_size`는 호스트가 명시 전달**: TL은 topology slot 수만 안다 (예:
+   `num_programs(axis=0)`은 cube당 PE 수). 실제 참여하는 CCL group 크기는 bench가
+   알고 호스트→kernel 인자로 넘긴다.
+
+`ccl.yaml` 등록 + 호스트 bench는 [`benches/ccl_allreduce.py`](../benches/ccl_allreduce.py)
+참조. mock 단위 테스트는 [`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
+를 그대로 따라하면 된다 (`kernel_args=(n_elem, world_size)` 인자 형태).
+
+---
+
+## 3. neighbors() override — Custom topology
+
+대부분의 알고리즘은 builtin topology(`ring_1d`, `mesh_2d`, `tree_binary`,
+`ring_1d_unidir`, `none`)로 충분하다. builtin을 변형하거나 새로 만들고 싶으면
+알고리즘 모듈에 `neighbors()`를 정의한다.
+
+### 시그니처
+
+```python
+def neighbors(rank: int, world_size: int, neighbor_map: dict[str, int]) -> dict[str, int] | None:
+    """builtin topology가 만든 neighbor_map을 override.
+
+    Args:
+        neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑.
+                      예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws}
+                      mutable dict — 직접 수정 가능.
+
+    Returns:
+        dict: neighbor_map을 override한 결과 (또는 수정한 그 dict)
+        None: override 안 함, neighbor_map 그대로 사용
+    """
+    return None
+```
+
+### Pattern A: builtin을 base로 일부만 수정
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # 짝수 rank만 W 방향 사용 (홀수 rank는 W 제거)
+    if rank % 2 == 1:
+        neighbor_map.pop("W", None)
+    return neighbor_map
+```
+
+### Pattern B: 완전히 새로 작성 (skip-connection ring)
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # neighbor_map은 무시하고 새로 작성
+    return {"E": (rank + 2) % world_size}
+```
+
+### Pattern C: builtin 사용, override 없음
+
+`neighbors()` 함수를 정의하지 않거나 None을 반환:
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    return None  # 명시적으로 builtin 사용
+```
+
+---
+
+## 4. PE 커널 API 레퍼런스 (ADR-0023 D4)
+
+### IPCQ API
+
+| API | 설명 | Blocking? |
+|-----|------|-----------|
+| `tl.send(dir, src=TensorHandle)` | direction으로 데이터 send | Yes (peer slot full 시 wait) |
+| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | 동일, keyword 형태 | Yes |
+| `tl.recv(dir, shape=..., dtype=...)` | 특정 방향에서 blocking recv | Yes |
+| `tl.recv(shape=..., dtype=...)` | 4방향 round-robin recv (방향 미지정) | Yes |
+| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | non-blocking recv | No |
+| `tl.wait(future)` | non-blocking future 완료 대기 → TensorHandle | Yes |
+
+### 기존 TL API (ADR-0020/0022, 그대로 사용 가능)
+
+| API | 설명 |
+|-----|------|
+| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; greenlet 모드에서 `.data`에 ndarray |
+| `tl.store(addr, handle)` | DMA write — handle.data가 있으면 MemoryStore에 propagate |
+| `tl.composite(op, ...)` | GEMM/Math compute 비동기 submit |
+| `tl.program_id(axis=0)` | cube 내 local PE id |
+| `tl.program_id(axis=1)` | cube id (ADR-0022) |
+| `tl.num_programs(axis=0/1)` | topology 슬롯 수 (참여 ranks 수가 아님) |
+
+### `recv` 두 가지 모드
+
+기본은 `return_slot` (zero-copy): IPCQ slot 주소가 그대로 handle.addr에 들어온다.
+slot 데이터를 별도 위치로 복사하고 싶으면 `dst_addr` + `dst_space`를 명시:
+
+```python
+recv = tl.recv(
+    dir="W", shape=(8,), dtype="f16",
+    dst_addr=my_scratch_addr,
+    dst_space="hbm",
+)
+# 이제 recv.addr == my_scratch_addr (copy_to_dst 모드)
+```
+
+---
+
+## 5. Helpers (`kernbench.ccl.helpers`)
+
+알고리즘 코드를 짧게 유지하기 위한 헬퍼들:
+
+```python
+from kernbench.ccl.helpers import chunked, ring_step, tree_step
+```
+
+### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
+
+총 `n_elem` 개의 element를 `n_chunks` 등분한 view 리스트를 반환. 각 `Chunk`는
+`addr`, `n_elem`, `nbytes` 필드를 가진다.
+
+```python
+chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
+# chunks[0..3] 각각 16 element view, addr이 연속
+```
+
+### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
+
+Ring algorithm의 step별 chunk 인덱스 (reduce-scatter / all-gather):
+
+```python
+for step in range(world_size - 1):
+    send_idx, recv_idx = ring_step(rank, step, world_size)
+    tl.send(dir="E", src_addr=chunks[send_idx].addr,
+            nbytes=chunks[send_idx].nbytes,
+            shape=(chunks[send_idx].n_elem,), dtype="f16")
+    recv = tl.recv(dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16")
+    # accumulate ...
+```
+
+### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
+
+Binary tree의 parent/children rank:
+
+```python
+info = tree_step(rank, world_size)
+if info["parent"] is None:
+    print(f"rank {rank} is the root")
+for child in info["children"]:
+    ...
+```
+
+---
+
+## 6. 단위 테스트 — Mock Runtime
+
+`kernbench.ccl.testing.run_kernel_in_mock`은 SimPy를 거치지 않고 알고리즘을
+빠르게 검증할 수 있다.
+
+### 기본 사용법
+
+```python
+from kernbench.ccl.testing import run_kernel_in_mock
+from kernbench.ccl.algorithms.my_algo import kernel
+import numpy as np
+
+
+def test_my_algo():
+    n_elem = 16
+    inputs = [np.arange(n_elem, dtype="f16") + r for r in range(4)]
+    expected = sum(inputs)
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem, 4),  # kernel의 (t_ptr 이후) 추가 positional 인자
+    )
+
+    for r in range(4):
+        assert np.allclose(outputs[r], expected, rtol=1e-3)
+```
+
+### 동작
+
+- 모든 rank의 kernel을 greenlet으로 동시 실행
+- `tl.send/recv`를 in-memory FIFO로 즉시 처리 (DMA, latency 무시)
+- 각 rank가 마지막에 store한 데이터를 ndarray로 반환
+
+### 한계
+
+- latency / 성능 측정 불가 (시뮬레이션이 아님)
+- PE_DMA, fabric, BW 모델 안 함
+- 정합성 검증만 가능
+- 한 cube 안에서 동작하는 가정 — `program_id(axis=1)`은 항상 0
+
+---
+
+## 7. 디버깅
+
+### CCL trace
+
+```bash
+KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
+    --bench ccl_allreduce --verify-data
+```
+
+각 rank의 send/recv 시점이 stdout에 출력된다:
+
+```
+[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
+[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
+...
+```
+
+### Pointer dump
+
+`kernbench.ccl.diagnostics.pointer_dump(engine)`는 모든 PE_IPCQ의 ring buffer
+상태(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`)를 multi-line
+문자열로 반환한다. hang이 발생하면 어느 rank가 어떤 상태에서 막혔는지 한눈에
+보인다.
+
+### Deadlock detection
+
+매칭되지 않는 send/recv 등으로 SimPy 스케줄이 비면 engine이 `IpcqDeadlock`을
+던지며 pointer dump를 메시지에 포함시킨다 (ADR-0023 D14 F3). 별도 wait-for graph
+시각화는 미래 작업.
+
+---
+
+## 8. 흔한 실수
+
+### 1. install 안 된 direction 사용
+
+ccl.yaml의 `topology: ring_1d`는 E/W만 install한다. N/S 사용 시:
+
+```python
+tl.send(dir="N", ...)   # → IpcqInvalidDirection 예외
+```
+
+해결: `topology: mesh_2d`로 바꾸거나, `neighbors()` override로 N/S 추가.
+
+### 2. send만 호출하고 recv 없음
+
+```python
+def kernel(..., tl):
+    for _ in range(100):
+        tl.send(dir="E", ...)
+    # peer 측 recv 없음 → ring buffer 가득 차면 backpressure → deadlock
+```
+
+해결: 모든 send에 짝이 되는 recv가 있어야 한다. 안 그러면 `IpcqDeadlock`이
+발생한다.
+
+### 3. dtype/shape 불일치
+
+기본 모드에서는 dtype/shape mismatch를 검증하지 않는다. 작성자가 직접 보장하거나,
+PE_IPCQ 노드 attrs에 `strict_validation: true`를 설정해 D14 F2 strict 모드로
+mismatch를 즉시 잡을 수 있다.
+
+### 4. round-robin recv의 fairness 가정
+
+`tl.recv()` (방향 미지정)는 round-robin으로 가져오지만, 도착한 첫 슬롯을 반환한다.
+**도착 순서를 알 수 없으므로** 알고리즘이 도착 방향에 의존하면 안 된다.
+필요하면 `tl.recv(dir="N", ...)`처럼 명시.
+
+### 5. CCL 그룹 크기 가정
+
+`tl.num_programs(axis=0/1)`은 토폴로지 슬롯 개수이지 CCL group 크기가 아니다.
+참여하는 rank 수(`world_size`)는 호스트 bench가 알고 있고, kernel 인자로 명시
+전달해야 한다.
+
+### 6. send-source 메모리를 실제 송신 전에 덮어씀
+
+PE_DMA가 송신 시점에 src 데이터를 IpcqDmaToken에 스냅샷하므로 in-flight 데이터의
+의미가 보존된다. 따라서 하나의 PE 안에서 같은 주소를 여러 step에 걸쳐 갱신할 때,
+send가 끝난 뒤 다른 step에서 같은 주소를 store하는 것은 안전하다 (token snapshot 덕분).
+하지만 `tl.send`가 PE_DMA 큐에 enqueue되기 전에 주소를 덮어쓰면 잘못된 데이터가
+스냅샷된다 — `tl.send`를 먼저, 메모리 변경을 나중에 하는 게 권장.
+
+---
+
+## 9. 다음 단계
+
+- `mesh_2d` / `tree_binary` 같은 다른 topology 활용
+- recursive halving/doubling 등 더 빠른 알고리즘
+- `buffer_kind` (tcm/hbm/sram) / `backpressure` (poll/sleep) 모드별 latency 비교
+- 통합 `ccl_allreduce` bench에 서로 다른 `ccl.yaml` overlay를 적용해 더 큰
+  scale에서 검증
+
+새 알고리즘이나 패턴을 추가했다면 PR로 기여해주세요.
+
+---
+
+## 참고
+
+- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective 설계
+- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1)
+- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution
+- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor
+
+기존 알고리즘 예제:
+
+- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — 가장 단순한 send/recv
+- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
+- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
+- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
diff --git a/src/kernbench/ccl/__init__.py b/src/kernbench/ccl/__init__.py
new file mode 100644
index 0000000..aa60e46
--- /dev/null
+++ b/src/kernbench/ccl/__init__.py
@@ -0,0 +1,9 @@
+"""CCL (Collective Communication Library) framework for kernbench (ADR-0023).
+
+This package provides:
+    - topologies: builtin neighbor topology generators (ring/mesh/tree)
+    - helpers:    utilities for algorithm authors (chunked, ring_step, ...)
+    - testing:    mock CCL runtime for fast unit tests of algorithm kernels
+
+See docs/adr/ADR-0023-ipcq-pe-collective.md and docs/ccl-author-guide.md.
+"""
diff --git a/src/kernbench/ccl/algorithms/__init__.py b/src/kernbench/ccl/algorithms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/ccl/algorithms/hello_send.py b/src/kernbench/ccl/algorithms/hello_send.py
new file mode 100644
index 0000000..9275be1
--- /dev/null
+++ b/src/kernbench/ccl/algorithms/hello_send.py
@@ -0,0 +1,29 @@
+"""Hello-world CCL kernel for the docs/ccl-author-guide.md walkthrough.
+
+Each PE sends its tile to the E neighbor and receives one tile from W,
+then stores the received tile back into its own HBM slice. The simplest
+possible demonstration of ``tl.send`` / ``tl.recv``.
+"""
+from __future__ import annotations
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    """Return the positional kernel arguments for the ahbm backend: ``(n_elem,)``."""
+    return (n_elem,)  # world_size is unused: the kernel takes only n_elem after t_ptr
+
+
+def kernel(t_ptr, n_elem, tl):  # one send to E, one recv from W, per rank
+    local_pe = tl.program_id(axis=0)  # position along grid axis 0
+    cube_id = tl.program_id(axis=1)  # position along grid axis 1
+    pes_per_cube = tl.num_programs(axis=0)  # extent of grid axis 0
+    rank = cube_id * pes_per_cube + local_pe  # linearised global rank
+    nbytes = n_elem * 2  # tile size in bytes: f16 is 2 bytes per element
+    pe_addr = t_ptr + rank * nbytes  # start of this rank's slice
+
+    # Send our local HBM tile to the E neighbor.
+    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    tl.send(dir="E", src=src)
+
+    # Receive a tile from W and store it into our slice (overwrite).
+    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+    tl.store(pe_addr, recv)
diff --git a/src/kernbench/ccl/algorithms/mesh_allreduce.py b/src/kernbench/ccl/algorithms/mesh_allreduce.py
new file mode 100644
index 0000000..e668c5c
--- /dev/null
+++ b/src/kernbench/ccl/algorithms/mesh_allreduce.py
@@ -0,0 +1,73 @@
+"""2D-mesh all-reduce kernel (ADR-0023).
+
+Two-phase reduce on a square mesh of side ``S`` (world_size = S*S):
+  1. Row reduce: ring all-reduce along E/W within each row.
+  2. Column reduce: ring all-reduce along N/S within each column.
+
+After both phases, every rank holds the global sum.
+
+Uses TensorHandle math (PE_MATH) for accumulation. Op_log captures the
+data flow so Phase 2 produces correct final HBM contents. Math/recv
+handles are passed directly to the next send, avoiding store→reload
+which doesn't propagate correctly with timing-only Phase 1 math.
+"""
+from __future__ import annotations
+
+import math
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    """Return ``(n_elem, side)``, the positional kernel args for the ahbm backend.
+
+    Mesh all-reduce requires ``world_size`` to be a perfect square; ``side``
+    is ``sqrt(world_size)``. Raises ``ValueError`` for any other value.
+    """
+    side = int(round(math.sqrt(world_size)))  # NOTE(review): math.isqrt would skip the float step
+    if side * side != world_size:  # exact integer check on the rounded float sqrt
+        raise ValueError(
+            f"mesh_allreduce requires a square world_size; got {world_size}"
+        )
+    return (n_elem, side)
+
+
+def kernel(t_ptr, n_elem, side, tl):
+    """All-reduce on a square mesh; the result is stored back to this rank's slice.
+
+    Args:
+        t_ptr: HBM base address (column-sharded VA shared across ranks)
+        n_elem: number of f16 elements per tile
+        side: mesh side length (sqrt(world_size))
+        tl: TLContext (ADR-0022).
+    """
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe  # linearised global rank
+    nbytes = n_elem * 2  # tile size in bytes: f16 is 2 bytes per element
+
+    pe_addr = t_ptr + rank * nbytes
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    current = acc  # the first row send forwards the local tile
+
+    # ── Row phase: ring along the row (send E, recv W) ──
+    # Ring forwards each received tile (not the cumulative acc) so every
+    # tile passes through every rank exactly once.
+    for _ in range(side - 1):
+        tl.send(dir="E", src=current)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        current = recv
+
+    # The column ring starts from the row-phase accumulator. We do NOT
+    # store/reload here — the math handle's scratch addr is the source for
+    # the first column send; ADR-0020 Phase 2 (data replay) ipcq_copy reads it.
+    current = acc
+
+    # ── Column phase: ring along the column (send S, recv N) ──
+    for _ in range(side - 1):
+        tl.send(dir="S", src=current)
+        recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        current = recv
+
+    tl.store(pe_addr, acc)
diff --git a/src/kernbench/ccl/algorithms/ring_allreduce.py b/src/kernbench/ccl/algorithms/ring_allreduce.py
new file mode 100644
index 0000000..ac8c9dd
--- /dev/null
+++ b/src/kernbench/ccl/algorithms/ring_allreduce.py
@@ -0,0 +1,80 @@
+"""Ring all-reduce kernel for IPCQ-based PE collective (ADR-0023).
+
+Algorithm: 1D ring of N PEs, each PE starts with one tile of data.
+After ``world_size - 1`` rounds, every PE's accumulator holds the sum
+of all PE tiles.
+
+Strategy
+--------
+Each PE starts with its own tile in HBM. The kernel:
+1. Loads the local tile into a TensorHandle (the accumulator).
+2. In each of ``world_size - 1`` rounds:
+   - Sends its own tile (round 1) or the tile received last round to E.
+   - Receives a tile from the W neighbor — the recv handle points
+     into the per-direction TCM slot.
+   - Adds the received tile to the accumulator using the TensorHandle
+     operator overload, which dispatches to ``MathCmd`` (PE_MATH).
+3. Stores the final accumulator back to HBM via tl.store. The store is
+   recorded in op_log with both src and dst, so Phase 2 will copy the
+   replayed math result from PE-local scratch into HBM.
+
+ADR-0020 D3 split: Phase 1 simulates timing only — math results are
+not yet computed, so the accumulator data flowing through Phase 1 may
+be stale. Phase 2's DataExecutor replays math + IPCQ copies + dma_write
+in stable t_start order, producing correct final HBM contents.
+"""
+from __future__ import annotations
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    """Build the positional kernel arguments for the ahbm backend.
+
+    The ring kernel expects (n_elem, world_size) after the tensor pointer.
+    """
+    return n_elem, world_size
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    """Ring all-reduce: add every received tile, store the sum in place.
+
+    Args:
+        t_ptr: HBM base address of the column-sharded tensor — all PEs
+               share this base. The per-PE slice lives at
+               ``t_ptr + global_rank * n_elem * 2``.
+        n_elem: number of f16 elements per tile.
+        world_size: total number of participating ranks (passed by host).
+        tl: TLContext (auto-injected, ADR-0022). The kernel derives the
+            global rank from ``program_id(axis=0)`` (local PE) and
+            ``program_id(axis=1)`` (cube id):
+
+                rank = cube_id * pes_per_cube + local_pe
+    """
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+    nbytes = n_elem * 2  # f16: 2 bytes per element
+
+    # Each PE reads from, and finally writes to, its own slice of the base.
+    pe_addr = t_ptr + rank * nbytes
+
+    # Load the local tile — handle points at HBM[pe_addr].
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    # The ring forwards each received tile to the next neighbor (NOT the
+    # cumulative accumulator), so every rank's tile passes through every
+    # rank exactly once. The accumulator sums the new arrival each round.
+    current = acc
+
+    for _step in range(world_size - 1):
+        tl.send(dir="E", src=current)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        # TensorHandle add → MathCmd → PE_MATH (timing in Phase 1, real
+        # numpy in Phase 2 via DataExecutor). The result handle lives at
+        # an auto-allocated PE-local scratch addr.
+        acc = acc + recv
+        current = recv  # forward W's tile to E next round
+
+    # Final result back to this PE's HBM slice. Op_log captures the
+    # source (scratch addr) and dst (HBM slice) so Phase 2 copies the
+    # accumulated value into HBM for verification.
+    tl.store(pe_addr, acc)
diff --git a/src/kernbench/ccl/algorithms/tree_allreduce.py b/src/kernbench/ccl/algorithms/tree_allreduce.py
new file mode 100644
index 0000000..9462846
--- /dev/null
+++ b/src/kernbench/ccl/algorithms/tree_allreduce.py
@@ -0,0 +1,80 @@
+"""Tree all-reduce kernel for IPCQ-based PE collective (ADR-0023).
+
+Two-phase binary tree all-reduce:
+
+  Phase 1 (reduce up):
+    - leaf nodes send their value to ``parent``
+    - internal nodes recv from each child, sum, then send to ``parent``
+    - root accumulates child contributions; final acc holds global sum
+
+  Phase 2 (broadcast down):
+    - root sends acc to ``child_left`` and ``child_right`` (if present)
+    - internal nodes recv from ``parent``, then forward to children
+    - all ranks store the final acc to HBM
+
+Uses TensorHandle math (PE_MATH) for accumulation. Op_log captures the
+data flow so Phase 2 produces correct final HBM contents. The kernel
+deliberately avoids the store→reload→send pattern: math/recv handles
+are passed directly to the next send so PE_DMA snapshots a deterministic
+source addr that Phase 2 can replay.
+"""
+from __future__ import annotations
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    """Positional kernel arguments for the ahbm backend: (n_elem, world_size)."""
+    return n_elem, world_size
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    """Tree all-reduce: reduce up to rank 0, then broadcast back down.
+
+    Args:
+        t_ptr: HBM base address; this rank's tile is at t_ptr + rank * n_elem * 2.
+        n_elem: number of f16 elements per tile.
+        world_size: total number of participating ranks (passed by host).
+        tl: TLContext (ADR-0022). Global rank from program_id(0/1).
+    """
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+    nbytes = n_elem * 2  # f16: 2 bytes per element
+
+    pe_addr = t_ptr + rank * nbytes
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+
+    # Compute children/parent existence (matches tree_binary topology generator)
+    has_parent = rank > 0  # rank 0 is the root
+    left = 2 * rank + 1
+    right = 2 * rank + 2
+    has_left = left < world_size
+    has_right = right < world_size
+
+    # ── Phase 1: reduce up ──
+    if has_left:
+        recv = tl.recv(dir="child_left", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+    if has_right:
+        recv = tl.recv(dir="child_right", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+
+    if has_parent:
+        # Send the math/load handle directly — its addr is either the
+        # original HBM tile (leaf) or the PE-local scratch where the
+        # accumulator lives. Phase 2 ipcq_copy replays from the same addr.
+        tl.send(dir="parent", src=acc)
+
+    # ── Phase 2: broadcast down ──
+    if has_parent:
+        # Replace acc with the value broadcast from the parent (the global
+        # sum). The recv handle points at the parent-direction TCM slot.
+        acc = tl.recv(dir="parent", shape=(n_elem,), dtype="f16")
+
+    if has_left:
+        tl.send(dir="child_left", src=acc)
+    if has_right:
+        tl.send(dir="child_right", src=acc)
+
+    # Final store to HBM for the bench's verification path.
+    tl.store(pe_addr, acc)
diff --git a/src/kernbench/ccl/diagnostics.py b/src/kernbench/ccl/diagnostics.py
new file mode 100644
index 0000000..6dec58e
--- /dev/null
+++ b/src/kernbench/ccl/diagnostics.py
@@ -0,0 +1,127 @@
+"""CCL diagnostics: trace + pointer dump + deadlock (ADR-0023 D14).
+
+Trace
+-----
+Set ``KERNBENCH_CCL_TRACE`` to ``1``, ``true``, ``yes`` or ``on`` (any case)
+to enable per-event logging of CCL send/recv to stdout. Off by default.
+
+Pointer dump
+------------
+``pointer_dump(engine)`` returns a multi-line string showing every PE_IPCQ's
+ring buffer state (my_head, my_tail, peer_head_cache, peer_tail_cache).
+Useful for diagnosing hangs.
+
+Deadlock
+--------
+``IpcqDeadlock`` is raised by the engine when SimPy's schedule empties
+while a request is still pending — typical of unmatched send/recv pairs.
+The exception message includes the pointer dump.
+"""
+from __future__ import annotations
+
+import os
+from typing import Any
+
+
+class IpcqDeadlock(RuntimeError):
+    """Signals that the simulation can make no further progress although a
+    CCL request is still outstanding (D14 F3)."""
+
+
+# ── Trace toggle ─────────────────────────────────────────────────────
+
+
+_TRACE_ENABLED: bool = False
+
+
+def reload_trace_setting() -> None:
+    """Refresh the trace flag from the ``KERNBENCH_CCL_TRACE`` env var."""
+    global _TRACE_ENABLED
+    raw = os.environ.get("KERNBENCH_CCL_TRACE", "").strip().lower()
+    _TRACE_ENABLED = raw in ("1", "true", "yes", "on")
+
+
+def trace_enabled() -> bool:
+    return _TRACE_ENABLED  # flag last set by reload_trace_setting()
+
+
+# Initialise once at import time
+reload_trace_setting()
+
+
+# ── Trace event functions ────────────────────────────────────────────
+
+
+def log_send(
+    t_ns: float,
+    sender: str,
+    direction: str,
+    nbytes: int,
+    sender_seq: int,
+) -> None:
+    """Emit one ``send`` trace line on stdout (no-op when tracing is off)."""
+    if _TRACE_ENABLED:
+        line = (
+            f"[ccl t={t_ns:.1f} send] {sender} dir={direction} nbytes={nbytes} seq={sender_seq}"
+        )
+        print(line, flush=True)
+
+
+def log_recv(
+    t_ns: float,
+    receiver: str,
+    direction: str,
+    nbytes: int,
+) -> None:
+    """Emit one ``recv`` trace line on stdout (no-op when tracing is off)."""
+    if _TRACE_ENABLED:
+        line = (
+            f"[ccl t={t_ns:.1f} recv] {receiver} dir={direction} nbytes={nbytes}"
+        )
+        print(line, flush=True)
+
+
+def log_credit_return(
+    t_ns: float,
+    sender: str,
+    direction: str,
+    consumer_seq: int,
+) -> None:
+    """Emit one ``credit`` trace line on stdout (no-op when tracing is off)."""
+    if _TRACE_ENABLED:
+        line = (
+            f"[ccl t={t_ns:.1f} credit] {sender} dir={direction} seq={consumer_seq}"
+        )
+        print(line, flush=True)
+
+
+# ── Pointer dump ─────────────────────────────────────────────────────
+
+
+def pointer_dump(engine: Any) -> str:
+    """Render every PE_IPCQ's ring-buffer pointers as a multi-line string."""
+    registry = getattr(engine, "_components", {})
+    report: list[str] = []
+    for name in sorted(registry):
+        # Only IPCQ components that actually have queue pairs are listed.
+        if name.endswith(".pe_ipcq"):
+            pairs = getattr(registry[name], "queue_pairs", {})
+        else:
+            pairs = None
+        if pairs:
+            report.append(name)
+            for direction in sorted(pairs):
+                entry = pairs[direction]
+                peer = entry["peer"]
+                report.append(
+                    f"  {direction}: peer=sip{peer.sip}.cube{peer.cube}.pe{peer.pe}  "
+                    f"my_head={entry['my_head']} my_tail={entry['my_tail']}  "
+                    f"peer_head_cache={entry['peer_head_cache']} "
+                    f"peer_tail_cache={entry['peer_tail_cache']}"
+                )
+    return "\n".join(report)
+
+
+def print_pointer_dump(engine: Any) -> None:
+    """Write the ``pointer_dump(engine)`` report to stdout, flushing at once."""
+    print(pointer_dump(engine), flush=True)
diff --git a/src/kernbench/ccl/helpers.py b/src/kernbench/ccl/helpers.py
new file mode 100644
index 0000000..cc3f240
--- /dev/null
+++ b/src/kernbench/ccl/helpers.py
@@ -0,0 +1,118 @@
+"""Helpers for CCL algorithm authors (ADR-0023 D15).
+
+These are pure utility functions usable from any kernel module:
+
+    from kernbench.ccl.helpers import chunked, ring_step, tree_step
+
+They keep algorithm code short and free of off-by-one bugs.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+
+_DTYPE_BYTES = {
+    "f16": 2, "fp16": 2, "float16": 2, "bf16": 2,
+    "f32": 4, "fp32": 4, "float32": 4,
+    "i8": 1, "int8": 1,
+    "i16": 2, "int16": 2,
+    "i32": 4, "int32": 4,
+}
+
+
+def _itemsize(dtype: str) -> int:
+    if dtype not in _DTYPE_BYTES:  # only names listed in _DTYPE_BYTES are sized
+        raise ValueError(f"Unsupported dtype: {dtype}")
+    return _DTYPE_BYTES[dtype]  # bytes per element
+
+
+# ── chunked ──────────────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class Chunk:
+    """One slice of a buffer used by collective algorithms (see ``chunked``)."""
+
+    addr: int  # start address of the slice
+    n_elem: int  # number of elements in the slice
+    nbytes: int  # size of the slice in bytes
+
+
+def chunked(
+    base_addr: int,
+    n_chunks: int,
+    n_elem: int,
+    dtype: str = "f16",
+) -> list[Chunk]:
+    """Slice a 1D buffer into ``n_chunks`` equal Chunks.
+
+    Args:
+        base_addr: starting address of the buffer.
+        n_chunks: number of equal chunks to produce (must be at least 1).
+        n_elem: total number of elements (must be divisible by n_chunks).
+        dtype: element type for byte-size calculation.
+
+    Returns:
+        List of ``Chunk`` objects whose addresses are consecutive.
+
+    Raises:
+        ValueError: if n_chunks < 1 or n_elem is not divisible by n_chunks.
+    """
+    # Reject n_chunks <= 0 up front: 0 would divide by zero, < 0 yields [].
+    if n_chunks < 1 or n_elem % n_chunks != 0:
+        raise ValueError(
+            f"chunked: n_elem ({n_elem}) not divisible by n_chunks ({n_chunks})"
+        )
+    per_chunk_elem = n_elem // n_chunks
+    per_chunk_bytes = per_chunk_elem * _itemsize(dtype)
+    return [
+        Chunk(
+            addr=base_addr + i * per_chunk_bytes,
+            n_elem=per_chunk_elem,
+            nbytes=per_chunk_bytes,
+        )
+        for i in range(n_chunks)
+    ]
+
+
+# ── ring_step ────────────────────────────────────────────────────────
+
+
+def ring_step(rank: int, step: int, world_size: int) -> tuple[int, int]:
+    """Compute ``(send_chunk_idx, recv_chunk_idx)`` for one ring step.
+
+    Follows the usual reduce-scatter / all-gather ring schedule: during
+    step s, rank r sends chunk (r - s) and receives chunk (r - s - 1),
+    both taken modulo world_size.
+
+    Typical use inside a ring all-reduce kernel:
+
+        for step in range(world_size - 1):
+            send_idx, recv_idx = ring_step(rank, step, world_size)
+            tl.send(dir="E", src=chunks[send_idx])
+            chunks[recv_idx] += tl.recv(dir="W").data
+    """
+    offset = rank - step
+    # The chunk received is always the one just before the chunk sent.
+    return offset % world_size, (offset - 1) % world_size
+
+
+# ── tree_step ────────────────────────────────────────────────────────
+
+
+def tree_step(rank: int, world_size: int) -> dict[str, Any]:
+    """Locate ``rank`` in the binary tree rooted at rank 0 (kids 2r+1, 2r+2).
+
+    Returns:
+        ``{"parent": int|None, "children": list[int]}``
+    """
+    first_child = 2 * rank + 1
+    children = [
+        child for child in (first_child, first_child + 1) if child < world_size
+    ]
+    if rank > 0:
+        parent = (rank - 1) // 2
+    else:
+        parent = None
+    return {"parent": parent, "children": children}
diff --git a/src/kernbench/ccl/install.py b/src/kernbench/ccl/install.py
new file mode 100644
index 0000000..4218763
--- /dev/null
+++ b/src/kernbench/ccl/install.py
@@ -0,0 +1,266 @@
+"""IPCQ install plan for AhbmCCLBackend (ADR-0023 D10/D11/D12).
+
+Given a ccl.yaml config, the topology, and the engine, this module:
+
+1. Loads ccl.yaml and resolves the chosen algorithm.
+2. Maps each rank to a (sip, cube, pe) PE address using a linear scheme.
+3. Allocates per-rank IPCQ ring buffer base addresses (synthetic but
+   unique-per-PE; see notes below).
+4. Builds neighbor tables via the algorithm's ``topology`` field plus the
+   optional ``neighbors()`` override hook from the algorithm module.
+5. Wires bidirectional credit-return SimPy Stores between every (PE, peer)
+   pair.
+6. Installs each PE_IPCQ component's neighbor table directly via its
+   ``_install_neighbors`` sideband call (equivalent to fan-out IpcqInitMsg
+   without going through fabric).
+
+Address scheme
+--------------
+For the first implementation we use a synthetic address scheme that
+guarantees uniqueness per (sip, cube, pe, direction) without going
+through ``PEMemAllocator``. The address is encoded as:
+
+    base = IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
+    rx_base[direction_idx] = base + direction_idx * (n_slots * slot_size)
+
+The ``buffer_kind`` (tcm/hbm/sram) selects the *MemoryStore space* into
+which data is written. Within a space, addresses are unique per PE so
+the existing MemoryStore (``{space: {addr: ndarray}}``) handles them
+naturally.
+
+This bypasses the topology's address resolver / PhysAddr encoding and
+treats IPCQ buffers as a separate, parallel address namespace. Real PA
+encoding can be plugged in later without changing the rest of the design.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import simpy
+import yaml
+
+from kernbench.ccl.topologies import resolve_topology
+from kernbench.common.ipcq_types import (
+    IpcqEndpoint,
+    IpcqInitEntry,
+)
+from kernbench.runtime_api.kernel import IpcqInitMsg
+
+
+# IPCQ synthetic address space top bit
+_IPCQ_BASE = 1 << 60
+
+
+def _ipcq_base_for_pe(sip: int, cube: int, pe: int) -> int:  # synthetic per-PE base
+    return _IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)  # fields OR-ed, not range-checked
+
+
+# ── ccl.yaml loading ─────────────────────────────────────────────────
+
+
+def load_ccl_config(path: str | Path | None = None) -> dict:
+    """Load and validate ccl.yaml; with no path, search cwd then project root."""
+    if path is None:
+        candidates = (
+            Path.cwd() / "ccl.yaml",
+            Path(__file__).resolve().parents[3] / "ccl.yaml",
+        )
+        # First existing candidate wins; cwd takes precedence.
+        path = next((p for p in candidates if p.exists()), None)
+    if path is None:
+        raise FileNotFoundError(
+            "ccl.yaml not found. Place it at project root or cwd."
+        )
+    with open(path, encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+    # An empty file parses to None; a list/scalar top level is invalid too.
+    if not isinstance(cfg, dict):
+        raise ValueError("ccl.yaml must contain a top-level mapping")
+    for section in ("defaults", "algorithms"):
+        if section not in cfg:
+            raise ValueError(f"ccl.yaml missing '{section}' section")
+    return cfg
+
+
+def resolve_algorithm_config(cfg: dict, name: str | None = None) -> dict:
+    """Merge ``defaults`` with the chosen algorithm's overrides.
+
+    ``name`` wins over ``defaults.algorithm``. Returns a new flat dict (the
+    input is not mutated) with the resolved name stored under "algorithm".
+    """
+    merged = dict(cfg.get("defaults") or {})
+    algo_name = name or merged.get("algorithm")
+    if algo_name is None:
+        raise ValueError("ccl.yaml: defaults.algorithm not set")
+    algos = cfg.get("algorithms") or {}
+    if algo_name not in algos:
+        raise ValueError(
+            f"ccl.yaml: algorithm '{algo_name}' not in algorithms section"
+        )
+    # A bare ``name:`` entry parses to None: treat it as "no overrides".
+    merged.update(algos[algo_name] or {})
+    merged["algorithm"] = algo_name
+    return merged
+
+
+# ── rank → PE mapping ────────────────────────────────────────────────
+
+
+def linear_rank_to_pe(rank: int, spec: dict) -> tuple[int, int, int]:
+    """Map a rank to (sip, cube, pe): PEs fill a cube, cubes fill a SIP.
+
+    Raises ValueError when ``rank`` is negative or past the last PE.
+    """
+    mesh = spec["sip"]["cube_mesh"]
+    pe_layout = spec["cube"]["pe_layout"]
+    pes_per_cube = int(pe_layout["pe_per_corner"]) * len(pe_layout["corners"])
+    pes_per_sip = int(mesh["w"]) * int(mesh["h"]) * pes_per_cube
+    total = int(spec["system"]["sips"]["count"]) * pes_per_sip
+    if rank < 0:
+        raise ValueError(f"rank {rank} must be non-negative")
+    if rank >= total:
+        raise ValueError(f"rank {rank} exceeds total PE count {total}")
+    sip, rem = divmod(rank, pes_per_sip)
+    cube, pe = divmod(rem, pes_per_cube)
+    return sip, cube, pe
+
+
+# ── Install plan ─────────────────────────────────────────────────────
+
+
+def install_ipcq(
+    engine: Any,
+    spec: dict,
+    cfg: dict,
+    algo_module: Any | None = None,
+    rank_to_pe: list[tuple[int, int, int]] | None = None,
+) -> dict[str, Any]:
+    """Build neighbor tables and install them in every participating PE_IPCQ.
+
+    Args:
+        engine: GraphEngine exposing ``_components`` and ``_env``
+        spec: topology spec dict
+        cfg: merged algorithm config (from ``resolve_algorithm_config``)
+        algo_module: optional algorithm Python module (for neighbors override)
+        rank_to_pe: optional explicit rank → (sip, cube, pe) mapping. If
+                    None, the default linear mapping is used.
+
+    Returns:
+        A diagnostics dict: world_size, rank_to_pe, buffer_kind, neighbor_table.
+    """
+    if "world_size" in cfg:
+        world_size = int(cfg["world_size"])
+    else:
+        # Topology-derived fallback (mirrors AhbmCCLBackend / RuntimeContext).
+        sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+        cm = spec.get("sip", {}).get("cube_mesh", {})
+        cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1))
+        pl = spec.get("cube", {}).get("pe_layout", {})
+        corners = pl.get("corners", [])
+        pe_per_corner = int(pl.get("pe_per_corner", 1))
+        pes_per_cube = pe_per_corner * max(len(corners), 1)
+        world_size = sips * cubes_per_sip * pes_per_cube
+    buffer_kind = cfg["buffer_kind"]
+    n_slots = int(cfg["n_slots"])
+    slot_size = int(cfg["slot_size"])
+    backpressure = cfg["backpressure"]
+    credit_size_bytes = int(cfg.get("ipcq_credit_size_bytes", 16))  # 16 when unset
+
+    # Step 1: rank → (sip, cube, pe)
+    if rank_to_pe is not None:
+        if len(rank_to_pe) != world_size:
+            raise ValueError(
+                f"rank_to_pe has {len(rank_to_pe)} entries but world_size={world_size}"
+            )
+        rank_pe = list(rank_to_pe)
+    else:
+        rank_pe: list[tuple[int, int, int]] = [
+            linear_rank_to_pe(r, spec) for r in range(world_size)
+        ]
+    pe_to_rank = {(s, c, p): r for r, (s, c, p) in enumerate(rank_pe)}  # not read below
+
+    # Step 2: resolve topology fn (with optional override)
+    topo_fn = resolve_topology(cfg["topology"], algo_module=algo_module)
+
+    # Build per-rank neighbor map
+    neighbor_table: dict[int, dict[str, int]] = {}
+    for r in range(world_size):
+        neighbor_table[r] = topo_fn(r, world_size)
+
+    # Step 3: pull the live engine reference for each PE_IPCQ
+    components = engine._components
+    pe_ipcq_id = lambda s, c, p: f"sip{s}.cube{c}.pe{p}.pe_ipcq"
+
+    # Step 4: per-PE rx_base address and per-PE credit_inbox
+    direction_keys = sorted({d for nt in neighbor_table.values() for d in nt})
+    direction_idx = {d: i for i, d in enumerate(direction_keys)}
+    bytes_per_direction = n_slots * slot_size
+
+    def rx_base(s: int, c: int, p: int, d: str) -> int:
+        return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
+
+    # Collect each rank's credit inbox (a SimPy Store) so that peers can be
+    # handed a reference to it via IpcqInitEntry.peer_credit_store below.
+    rank_to_credit_inbox: dict[int, simpy.Store] = {}
+    for r, (s, c, p) in enumerate(rank_pe):
+        comp = components[pe_ipcq_id(s, c, p)]
+        # Trigger lazy creation of credit_inbox if not yet started.
+        # PE_IPCQ.start() creates it; we ensure it exists.
+        if comp._credit_inbox is None:
+            comp._credit_inbox = simpy.Store(engine._env)
+        rank_to_credit_inbox[r] = comp.credit_inbox
+
+    # Step 5: build IpcqInitMsg per rank and call _install_neighbors directly
+    plan: dict[str, Any] = {
+        "world_size": world_size,
+        "rank_to_pe": rank_pe,
+        "buffer_kind": buffer_kind,
+        "neighbor_table": neighbor_table,
+    }
+
+    def reverse_direction(my_rank: int, peer_rank: int) -> str | None:
+        """Return the first direction in peer's table that maps to my_rank, else None."""
+        for d, target in neighbor_table[peer_rank].items():
+            if target == my_rank:
+                return d
+        return None
+
+    for r, (s, c, p) in enumerate(rank_pe):
+        my_pe_ipcq = components[pe_ipcq_id(s, c, p)]
+        nbrs = neighbor_table[r]
+        entries: list[IpcqInitEntry] = []
+        for d, peer_rank in nbrs.items():
+            if peer_rank is None:
+                continue
+            peer_s, peer_c, peer_p = rank_pe[peer_rank]
+            peer_dir = reverse_direction(r, peer_rank)  # NOTE(review): first match only; ambiguous if peer lists r twice
+            if peer_dir is None:
+                # Peer has no direction pointing back at r — skip (asymmetric topology)
+                continue
+            peer_endpoint = IpcqEndpoint(
+                sip=peer_s, cube=peer_c, pe=peer_p,
+                buffer_kind=buffer_kind,
+                rx_base_pa=rx_base(peer_s, peer_c, peer_p, peer_dir),
+                rx_base_va=0,
+                n_slots=n_slots, slot_size=slot_size,
+            )
+            entries.append(IpcqInitEntry(
+                direction=d,
+                peer=peer_endpoint,
+                my_rx_base_pa=rx_base(s, c, p, d),
+                my_rx_base_va=0,
+                n_slots=n_slots, slot_size=slot_size,
+                peer_credit_store=rank_to_credit_inbox[peer_rank],
+            ))
+        msg = IpcqInitMsg(
+            correlation_id="ccl_init", request_id=f"init_r{r}",
+            target_sips=(s,), target_cubes=(c,), target_pe=p,
+            entries=tuple(entries),
+            backpressure_mode=backpressure,
+            buffer_kind=buffer_kind,
+            credit_size_bytes=credit_size_bytes,
+        )
+        my_pe_ipcq._install_neighbors(msg)
+
+    return plan
diff --git a/src/kernbench/ccl/testing.py b/src/kernbench/ccl/testing.py
new file mode 100644
index 0000000..2d099ef
--- /dev/null
+++ b/src/kernbench/ccl/testing.py
@@ -0,0 +1,465 @@
+"""Mock CCL runtime for fast unit tests of algorithm kernels (ADR-0023 D15).
+
+Runs a kernel function once per rank with a minimal ``tl`` shim — no SimPy,
+no PE_DMA, no fabric simulation. Just enough to verify *functional*
+correctness of an IPCQ-based collective algorithm.
+
+Cross-rank send/recv is implemented with greenlet cooperative scheduling
+plus per-(rank, direction) FIFO queues. Backpressure is not modeled —
+queues are unbounded.
+
+Typical usage in a test::
+
+    from kernbench.ccl.testing import run_kernel_in_mock
+    from kernbench.ccl.algorithms.ring_allreduce import kernel
+
+    inputs = [np.full(16, r + 1, dtype=np.float16) for r in range(4)]
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel, world_size=4, topology="ring_1d",
+        inputs=inputs, kernel_args=(16,),
+    )
+    for r in range(4):
+        assert np.allclose(outputs[r], sum(inputs))
+"""
+from __future__ import annotations
+
+from collections import deque
+from typing import Any, Callable
+
+import numpy as np
+from greenlet import greenlet
+
+from kernbench.ccl.topologies import resolve_topology
+from kernbench.common.ipcq_types import IpcqInvalidDirection
+from kernbench.common.pe_commands import TensorHandle
+
+
+# ── Per-rank fake state ──────────────────────────────────────────────
+
+
+class _MockRankState:
+    """Per-rank mock state: private HBM/TCM dicts, recv FIFOs and the output."""
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        neighbors: dict[str, int],
+        input_arr: np.ndarray,
+    ) -> None:
+        self.rank = rank
+        self.world_size = world_size
+        self.neighbors = neighbors  # direction → peer rank
+        # HBM "memory": addr → ndarray. Per-rank, no cross-rank sharing.
+        self._hbm: dict[int, np.ndarray] = {}
+        self._tcm: dict[int, np.ndarray] = {}  # starts empty; not filled here
+        # ``t_ptr`` is the address the kernel sees. Real benches use a
+        # column-sharded VA so each rank reads from ``t_ptr + rank*nbytes``.
+        # Mirror that here: each rank's slice lives at the rank-specific addr.
+        nbytes = int(input_arr.nbytes)
+        self.t_ptr = 0  # base; per-rank offset is rank * nbytes
+        self._slice_addr = rank * nbytes
+        self._hbm[self._slice_addr] = input_arr.copy()  # copy: caller's array stays untouched
+        # Inbound recv FIFOs: direction → deque[ndarray]
+        self.recv_q: dict[str, deque[np.ndarray]] = {d: deque() for d in neighbors}
+        # Output (set when kernel calls tl.store at slice address)
+        self.output: np.ndarray | None = None
+        # Greenlet for this rank — set later
+        self.g: greenlet | None = None
+
+
+# ── Mock TLContext ───────────────────────────────────────────────────
+
+
+class _MockTL:
+    """Drop-in tl shim for mock runtime.
+
+    Supports the subset of TLContext API that algorithm authors use:
+    program_id, num_programs, load, store, send, recv, recv_async, wait,
+    plus arithmetic operations on TensorHandle (eager numpy execution,
+    no SimPy involved).
+    """
+
+    def __init__(self, state: _MockRankState, scheduler: "_MockScheduler") -> None:
+        self._state = state  # per-rank memory, neighbors and recv queues
+        self._scheduler = scheduler  # kept for later use; not touched here
+        self._handle_counter = 0  # incremented by _next_id()
+
+    def _next_id(self) -> str:
+        self._handle_counter += 1  # first id handed out is "mt1"
+        return f"mt{self._handle_counter}"
+
+    @property
+    def rank(self) -> int:
+        return self._state.rank  # rank stored in the wrapped _MockRankState
+
+    @property
+    def world_size(self) -> int:
+        return self._state.world_size  # world size stored in the wrapped state
+
+    # axis-aware
+    def program_id(self, axis: int = 0) -> int:
+        return self._state.rank if axis == 0 else 0  # axis 0 carries the rank; other axes read 0
+
+    def num_programs(self, axis: int = 0) -> int:
+        return self._state.world_size if axis == 0 else 1  # axis 0 spans all ranks; other axes have size 1
+
+    # ── arithmetic ops (called by TensorHandle.__add__ etc.) ──
+
+    def _binary_math(self, op: str, a: TensorHandle, b: TensorHandle) -> TensorHandle:
+        a_data = np.asarray(a.data) if a.data is not None else None
+        b_data = np.asarray(b.data) if b.data is not None else None
+        if a_data is None or b_data is None:
+            result = None  # operand without data: no result data, op is not even checked
+        elif op == "add":
+            result = a_data + b_data
+        elif op == "sub":
+            result = a_data - b_data
+        elif op == "mul":
+            result = a_data * b_data
+        elif op == "div":
+            result = a_data / b_data
+        elif op == "maximum":
+            result = np.maximum(a_data, b_data)
+        elif op == "minimum":
+            result = np.minimum(a_data, b_data)
+        else:
+            raise NotImplementedError(f"mock _binary_math: op {op!r} not implemented")
+        return TensorHandle(
+            id=self._next_id(),
+            addr=0, shape=a.shape, dtype=a.dtype,  # metadata is copied from ``a`` only
+            nbytes=int(np.prod(a.shape)) * 2 if a.shape else 0,  # NOTE(review): 2 bytes/element whatever a.dtype is
+            data=result, space="tcm",
+        )
+
+    def maximum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
+        return self._binary_math("maximum", a, b)  # np.maximum on the operands' data
+
+    def minimum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
+        return self._binary_math("minimum", a, b)  # np.minimum on the operands' data
+
+    def fma(
+        self, a: TensorHandle, b: TensorHandle, c: TensorHandle,
+    ) -> TensorHandle:
+        a_data = np.asarray(a.data) if a.data is not None else None
+        b_data = np.asarray(b.data) if b.data is not None else None
+        c_data = np.asarray(c.data) if c.data is not None else None
+        result = (
+            a_data * b_data + c_data  # separate multiply then add
+            if (a_data is not None and b_data is not None and c_data is not None)
+            else None  # any operand without data yields a handle without data
+        )
+        return TensorHandle(
+            id=self._next_id(),
+            addr=0, shape=a.shape, dtype=a.dtype,  # metadata is copied from ``a`` only
+            nbytes=int(np.prod(a.shape)) * 2 if a.shape else 0,  # NOTE(review): 2 bytes/element whatever a.dtype is
+            data=result, space="tcm",
+        )
+
+    def clamp(
+        self,
+        x: TensorHandle,
+        min: TensorHandle,
+        max: TensorHandle,
+    ) -> TensorHandle:
+        x_data = np.asarray(x.data) if x.data is not None else None
+        lo = np.asarray(min.data) if min.data is not None else None
+        hi = np.asarray(max.data) if max.data is not None else None
+        result = (
+            np.minimum(np.maximum(x_data, lo), hi)  # raise to lo first, then cap at hi
+            if (x_data is not None and lo is not None and hi is not None)
+            else None  # any operand without data yields a handle without data
+        )
+        return TensorHandle(
+            id=self._next_id(),
+            addr=0, shape=x.shape, dtype=x.dtype,  # metadata is copied from ``x`` only
+            nbytes=int(np.prod(x.shape)) * 2 if x.shape else 0,  # NOTE(review): 2 bytes/element whatever x.dtype is
+            data=result, space="tcm",
+        )
+
+    def softmax(self, x: TensorHandle, axis: int = -1) -> TensorHandle:
+        x_data = np.asarray(x.data) if x.data is not None else None
+        if x_data is None:
+            result = None  # no input data: handle without data
+        else:
+            x_max = np.max(x_data, axis=axis, keepdims=True)
+            e = np.exp(x_data - x_max)  # the per-axis max is subtracted before exp
+            s = np.sum(e, axis=axis, keepdims=True)
+            result = e / s
+        return TensorHandle(
+            id=self._next_id(),
+            addr=0, shape=x.shape, dtype=x.dtype,
+            nbytes=int(np.prod(x.shape)) * 2 if x.shape else 0,  # NOTE(review): 2 bytes/element whatever x.dtype is
+            data=result, space="tcm",
+        )
+
+    @staticmethod
+    def cdiv(a: int, b: int) -> int:
+        return -(-int(a) // int(b))  # ceiling division via negated floor division
+
+    def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
+        x_data = np.asarray(x.data) if x.data is not None else None
+        if x_data is None:
+            result = None  # no input data: no result data, op is not even checked
+        elif op == "exp":
+            result = np.exp(x_data)
+        elif op == "log":
+            result = np.log(x_data)
+        elif op == "sqrt":
+            result = np.sqrt(x_data)
+        elif op == "abs":
+            result = np.abs(x_data)
+        elif op == "sigmoid":
+            result = 1.0 / (1.0 + np.exp(-x_data))
+        elif op == "cos":
+            result = np.cos(x_data)
+        elif op == "sin":
+            result = np.sin(x_data)
+        else:
+            raise NotImplementedError(f"mock _unary_math: op {op!r} not implemented")
+        return TensorHandle(
+            id=self._next_id(),
+            addr=0, shape=x.shape, dtype=x.dtype,
+            nbytes=int(np.prod(x.shape)) * 2 if x.shape else 0,  # NOTE(review): 2 bytes/element whatever x.dtype is
+            data=result, space="tcm",
+        )
+
+    def load(self, ptr: int, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
+        """Read the array stored at HBM ``ptr``; an unwritten address yields float16 zeros."""
+        stored = self._state._hbm.get(ptr)
+        contents = np.zeros(shape, dtype=np.float16) if stored is None else stored
+        return TensorHandle(
+            id=f"load_{ptr}", addr=ptr, shape=shape, dtype=dtype,
+            nbytes=int(np.prod(shape)) * 2, data=contents, space="hbm",
+        )
+
+    def store(self, ptr: int, handle: TensorHandle) -> None:
+        if handle.data is not None:  # data-less handles leave the mock HBM untouched
+            self._state._hbm[ptr] = np.asarray(handle.data)
+            if ptr == self._state._slice_addr:  # a store to the slice address also becomes ``output``
+                self._state.output = self._state._hbm[ptr]
+
+    # IPCQ
+    def send(
+        self,
+        dir: str,
+        src: TensorHandle | None = None,
+        *,
+        src_addr: int | None = None,
+        nbytes: int | None = None,
+        shape: tuple[int, ...] | None = None,
+        dtype: str = "f16",
+        space: str = "tcm",
+    ) -> None:
+        """Queue a copy of the source array on the neighbor reached via ``dir``.
+
+        The payload is ``src.data`` if set, else the array stored in this
+        rank's local memory at ``src.addr`` (or, with no handle, at
+        ``src_addr`` in ``space``). ``nbytes``/``shape``/``dtype`` are unused.
+
+        Raises:
+            IpcqInvalidDirection: ``dir`` is not one of this rank's neighbors.
+            RuntimeError: no payload found, or the peer has no link back here.
+        """
+        state = self._state
+        if dir not in state.neighbors:
+            raise IpcqInvalidDirection(
+                f"mock tl.send: direction {dir!r} not in neighbors {list(state.neighbors)}"
+            )
+        if src is not None and src.data is not None:
+            data = np.asarray(src.data)
+        elif src is not None or src_addr is not None:
+            mem_space, addr = (src.space, src.addr) if src is not None else (space, src_addr)
+            stored = (state._hbm if mem_space == "hbm" else state._tcm).get(addr)
+            if stored is None:
+                raise RuntimeError(f"mock tl.send: no data at {mem_space}:0x{addr:x}")
+            data = np.asarray(stored)
+        else:
+            raise RuntimeError("mock tl.send: src is None")
+        peer_rank = state.neighbors[dir]
+        peer_state = self._scheduler.states[peer_rank]
+        # Several peer directions may point back here (2-rank ring, 2x2 torus):
+        # prefer the opposite of ``dir`` so an "E" send lands in the peer's "W" queue.
+        back = [d for d, target in peer_state.neighbors.items() if target == state.rank]
+        if not back:
+            raise RuntimeError(f"mock tl.send: peer rank {peer_rank} has no reverse direction")
+        opposite = {"E": "W", "W": "E", "N": "S", "S": "N"}.get(dir)
+        reverse_dir = opposite if opposite in back else back[0]
+        peer_state.recv_q[reverse_dir].append(data.copy())
+        self._scheduler.yield_()  # hand control back so the receiver can wake up
+
+    def recv_async(
+        self,
+        dir: str,
+        shape: tuple[int, ...] = (),
+        dtype: str = "f16",
+    ) -> dict:
+        """Create a receive future for ``dir`` without blocking; redeem it with tl.wait."""
+        if dir in self._state.neighbors:
+            return dict(_kind="recv_future", dir=dir, shape=shape, dtype=dtype)
+        raise IpcqInvalidDirection(
+            f"mock tl.recv_async: direction {dir!r} not in neighbors"
+        )
+
+    def wait(self, future: Any) -> TensorHandle:
+        """Yield to the scheduler until the future's direction has data, then pop it."""
+        is_future = isinstance(future, dict) and future.get("_kind") == "recv_future"
+        if not is_future:
+            raise TypeError("tl.wait: expected recv future from tl.recv_async")
+        direction = future["dir"]
+        while not self._state.recv_q[direction]:
+            self._scheduler.yield_()
+        return self._make_handle(self._state.recv_q[direction].popleft(), direction, future["dtype"])
+
+    def recv(
+        self,
+        dir: str | None = None,
+        shape: tuple[int, ...] = (),
+        dtype: str = "f16",
+    ) -> TensorHandle:
+        """Blocking receive.
+
+        With ``dir`` set, waits on that neighbor's queue; with ``dir=None`` it
+        takes from the first non-empty queue in neighbor-map order.
+        """
+        state = self._state
+        if dir is not None and dir not in state.neighbors:
+            raise IpcqInvalidDirection(
+                f"mock tl.recv: direction {dir!r} not in neighbors {list(state.neighbors)}"
+            )
+        while True:
+            # Scan every neighbor when no direction was requested.
+            candidates = state.neighbors if dir is None else (dir,)
+            ready = next((d for d in candidates if state.recv_q[d]), None)
+            if ready is not None:
+                return self._make_handle(state.recv_q[ready].popleft(), ready, dtype)
+            # Nothing queued yet: let the other ranks run, then look again.
+            self._scheduler.yield_()
+
+    def _make_handle(self, data: np.ndarray, direction: str, dtype: str) -> TensorHandle:
+        """Wrap a received array in a "tcm"-space handle with id ``recv_<direction>``."""
+        return TensorHandle(
+            id=f"recv_{direction}", addr=0, space="tcm",
+            data=data, shape=data.shape, dtype=dtype, nbytes=int(data.nbytes),
+        )
+
+
+# ── Cooperative scheduler ────────────────────────────────────────────
+
+
+class _MockScheduler:
+    """Round-robin cooperative scheduler over rank greenlets."""
+
+    def __init__(self, states: list[_MockRankState]) -> None:
+        self.states = states
+        self._parent: greenlet | None = None
+        self._cur_idx = 0
+
+    def yield_(self) -> None:
+        """Called from inside a rank greenlet to give other ranks a turn."""
+        assert self._parent is not None
+        self._parent.switch()
+
+    def run(self, kernel_fn: Callable, kernel_args: tuple) -> list[np.ndarray]:
+        """Run ``kernel_fn`` once per rank (one greenlet each); return per-rank outputs.
+
+        Raises RuntimeError when no rank finishes for ``max_rounds`` rounds in a row.
+        """
+        from kernbench.triton_emu.tl_context import TLContext
+
+        self._parent = greenlet.getcurrent()
+
+        # Per-rank tl shim
+        tls: dict[int, _MockTL] = {}
+
+        def _spawn(rank_idx: int) -> greenlet:
+            state = self.states[rank_idx]
+            tl = _MockTL(state, self)
+            tls[rank_idx] = tl
+
+            def _entry():
+                # Activate this rank's tl for TensorHandle operator overloads
+                TLContext._set_active(tl)  # type: ignore[attr-defined]
+                try:
+                    kernel_fn(state.t_ptr, *kernel_args, tl=tl)
+                finally:
+                    TLContext._set_active(None)  # type: ignore[attr-defined]
+
+            return greenlet(_entry)
+
+        for state in self.states:
+            state.g = _spawn(state.rank)
+
+        # Drive each rank round-robin until all are dead; detect global deadlock.
+        max_rounds = 10_000
+        idle_rounds = 0  # consecutive rounds in which no greenlet finished
+        while any(s.g is not None and not s.g.dead for s in self.states):
+            progressed = False
+            for s in self.states:
+                if s.g is None or s.g.dead:
+                    continue
+                # Multi-rank greenlets share TLContext active state via the
+                # module-level thread-local; restore this rank's tl before
+                # resuming so TensorHandle operator overloads dispatch to
+                # the right _MockTL.
+                TLContext._set_active(tls[s.rank])  # type: ignore[attr-defined]
+                s.g.switch()
+                if s.g.dead:
+                    progressed = True
+            TLContext._set_active(None)  # type: ignore[attr-defined]
+            # Loose progress check: only a finishing greenlet counts. The idle
+            # streak restarts then, so earlier rounds never count against it.
+            idle_rounds = 0 if progressed else idle_rounds + 1
+            if idle_rounds > max_rounds:
+                raise RuntimeError(
+                    "mock CCL runtime: deadlock detected (no progress for "
+                    f"{max_rounds} rounds)"
+                )
+
+        return [
+            s.output if s.output is not None else s._hbm.get(s._slice_addr)
+            for s in self.states
+        ]
+
+
+# ── Public entry ────────────────────────────────────────────────────
+
+
+def run_kernel_in_mock(
+    kernel_fn: Callable,
+    world_size: int,
+    topology: str,
+    inputs: list[np.ndarray],
+    kernel_args: tuple = (),
+    algo_module: Any | None = None,
+) -> list[np.ndarray]:
+    """Execute a CCL kernel on the greenlet-based mock runtime (no SimPy/fabric).
+
+    Args:
+        kernel_fn: callable invoked as ``kernel(t_ptr, *kernel_args, tl=...)``.
+        world_size: number of ranks to simulate.
+        topology: builtin topology name (e.g. "ring_1d").
+        inputs: one ndarray per rank; ``inputs[r]`` is handed to rank r's
+            state as ``input_arr``.
+        kernel_args: extra positional arguments placed after ``t_ptr``.
+        algo_module: optional module whose ``neighbors()`` overrides the
+            builtin map.
+
+    Returns:
+        One entry per rank, as produced by the scheduler's ``run``.
+
+    Raises:
+        ValueError: ``len(inputs)`` differs from ``world_size``.
+    """
+    n_inputs = len(inputs)
+    if n_inputs != world_size:
+        raise ValueError(f"len(inputs)={n_inputs} != world_size={world_size}")
+
+    neighbors_of = resolve_topology(topology, algo_module=algo_module)
+    rank_states = []
+    for rank, tile in enumerate(inputs):
+        neighbor_map = neighbors_of(rank, world_size)
+        rank_states.append(_MockRankState(
+            rank=rank, world_size=world_size, neighbors=neighbor_map, input_arr=tile,
+        ))
+    return _MockScheduler(rank_states).run(kernel_fn, kernel_args)
diff --git a/src/kernbench/ccl/topologies.py b/src/kernbench/ccl/topologies.py
new file mode 100644
index 0000000..c2ed1f4
--- /dev/null
+++ b/src/kernbench/ccl/topologies.py
@@ -0,0 +1,128 @@
+"""Builtin neighbor topology generators for CCL backend (ADR-0023 D11).
+
+Each generator takes ``(rank, world_size)`` and returns a
+``dict[direction, peer_rank]`` for that rank. ``direction`` is one of
+``"N" | "S" | "E" | "W"`` for ring/mesh, or
+``"parent" | "child_left" | "child_right"`` for tree topologies.
+
+Algorithm modules may override the generated map by defining a
+``neighbors(rank, world_size, neighbor_map) -> dict | None`` function in
+the same module (see D11 / D15). ``resolve_topology`` wires these together.
+"""
+from __future__ import annotations
+
+from typing import Any, Callable
+
+NeighborMap = dict[str, int]  # direction name -> peer rank
+TopologyFn = Callable[[int, int], NeighborMap]  # (rank, world_size) -> NeighborMap
+
+
+# ── Builtin generators ───────────────────────────────────────────────
+
+
+def ring_1d(rank: int, world_size: int) -> NeighborMap:
+    """Bidirectional 1D ring: "E" is the next rank, "W" the previous (wrapping)."""
+    offsets = (("E", 1), ("W", -1))
+    return {
+        direction: (rank + step) % world_size for direction, step in offsets
+    }
+
+
+def ring_1d_unidir(rank: int, world_size: int) -> NeighborMap:
+    """Unidirectional 1D ring: each rank has only an "E" link to the next rank (wrapping)."""
+    return dict(E=(rank + 1) % world_size)
+
+
+def mesh_2d(rank: int, world_size: int) -> NeighborMap:
+    """Square 2D mesh (N/S/E/W) with wrap-around (torus) on all four edges.
+
+    Layout: rank = row * side + col, with side = sqrt(world_size).
+
+    Raises:
+        ValueError: ``world_size`` is not a positive perfect square.
+    """
+    import math  # local import keeps this block self-contained
+    side = math.isqrt(int(world_size)) if world_size > 0 else 0  # exact integer sqrt
+    if side == 0 or side * side != world_size:
+        raise ValueError(
+            f"mesh_2d requires square world_size, got {world_size}"
+        )
+    r, c = divmod(rank, side)
+    north, south = ((r - 1) % side) * side + c, ((r + 1) % side) * side + c
+    west, east = r * side + (c - 1) % side, r * side + (c + 1) % side
+    return {"N": north, "S": south, "W": west, "E": east}
+
+
+def tree_binary(rank: int, world_size: int) -> NeighborMap:
+    """Neighbors of ``rank`` in a binary tree rooted at rank 0.
+
+    Rank r's parent is (r-1)//2 (absent for the root) and its children are
+    2r+1 and 2r+2, each included only when below ``world_size``. Keys, in
+    insertion order: "parent", "child_left", "child_right".
+    """
+    left = 2 * rank + 1
+    candidates = (
+        ("parent", (rank - 1) // 2, rank > 0),
+        ("child_left", left, left < world_size),
+        ("child_right", left + 1, left + 1 < world_size),
+    )
+    return {
+        name: peer
+        for name, peer, present in candidates
+        if present
+    }
+
+
+def none(rank: int, world_size: int) -> NeighborMap:
+    """No builtin neighbors; the algorithm's ``neighbors()`` hook must supply the map."""
+    return dict()
+
+
+_BUILTIN: dict[str, TopologyFn] = {  # topology name accepted by resolve_topology -> generator
+    "ring_1d": ring_1d,
+    "ring_1d_unidir": ring_1d_unidir,
+    "mesh_2d": mesh_2d,
+    "tree_binary": tree_binary,
+    "none": none,
+}
+
+
+# ── Resolution ───────────────────────────────────────────────────────
+
+
+def resolve_topology(
+    name: str, algo_module: Any | None = None,
+) -> TopologyFn:
+    """Build the neighbor function for topology ``name``.
+
+    Args:
+        name: builtin topology name from ccl.yaml; one of ``ring_1d``,
+              ``ring_1d_unidir``, ``mesh_2d``, ``tree_binary`` or ``none``.
+        algo_module: optional algorithm module. A callable ``neighbors``
+              attribute on it is called as
+              ``neighbors(rank, world_size, neighbor_map)`` with the
+              builtin's result; a None return keeps the builtin map, any
+              other return value replaces it.
+
+    Returns:
+        A callable ``(rank, world_size) -> NeighborMap``.
+
+    Raises:
+        ValueError: if ``name`` is not a known builtin.
+    """
+    builtin_fn = _BUILTIN.get(name)
+    if builtin_fn is None:
+        raise ValueError(
+            f"Unknown topology '{name}'. "
+            f"Available builtins: {list(_BUILTIN)}"
+        )
+    hook = getattr(algo_module, "neighbors", None) if algo_module else None
+    if not callable(hook):
+        return builtin_fn
+
+    def _wrapped(rank: int, world_size: int) -> NeighborMap:
+        generated = builtin_fn(rank, world_size)
+        replacement = hook(rank, world_size, generated)
+        return generated if replacement is None else replacement
+
+    return _wrapped
diff --git a/src/kernbench/common/ipcq_types.py b/src/kernbench/common/ipcq_types.py
new file mode 100644
index 0000000..0deb789
--- /dev/null
+++ b/src/kernbench/common/ipcq_types.py
@@ -0,0 +1,234 @@
+"""IPCQ schemas and exceptions (ADR-0023 D2.5, D12, D14 F1).
+
+This module contains the data structures and exceptions used by the
+PE-level IPCQ collective communication infrastructure. The host-facing
+sideband fan-out message ``IpcqInitMsg`` lives in
+``kernbench.runtime_api.kernel`` (alongside other fabric messages),
+while all internal token / metadata / command schemas are kept here.
+
+Layering:
+    PE_CPU       --IpcqRequest(IpcqSendCmd|IpcqRecvCmd)--> PE_IPCQ
+    PE_IPCQ      --IpcqDmaToken-->                         PE_DMA (vc_comm)
+    PE_DMA       --IpcqMetaArrival-->                      PE_IPCQ (atomic, D9)
+    PE_IPCQ      --IpcqCreditMetadata-->                   peer PE_IPCQ (fast path, D9)
+
+See ADR-0023 for the full design.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Union
+
+if TYPE_CHECKING:
+    import simpy
+
+
+# ── D14 F1: invalid direction exception ──────────────────────────────
+
+
+class IpcqInvalidDirection(ValueError):
+    """Raised (as a ValueError subclass) when a kernel passes tl.send /
+    tl.recv a direction that has no neighbor installed for this PE."""
+
+
+# ── D2.5: IpcqEndpoint ───────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class IpcqEndpoint:
+    """Everything the sender needs to compute the peer's rx_buffer address (D2.5).
+
+    Sender PE_IPCQ uses this to compute the destination PA for its DMA
+    write into the peer's rx ring buffer slot:
+
+        slot_idx = sender.my_head % peer.n_slots
+        dst_pa   = peer.rx_base_pa + slot_idx * peer.slot_size
+    """
+
+    sip: int                     # destination SIP
+    cube: int                    # destination cube
+    pe: int                      # destination PE (cube-local index)
+    buffer_kind: str             # "tcm" | "hbm" | "sram"
+    rx_base_pa: int              # peer rx_buffer base PA (PhysAddr.encode())
+    rx_base_va: int              # peer rx_buffer base VA (optional, MMU)
+    n_slots: int                 # peer ring depth (wrap-around modulo)
+    slot_size: int               # peer slot size (offset multiplier)
+
+
+# ── D12: IpcqInitEntry (used by IpcqInitMsg in kernel.py) ────────────
+
+
+@dataclass(frozen=True)
+class IpcqInitEntry:
+    """One direction's neighbor entry that the backend installs into a PE_IPCQ
+    via IpcqInitMsg (kernbench.runtime_api.kernel.IpcqInitMsg, D12).
+    """
+
+    direction: str               # "N" | "S" | "E" | "W" (tree names such as "parent"? — TODO confirm)
+    peer: IpcqEndpoint           # see D2.5
+    my_rx_base_pa: int           # this PE's own rx_buffer base PA
+    my_rx_base_va: int           # this PE's own rx_buffer base VA (optional)
+    n_slots: int                 # this PE's ring depth
+    slot_size: int               # this PE's slot size
+    # Credit fast path channel (D9).
+    # Contract: must be a simpy.Store instance dedicated to receiving
+    # IpcqCreditMetadata objects only. Backend wires it once at init time
+    # and the receiving PE_IPCQ owns its consumer side; the sender (peer's
+    # PE_IPCQ) puts IpcqCreditMetadata directly into this store via
+    # _delayed_credit_send. Do not put any other object type.
+    peer_credit_store: "simpy.Store"
+
+
+# ── D12: IpcqSendCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
+
+
+@dataclass(frozen=True)
+class IpcqSendCmd:
+    """tl.send command issued by the kernel to PE_IPCQ (immutable)."""
+
+    direction: str               # "N" | "S" | "E" | "W" (tree names such as "parent"? — TODO confirm)
+    src_addr: int                # source data address (TCM/HBM/SRAM)
+    src_space: str               # "tcm" | "hbm" | "sram"
+    nbytes: int                  # payload size in bytes
+    shape: tuple[int, ...]       # data shape (op_log + MemoryStore use)
+    dtype: str                   # element type name; IpcqDmaToken defaults to "f16"
+    handle_id: str               # completion tracking
+    data_op: bool = True         # ADR-0020 op_log recording flag
+
+
+# ── D12: IpcqRecvCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
+
+
+@dataclass(frozen=True)
+class IpcqRecvCmd:
+    """tl.recv command issued by the kernel to PE_IPCQ.
+
+    Two modes (recv_mode):
+        "return_slot" — return slot address as-is (default, zero-copy).
+                        Kernel uses the slot memory directly.
+        "copy_to_dst" — copy slot data to dst_addr, then return.
+    """
+
+    direction: str | None        # None → round-robin (weak fairness, D4)
+    shape: tuple[int, ...]
+    dtype: str
+    handle_id: str
+    recv_mode: str = "return_slot"
+    dst_addr: int = 0            # used only when recv_mode == "copy_to_dst"
+    dst_space: str = ""          # used only when recv_mode == "copy_to_dst"
+    blocking: bool = True        # presumably False for tl.recv_async — TODO confirm
+    data_op: bool = True         # presumably the ADR-0020 op_log flag, as on IpcqSendCmd — TODO confirm
+
+
+# ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ───────────────────
+
+
+@dataclass
+class IpcqDmaToken:
+    """Token sent from PE_IPCQ to PE_DMA (vc_comm channel) carrying both
+    the data move request and the piggyback metadata (ADR-0023 D9).
+
+    Receiving PE_DMA processes this atomically (I6 MUST):
+        1. MemoryStore.write(dst_endpoint.buffer_kind, dst_addr, data)
+        2. Forward IpcqMetaArrival(token=self) to peer PE_IPCQ
+    No yield is allowed between the two steps.
+
+    The ``data`` field is a snapshot taken by the sender's PE_DMA at the
+    moment the send is issued. This preserves "in-flight data" semantics:
+    if the sender mutates its source memory after issuing the send but
+    before arrival, the receiver still gets the snapshot. The snapshot is
+    None for control-only tokens (e.g. credit-only updates).
+    """
+
+    # ── Data movement (single-hop DMA write) ──
+    src_addr: int
+    src_space: str
+    dst_addr: int                # already-computed peer rx slot PA
+    dst_endpoint: IpcqEndpoint   # routing target (sip/cube/pe) + buffer_kind
+    nbytes: int
+    handle_id: str               # completion notify back to sender PE_IPCQ
+    # Optional shape/dtype carried for op_log + MemoryStore convenience.
+    shape: tuple[int, ...] = ()
+    dtype: str = "f16"
+    # In-flight snapshot; assigned by the sender PE_DMA after construction (class is not frozen).
+    data: Any = None
+
+    # ── Piggyback metadata (D9) ──
+    sender_seq: int = 0          # monotonic; receiver updates peer_head_cache
+    src_sip: int = 0
+    src_cube: int = 0
+    src_pe: int = 0
+    src_direction: str = "E"     # sender-side direction; receiver maps to its own
+
+    data_op: bool = True         # presumably the ADR-0020 op_log flag — TODO confirm
+
+
+# ── D12: IpcqMetaArrival (PE_DMA → PE_IPCQ, intra-PE wire) ──────────
+
+
+@dataclass
+class IpcqMetaArrival:
+    """Posted by receiving PE_DMA into the destination PE's PE_IPCQ inbox
+    in the same SimPy step as the MemoryStore.write (D9, I6 MUST).
+
+    The receiver PE_IPCQ uses ``token.sender_seq`` to update its
+    peer_head_cache for the corresponding direction.
+    """
+
+    token: IpcqDmaToken          # the IpcqDmaToken whose data was just written
+
+
+# ── D12: IpcqCreditMetadata (PE_IPCQ → peer PE_IPCQ, fast path) ─────
+
+
+@dataclass(frozen=True)
+class IpcqCreditMetadata:
+    """Credit return from the recv side to the send side over the fast path (D9).
+
+    Sent by ``PeIpcqComponent._delayed_credit_send`` after a
+    bottleneck-BW based latency, putting the metadata directly into
+    the peer's pre-wired credit store (no fabric routing).
+    """
+
+    consumer_seq: int            # recv side's my_tail (the new tail value)
+    src_sip: int                 # which peer is sending the credit
+    src_cube: int                # cube of the PE sending the credit
+    src_pe: int                  # PE index of the PE sending the credit
+    src_direction: str           # sender-side direction (peer maps to its own)
+
+
+# ── Request wrapper (PE_CPU → PE_IPCQ) ───────────────────────────────
+
+
+@dataclass
+class IpcqRequest:
+    """Wrapper carrying an IpcqSendCmd or IpcqRecvCmd plus a SimPy completion
+    event. Posted by PE_CPU into PE_IPCQ's inbox; PE_IPCQ calls
+    ``done.succeed()`` when the request is fully processed.
+
+    For recv requests, the result (slot address, direction, dtype, shape)
+    is written into ``result_data`` so the caller can read it after wait.
+    """
+
+    command: "IpcqSendCmd | IpcqRecvCmd"
+    done: "simpy.Event"          # succeeded by PE_IPCQ once the request is processed
+    result_data: dict[str, Any] = field(default_factory=dict)  # fresh dict per request
+
+
+# ── RecvFuture (kernel ↔ runner handshake for tl.recv_async / tl.wait) ─
+
+
+@dataclass
+class RecvFuture:
+    """Opaque future returned by ``tl.recv_async``.
+
+    The KernelRunner attaches a SimPy event and the IpcqRequest in the
+    background; ``tl.wait(future)`` switches back to the runner which
+    yields on the event and resolves the result into a TensorHandle.
+    """
+
+    cmd: "IpcqRecvCmd"
+    request: Any = None         # IpcqRequest (set by runner)
+    event: Any = None           # simpy.Event (set by runner)
+    resolved: bool = False      # presumably True once ``result`` is cached — TODO confirm
+    result: Any = None          # cached TensorHandle after wait()
diff --git a/src/kernbench/common/pe_commands.py b/src/kernbench/common/pe_commands.py
index ed0dc8b..e70c367 100644
--- a/src/kernbench/common/pe_commands.py
+++ b/src/kernbench/common/pe_commands.py
@@ -33,6 +33,7 @@ class TensorHandle:
     dtype: str
     nbytes: int                      # total byte size
     data: object = None              # reserved for validate mode
+    space: str = "tcm"               # MemoryStore space ("tcm" | "hbm" | "sram")
 
 
 @dataclass(frozen=True)
diff --git a/src/kernbench/components/builtin/pe_cpu.py b/src/kernbench/components/builtin/pe_cpu.py
index 455b7de..d5f615a 100644
--- a/src/kernbench/components/builtin/pe_cpu.py
+++ b/src/kernbench/components/builtin/pe_cpu.py
@@ -42,9 +42,30 @@ class PeCpuComponent(ComponentBase):
             self._cube_idx = int(parts[1].replace("cube", ""))
         except (IndexError, ValueError):
             self._cube_idx = 0
-        # num_cubes from spec (for tl.program_id(axis=1))
+        # num_cubes from spec (for tl.program_id(axis=1) — ADR-0022)
         spec = ctx.spec if ctx else {}
-        self._num_cubes = spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 1)
+        cube_mesh = spec.get("sip", {}).get("cube_mesh", {})
+        if cube_mesh:
+            self._num_cubes = int(cube_mesh.get("w", 1)) * int(cube_mesh.get("h", 1))
+        else:
+            self._num_cubes = (
+                spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 1)
+            )
+        # PE-local scratch for kernel math output handles (ADR-0020 D3
+        # extension; reserved portion of TCM addressed via a synthetic
+        # MemoryStore key, not the real PA encoder).
+        pe_template = spec.get("cube", {}).get("pe_template", {})
+        tcm_attrs = pe_template.get("components", {}).get("pe_tcm", {}).get("attrs", {})
+        scratch_mb = float(tcm_attrs.get("kernel_scratch_mb", 1))
+        self._tl_scratch_size = int(scratch_mb * (1 << 20))
+        # PE-unique base address — high bit pattern to avoid collision with
+        # IPCQ ring buffers (which use bit 60).
+        self._tl_scratch_base = (
+            (1 << 61)
+            | (self._sip_idx << 40)
+            | (self._cube_idx << 32)
+            | (self._pe_idx << 24)
+        )
 
     def _find_shard(self, shards: tuple) -> Any:
         """Find shard matching this PE's (sip, cube, pe). Fallback to positional index."""
@@ -146,6 +167,8 @@ class PeCpuComponent(ComponentBase):
             scheduler_id=scheduler_id,
             out_ports=self.out_ports,
             store=store,
+            scratch_base=self._tl_scratch_base,
+            scratch_size=self._tl_scratch_size,
         )
         yield from runner.run(env, kernel_fn, kernel_args, num_programs)
         return getattr(runner, "_composite_results", [])
diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py
index 80edc96..eae866f 100644
--- a/src/kernbench/components/builtin/pe_dma.py
+++ b/src/kernbench/components/builtin/pe_dma.py
@@ -106,18 +106,131 @@ class PeDmaComponent(PeEngineBase):
         pe_txn.done.succeed()
 
     def _worker(self, env: simpy.Environment) -> Generator:
-        """Handle TileToken (pipeline), PeInternalTxn (legacy), and Transaction (fabric)."""
+        """Handle TileToken (pipeline), PeInternalTxn (legacy), IpcqDmaToken,
+        and Transaction (fabric)."""
+        from kernbench.common.ipcq_types import IpcqDmaToken
         from kernbench.common.pe_commands import PeInternalTxn
         from kernbench.components.builtin.pe_types import TileToken
 
         while True:
             msg: Any = yield self._inbox.get()
-            if isinstance(msg, TileToken):
+            if isinstance(msg, IpcqDmaToken):
+                # Outbound: IPCQ token from local PE_IPCQ → forward via fabric
+                env.process(self._handle_ipcq_outbound(env, msg))
+            elif isinstance(msg, TileToken):
                 env.process(self._pipeline_process(env, msg))
             elif isinstance(msg, PeInternalTxn):
                 env.process(self._handle_with_hooks(env, msg))
             else:
-                env.process(self._forward_txn(env, msg))
+                # Transaction (or unknown). May carry IpcqDmaToken inbound.
+                req = getattr(msg, "request", None)
+                if isinstance(req, IpcqDmaToken):
+                    env.process(self._handle_ipcq_inbound(env, msg))
+                else:
+                    env.process(self._forward_txn(env, msg))
+
+    # ── IPCQ outbound (PE_IPCQ → PE_DMA → fabric) ───────────────────
+
+    def _handle_ipcq_outbound(self, env: simpy.Environment, token: Any) -> Generator:
+        """Forward an IpcqDmaToken from the local PE_IPCQ through the fabric to
+        the peer PE_DMA. ADR-0023 D8 (vc_comm channel).
+
+        Best-effort: failures are logged and the token is dropped, never raised.
+        """
+        import logging
+        log = logging.getLogger(__name__)
+        if self.ctx is None:
+            return  # nothing to do
+        peer = token.dst_endpoint
+        peer_pe_dma = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}.pe_dma"
+
+        # Snapshot the source data at send time (D9 in-flight semantics), copying
+        # it so the receiver never sees stale or future data if the sender
+        # mutates src_addr between send issue and DMA arrival.
+        store = getattr(self.ctx, "memory_store", None)
+        if store is not None and token.data is None:
+            try:
+                snap = store.read(
+                    token.src_space, token.src_addr,
+                    shape=token.shape, dtype=token.dtype,
+                )
+                token.data = snap.copy() if hasattr(snap, "copy") else snap
+            except Exception:
+                log.debug("IPCQ outbound: source snapshot failed", exc_info=True)
+                token.data = None
+
+        # Record the IPCQ copy in op_log at OUTBOUND time (ADR-0020 D6: Phase 2
+        # replays copies in t_start order). Outbound time keeps the copy ahead
+        # of any later local op at the sender that might overwrite
+        # token.src_addr (e.g. a tl.store after a recv).
+        if self._op_logger is not None:
+            try:
+                self._op_logger.record_copy(
+                    t_start=float(env.now), t_end=float(env.now),
+                    component_id=self.node.id,
+                    src_space=token.src_space, src_addr=token.src_addr,
+                    dst_space=peer.buffer_kind, dst_addr=token.dst_addr,
+                    shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
+                )
+            except Exception:
+                log.debug("IPCQ outbound: op_log record_copy failed", exc_info=True)
+
+        try:
+            path = self.ctx.router.find_path(self._pe_prefix, peer_pe_dma)
+        except Exception:
+            log.warning("IPCQ outbound: no path to %s, token dropped", peer_pe_dma, exc_info=True)
+            return
+        drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)
+
+        sub_txn = Transaction(
+            request=token, path=path, step=0,
+            nbytes=token.nbytes, done=env.event(), drain_ns=drain_ns,
+        )
+        if len(path) > 1:
+            if path[1] not in self.out_ports:
+                log.warning("IPCQ outbound: no port to %s, token dropped", path[1])
+                return
+            yield self.out_ports[path[1]].put(sub_txn.advance())
+        # Fire-and-forget for vc_comm: the transaction's done event is not awaited.
+
+    # ── IPCQ inbound (fabric → PE_DMA → MemoryStore + PE_IPCQ) ──────
+
+    def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
+        """At destination PE_DMA: atomically write data and forward metadata.
+
+        I6 (MUST): no SimPy yield between MemoryStore.write and the
+        IpcqMetaArrival put into PE_IPCQ.
+        """
+        from kernbench.common.ipcq_types import IpcqMetaArrival
+
+        token = txn.request  # the IpcqDmaToken carried by the fabric transaction
+
+        # ── ATOMIC: do not introduce yield between these two operations ──
+        # 1. Move data via MemoryStore (single-hop DMA write).
+        # Prefer the in-flight snapshot stashed by the sender PE_DMA;
+        # fall back to a fresh read of src_addr if no snapshot is present
+        # (e.g. control-only token).
+        store = getattr(self.ctx, "memory_store", None) if self.ctx else None
+        if store is not None:
+            try:
+                data = token.data
+                if data is None:
+                    data = store.read(
+                        token.src_space, token.src_addr,
+                        shape=token.shape, dtype=token.dtype,
+                    )
+                store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)
+            except Exception:
+                pass  # NOTE(review): a failed read/write is swallowed, yet the arrival below is still posted
+
+        # 2. Forward IpcqMetaArrival to local PE_IPCQ
+        ipcq_id = f"{self._pe_prefix}.pe_ipcq"
+        if ipcq_id in self.out_ports:
+            yield self.out_ports[ipcq_id].put(IpcqMetaArrival(token=token))
+        # ─────────────────────────────────────────────────────────────────
+
+        if not txn.done.triggered:  # succeed the transaction's done event only if still pending
+            txn.done.succeed()
 
     def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
         """Pipeline mode: DMA read/write via fabric, then self-route."""
diff --git a/src/kernbench/components/builtin/pe_ipcq.py b/src/kernbench/components/builtin/pe_ipcq.py
new file mode 100644
index 0000000..710786a
--- /dev/null
+++ b/src/kernbench/components/builtin/pe_ipcq.py
@@ -0,0 +1,455 @@
+"""PE_IPCQ component (ADR-0023): per-PE IPCQ control plane.
+
+Responsibilities:
+    - Hold per-direction queue pair state (my_head, my_tail,
+      peer_head_cache, peer_tail_cache, ring buffer addresses)
+    - Process IpcqInitMsg from backend to install neighbor table
+    - Handle IpcqRequest(IpcqSendCmd) from PE_CPU:
+        compute peer slot address, check backpressure, forward
+        IpcqDmaToken to PE_DMA (vc_comm)
+    - Handle IpcqRequest(IpcqRecvCmd) from PE_CPU:
+        wait for data arrival, return slot address (or copy to dst),
+        send fast-path credit return
+    - Handle IpcqMetaArrival from PE_DMA: update peer_head_cache, wake recv
+    - Handle IpcqCreditMetadata via own credit_inbox: update peer_tail_cache,
+      wake send
+
+PE_IPCQ does NOT move data — it forwards IpcqDmaToken to PE_DMA which
+performs the actual fabric DMA.
+
+Credit return uses a fast path: PE_IPCQ creates a SimPy process with a
+bottleneck-BW based latency, then puts IpcqCreditMetadata directly into
+the peer's pre-wired credit_store.
+"""
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.common.ipcq_types import (
+    IpcqCreditMetadata,
+    IpcqDmaToken,
+    IpcqInvalidDirection,
+    IpcqMetaArrival,
+    IpcqRecvCmd,
+    IpcqRequest,
+    IpcqSendCmd,
+)
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.runtime_api.kernel import IpcqInitMsg
+    from kernbench.topology.types import Node
+
+
+# Canonical direction order. _install_neighbors filters this tuple down to
+# the installed directions to build the round-robin scan list, so the scan
+# order does not depend on the order in which entries were installed.
+_DIR_ORDER: tuple[str, ...] = ("N", "S", "E", "W", "parent", "child_left", "child_right")
+
+
+class PeIpcqComponent(ComponentBase):
+    """PE_IPCQ: ring buffer pointer + neighbor management for CCL.
+
+    Owned by one PE; talks to PE_DMA via out_ports[<pe_dma_id>] and
+    receives credit return metadata via the public ``credit_inbox``
+    SimPy Store (wired by backend at IpcqInitMsg installation time).
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        """Create an uninstalled PE_IPCQ with empty queue-pair state.
+
+        The (sip, cube, pe) identity is parsed from ``node.id``
+        (e.g. "sip0.cube0.pe0.pe_ipcq"); any segment that is missing or not
+        numeric falls back to 0.
+        """
+        super().__init__(node, ctx)
+        # Strict shape/dtype validation (D14 F2). Off by default.
+        self._strict: bool = bool(node.attrs.get("strict_validation", False))
+        # direction → tokens received so far (strict mode peeks the oldest)
+        self._arrived_tokens: dict[str, list] = {}
+
+        # Everything before the last dot is the owning PE's prefix.
+        self._pe_prefix: str = node.id.rsplit(".", 1)[0]
+        segments = self._pe_prefix.split(".")
+
+        def _coord(position: int, tag: str) -> int:
+            # Strip the textual tag and parse what remains as the index.
+            try:
+                return int(segments[position].replace(tag, ""))
+            except (IndexError, ValueError):
+                return 0
+
+        self._self_sip = _coord(0, "sip")
+        self._self_cube = _coord(1, "cube")
+        self._self_pe = _coord(2, "pe")
+
+        self._dma_node_id = f"{self._pe_prefix}.pe_dma"
+        # direction → state dict (shape defined by _install_neighbors)
+        self._queue_pairs: dict[str, dict[str, Any]] = {}
+        self._installed = False
+        self._buffer_kind: str = "tcm"
+        self._backpressure_mode: str = "sleep"
+        self._credit_size_bytes: int = 16
+        # Blocked receivers: per direction, plus the any-direction list
+        self._recv_waiters: dict[str, list[simpy.Event]] = {}
+        self._any_recv_waiters: list[simpy.Event] = []
+        # Blocked senders (ring full), per direction
+        self._send_waiters: dict[str, list[simpy.Event]] = {}
+        # Round-robin scan list over installed directions and its cursor
+        self._rr_dirs: list[str] = []
+        self._rr_cursor: int = 0
+        # Created in start(), once a SimPy environment is available
+        self._credit_inbox: simpy.Store | None = None
+
+    # ── Public ──
+
+    @property
+    def credit_inbox(self) -> simpy.Store:
+        """Inbox for credit-return metadata (D9 fast path).
+
+        Backend wires this SimPy Store as ``peer_credit_store`` on every
+        remote sender targeting this PE. Only available after ``start()``.
+        """
+        inbox = self._credit_inbox
+        assert inbox is not None, "PE_IPCQ not started yet"
+        return inbox
+
+    @property
+    def queue_pairs(self) -> dict[str, dict[str, Any]]:
+        """Test/debug accessor for the direction → queue-pair state mapping.
+
+        Returns the internal dict itself (not a copy), so any mutation by
+        the caller is visible to the component.
+        """
+        return self._queue_pairs
+
+    # ── Lifecycle ──
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Yield a single zero-delay timeout; ``nbytes`` is not used."""
+        yield env.timeout(0)
+
+    def start(self, env: simpy.Environment) -> None:
+        """Create the stores this component needs and spawn its processes.
+
+        Ensures ``credit_inbox`` exists, installs a private inbox when no
+        in_ports are wired, delegates to the base-class ``start`` and then
+        starts the credit worker process.
+        """
+        # Create credit_inbox even if there are no in_ports yet
+        if self._credit_inbox is None:
+            self._credit_inbox = simpy.Store(env)
+        # If no in_ports were wired (e.g. unit test), still spin up workers
+        if not self.in_ports:
+            self._inbox = simpy.Store(env)
+        super().start(env)
+        # Credits arrive on their own Store, so they get a dedicated process.
+        env.process(self._credit_worker(env))
+
+    # ── Worker (override of ComponentBase._worker) ──
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Main inbox loop: classify each message and dispatch it.
+
+        Init messages and metadata arrivals are handled inline; requests
+        and unrecognized messages each get their own SimPy process.
+        """
+        from kernbench.runtime_api.kernel import IpcqInitMsg
+
+        while True:
+            msg: Any = yield self._inbox.get()
+
+            # An IpcqInitMsg may come wrapped in a transaction (as its
+            # ``request``) or bare.
+            wrapped = getattr(msg, "request", None)
+            if isinstance(wrapped, IpcqInitMsg):
+                self._install_neighbors(wrapped)
+                done = getattr(msg, "done", None)
+                if done is not None and not done.triggered:
+                    done.succeed()
+            elif isinstance(msg, IpcqInitMsg):
+                self._install_neighbors(msg)
+            elif isinstance(msg, IpcqMetaArrival):
+                self._handle_meta_arrival(msg)
+            elif isinstance(msg, IpcqRequest):
+                env.process(self._handle_request(env, msg))
+            else:
+                # Unknown message — hand it to the base-class forwarder
+                env.process(self._forward_txn(env, msg))
+
+    # ── Init ──
+
+    def _install_neighbors(self, msg: IpcqInitMsg) -> None:
+        """Install (or overwrite) per-direction queue-pair state from ``msg``.
+
+        Copies the global settings (buffer kind, backpressure mode, credit
+        size), creates a fresh zeroed state dict for every entry, makes
+        sure waiter lists exist for that direction and rebuilds the
+        round-robin scan list.
+        """
+        self._installed = True
+        self._buffer_kind = msg.buffer_kind
+        self._backpressure_mode = msg.backpressure_mode
+        self._credit_size_bytes = msg.credit_size_bytes
+
+        for entry in msg.entries:
+            direction = entry.direction
+            self._queue_pairs[direction] = dict(
+                peer=entry.peer,
+                my_rx_base_pa=entry.my_rx_base_pa,
+                my_rx_base_va=entry.my_rx_base_va,
+                n_slots=entry.n_slots,
+                slot_size=entry.slot_size,
+                peer_credit_store=entry.peer_credit_store,
+                my_head=0,
+                my_tail=0,
+                peer_head_cache=0,
+                peer_tail_cache=0,
+            )
+            # Existing waiter lists are kept; only missing ones are created.
+            for waiter_table in (self._recv_waiters, self._send_waiters):
+                waiter_table.setdefault(direction, [])
+
+        # Stable canonical scan order, restarted from the first direction
+        self._rr_dirs = list(filter(self._queue_pairs.__contains__, _DIR_ORDER))
+        self._rr_cursor = 0
+
+    # ── Send ──
+
+    def _handle_request(self, env: simpy.Environment, req: IpcqRequest) -> Generator:
+        """Route a request to the send or recv handler by command type.
+
+        A command that is neither IpcqSendCmd nor IpcqRecvCmd is ignored
+        (the generator finishes without touching ``req``).
+        """
+        command = req.command
+        if isinstance(command, IpcqSendCmd):
+            handler = self._handle_send
+        elif isinstance(command, IpcqRecvCmd):
+            handler = self._handle_recv
+        else:
+            return
+        yield from handler(env, req, command)
+
+    def _handle_send(
+        self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqSendCmd,
+    ) -> Generator:
+        """Reserve the next peer ring slot and forward a DMA token to PE_DMA.
+
+        Waits (on an event appended to ``_send_waiters``) while the ring
+        towards ``cmd.direction`` is full, i.e. while
+        ``my_head - peer_tail_cache >= peer.n_slots``. Once a slot is free
+        the token is built, handed to PE_DMA and ``req.done`` is triggered.
+
+        The sequence number is reserved (``my_head`` advanced) *before* the
+        yield on the PE_DMA port. Every request runs in its own SimPy
+        process, so advancing ``my_head`` only after that yield would let
+        two concurrent sends on the same direction read the same head and
+        target the same slot with the same ``sender_seq``.
+
+        Raises:
+            IpcqInvalidDirection: ``cmd.direction`` has no installed queue pair.
+        """
+        if cmd.direction not in self._queue_pairs:
+            raise IpcqInvalidDirection(
+                f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
+            )
+        qp = self._queue_pairs[cmd.direction]
+        peer = qp["peer"]
+
+        # Backpressure: wait while ring full
+        while (qp["my_head"] - qp["peer_tail_cache"]) >= peer.n_slots:
+            wait_event = env.event()
+            self._send_waiters[cmd.direction].append(wait_event)
+            yield wait_event
+
+        # Compute peer slot address for the sequence number we are claiming
+        seq = qp["my_head"]
+        slot_idx = seq % peer.n_slots
+        dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
+
+        token = IpcqDmaToken(
+            src_addr=cmd.src_addr,
+            src_space=cmd.src_space,
+            dst_addr=dst_pa,
+            dst_endpoint=peer,
+            nbytes=cmd.nbytes,
+            handle_id=cmd.handle_id,
+            shape=cmd.shape,
+            dtype=cmd.dtype,
+            sender_seq=seq,
+            src_sip=self._self_sip,
+            src_cube=self._self_cube,
+            src_pe=self._self_pe,
+            src_direction=cmd.direction,
+        )
+
+        # Claim the slot now: there is no yield between the backpressure
+        # check above and this update, so no other send can claim it too.
+        qp["my_head"] = seq + 1
+
+        # Forward to PE_DMA (vc_comm)
+        yield self.out_ports[self._dma_node_id].put(token)
+        # Diagnostics trace (D14)
+        from kernbench.ccl import diagnostics
+        if diagnostics.trace_enabled():
+            diagnostics.log_send(
+                t_ns=float(env.now), sender=self._pe_prefix,
+                direction=cmd.direction, nbytes=cmd.nbytes,
+                sender_seq=seq,
+            )
+        if not req.done.triggered:
+            req.done.succeed()
+
+    # ── Recv ──
+
+    def _handle_recv(
+        self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqRecvCmd,
+    ) -> Generator:
+        """Wait for data, describe the receiving slot in ``req.result_data``
+        and return a credit to the sender.
+
+        With ``cmd.direction`` set to None the direction is chosen by the
+        round-robin scan in ``_wait_any_direction``; otherwise the handler
+        waits until ``peer_head_cache`` exceeds ``my_tail`` for the given
+        direction. The slot address is derived from ``my_tail``. In
+        "copy_to_dst" mode the result is rebound to the destination and the
+        slot → dst copy is either performed now or recorded in the op_log.
+        Finally ``my_tail`` is advanced, a credit-return process is spawned
+        and ``req.done`` is triggered.
+
+        Raises:
+            IpcqInvalidDirection: the requested direction is not installed
+                (or, in any-direction mode, no neighbors are installed).
+            ValueError: strict validation is on and the arrived token's
+                dtype, shape or nbytes disagrees with the recv command.
+        """
+        if cmd.direction is None:
+            direction = yield from self._wait_any_direction(env)
+        else:
+            if cmd.direction not in self._queue_pairs:
+                raise IpcqInvalidDirection(
+                    f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
+                )
+            direction = cmd.direction
+            qp = self._queue_pairs[direction]
+            # Re-check after every wake-up: all waiters of a direction are
+            # woken together, and an earlier one may have consumed the slot.
+            while qp["peer_head_cache"] <= qp["my_tail"]:
+                wait_event = env.event()
+                self._recv_waiters[direction].append(wait_event)
+                yield wait_event
+
+        qp = self._queue_pairs[direction]
+        slot_idx = qp["my_tail"] % qp["n_slots"]
+        slot_addr = qp["my_rx_base_pa"] + slot_idx * qp["slot_size"]
+
+        # Strict validation (D14 F2): peek the next-arrived token's metadata
+        # against the recv command's expected shape/dtype/nbytes.
+        # The oldest token is consumed even when strict mode is off, which
+        # keeps the per-direction list in step with my_tail.
+        arrived = self._arrived_tokens.get(direction, [])
+        if arrived:
+            front = arrived.pop(0)
+            if self._strict:
+                expected_nbytes = self._nbytes_for(cmd.shape, cmd.dtype)
+                if front.dtype != cmd.dtype:
+                    raise ValueError(
+                        f"PE_IPCQ {self._pe_prefix} recv strict: dtype mismatch — "
+                        f"sender={front.dtype} recv={cmd.dtype}"
+                    )
+                if front.shape != cmd.shape:
+                    raise ValueError(
+                        f"PE_IPCQ {self._pe_prefix} recv strict: shape mismatch — "
+                        f"sender={front.shape} recv={cmd.shape}"
+                    )
+                if front.nbytes != expected_nbytes:
+                    raise ValueError(
+                        f"PE_IPCQ {self._pe_prefix} recv strict: nbytes mismatch — "
+                        f"sender={front.nbytes} recv={expected_nbytes}"
+                    )
+
+        # Default result: the data stays in the rx slot.
+        req.result_data["src_space"] = self._buffer_kind
+        req.result_data["src_addr"] = slot_addr
+        req.result_data["direction"] = direction
+        req.result_data["dtype"] = cmd.dtype
+        req.result_data["shape"] = cmd.shape
+        req.result_data["nbytes"] = self._nbytes_for(cmd.shape, cmd.dtype)
+
+        # copy_to_dst mode: rebind the result handle to (dst_space, dst_addr).
+        # When op_log is disabled, we also do the actual data move now;
+        # when op_log is enabled, Phase 2 replays the slot→dst copy from
+        # the op_log entry below so we don't pollute the slot in Phase 1.
+        if cmd.recv_mode == "copy_to_dst" and self.ctx is not None:
+            req.result_data["src_space"] = cmd.dst_space
+            req.result_data["src_addr"] = cmd.dst_addr
+            store = getattr(self.ctx, "memory_store", None)
+            if store is not None and self._op_logger is None:
+                # Best-effort copy: a failure here is swallowed silently.
+                try:
+                    data = store.read(self._buffer_kind, slot_addr, shape=cmd.shape, dtype=cmd.dtype)
+                    store.write(cmd.dst_space, cmd.dst_addr, data)
+                except Exception:
+                    pass
+            if self._op_logger is not None:
+                # Record slot → dst copy for Phase 2 replay (ADR-0023 D9.5).
+                try:
+                    self._op_logger.record_copy(
+                        t_start=float(env.now), t_end=float(env.now),
+                        component_id=self.node.id,
+                        src_space=self._buffer_kind, src_addr=slot_addr,
+                        dst_space=cmd.dst_space, dst_addr=cmd.dst_addr,
+                        shape=cmd.shape, dtype=cmd.dtype,
+                        nbytes=self._nbytes_for(cmd.shape, cmd.dtype),
+                    )
+                except Exception:
+                    pass
+
+        # Slot consumed: no yield has happened since the availability check.
+        qp["my_tail"] += 1
+
+        # Diagnostics trace (D14)
+        from kernbench.ccl import diagnostics
+        if diagnostics.trace_enabled():
+            diagnostics.log_recv(
+                t_ns=float(env.now), receiver=self._pe_prefix,
+                direction=direction,
+                nbytes=req.result_data.get("nbytes", 0),
+            )
+
+        # Fast path credit return — bottleneck BW based latency
+        env.process(
+            self._delayed_credit_send(env, direction, qp["peer_credit_store"], qp["my_tail"])
+        )
+
+        if not req.done.triggered:
+            req.done.succeed()
+
+    def _wait_any_direction(self, env: simpy.Environment) -> Generator:
+        """Pick the next direction that has unread data, round-robin.
+
+        Scans the installed directions starting at the cursor; the first
+        one whose ``peer_head_cache`` exceeds ``my_tail`` is returned and
+        the cursor moves just past it. If none has data, waits on an event
+        in ``_any_recv_waiters`` and scans again.
+
+        Raises:
+            IpcqInvalidDirection: no neighbors are installed.
+        """
+        if not self._rr_dirs:
+            raise IpcqInvalidDirection(
+                f"PE {self._pe_prefix}: no neighbors installed"
+            )
+        while True:
+            count = len(self._rr_dirs)
+            first = self._rr_cursor
+            for slot in ((first + step) % count for step in range(count)):
+                candidate = self._rr_dirs[slot]
+                state = self._queue_pairs[candidate]
+                if state["peer_head_cache"] > state["my_tail"]:
+                    self._rr_cursor = (slot + 1) % count
+                    return candidate
+            # Full lap without data — sleep until any arrival wakes us
+            pending = env.event()
+            self._any_recv_waiters.append(pending)
+            yield pending
+
+    # ── Metadata arrival from PE_DMA (D9) ──
+
+    def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
+        """Record a data arrival and wake receivers waiting for it.
+
+        The receiving direction is the first installed queue pair whose
+        peer (sip, cube, pe) equals the token's sender coordinates. For
+        that direction ``peer_head_cache`` is raised to ``sender_seq + 1``
+        (never lowered), the token is remembered for strict-mode checks and
+        both the per-direction and the any-direction waiters are woken.
+        """
+        token = msg.token
+        origin = (token.src_sip, token.src_cube, token.src_pe)
+        for direction, qp in self._queue_pairs.items():
+            peer = qp["peer"]
+            if (peer.sip, peer.cube, peer.pe) != origin:
+                continue
+
+            qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
+            # Keep the token for the strict-mode peek done by recv
+            self._arrived_tokens.setdefault(direction, []).append(token)
+
+            # Detach both waiter lists before waking, direction-specific first
+            blocked = self._recv_waiters.get(direction, [])
+            self._recv_waiters[direction] = []
+            blocked_any, self._any_recv_waiters = self._any_recv_waiters, []
+            for event in (*blocked, *blocked_any):
+                if not event.triggered:
+                    event.succeed()
+            return
+        # No installed peer matched the sender — the arrival is dropped.
+
+    # ── Credit return (fast path) ──
+
+    def _credit_worker(self, env: simpy.Environment) -> Generator:
+        """Consume IpcqCreditMetadata from ``credit_inbox`` forever.
+
+        Each credit is matched to the first installed queue pair whose peer
+        coordinates equal the credit's sender; that pair's
+        ``peer_tail_cache`` is raised to ``consumer_seq`` (never lowered)
+        and every send blocked on the direction is woken. Credits from an
+        unknown sender are ignored.
+        """
+        assert self._credit_inbox is not None
+        while True:
+            credit: IpcqCreditMetadata = yield self._credit_inbox.get()
+            origin = (credit.src_sip, credit.src_cube, credit.src_pe)
+            match = next(
+                (
+                    (direction, qp)
+                    for direction, qp in self._queue_pairs.items()
+                    if (qp["peer"].sip, qp["peer"].cube, qp["peer"].pe) == origin
+                ),
+                None,
+            )
+            if match is None:
+                continue
+            direction, qp = match
+            qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq)
+            # Detach the waiter list first, then release every blocked send
+            blocked = self._send_waiters.get(direction, [])
+            self._send_waiters[direction] = []
+            for event in blocked:
+                if not event.triggered:
+                    event.succeed()
+
+    def _delayed_credit_send(
+        self,
+        env: simpy.Environment,
+        direction: str,
+        peer_credit_store: simpy.Store,
+        new_tail: int,
+    ) -> Generator:
+        """Wait bottleneck-BW latency, then put IpcqCreditMetadata into peer
+        credit store (D9 fast path).
+
+        Args:
+            env: SimPy environment used for the latency timeout.
+            direction: direction the data was received on; used for the
+                latency lookup and sent as ``src_direction``.
+            peer_credit_store: Store the credit is put into.
+            new_tail: consumer position reported as ``consumer_seq``.
+        """
+        latency_ns = self._credit_latency_ns(direction)
+        # A zero latency skips the timeout entirely.
+        if latency_ns > 0:
+            yield env.timeout(latency_ns)
+        meta = IpcqCreditMetadata(
+            consumer_seq=new_tail,
+            src_sip=self._self_sip,
+            src_cube=self._self_cube,
+            src_pe=self._self_pe,
+            src_direction=direction,
+        )
+        yield peer_credit_store.put(meta)
+
+    def _credit_latency_ns(self, direction: str) -> float:
+        """Compute credit fast path latency = credit_size / bottleneck_bw.
+
+        Falls back to 0 when ctx/router is unavailable (unit-test mode).
+
+        The path is looked up between this PE's prefix and the peer's
+        ``sip{N}.cube{N}.pe{N}`` prefix, and the drain time of
+        ``_credit_size_bytes`` over that path is returned. Any exception
+        raised by the lookup or the drain computation also yields 0.0.
+        """
+        if self.ctx is None:
+            return 0.0
+        qp = self._queue_pairs[direction]
+        peer = qp["peer"]
+        peer_pe_prefix = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}"
+        try:
+            path = self.ctx.router.find_path(self._pe_prefix, peer_pe_prefix)
+            return self.ctx.compute_drain_ns(path, self._credit_size_bytes)
+        except Exception:
+            # Deliberate best-effort: a credit without latency is preferred
+            # over failing the recv that triggered it.
+            return 0.0
+
+    # ── Helpers ──
+
+    @staticmethod
+    def _nbytes_for(shape: tuple[int, ...], dtype: str) -> int:
+        """Return the payload size in bytes for ``shape`` and ``dtype``.
+
+        Knows the float (f16/bf16/f32), signed (i8/i16/i32) and unsigned
+        (u8/u16/u32) dtype strings; the unsigned ones are produced by
+        ``_numpy_to_dtype_str`` and must not fall through to the default.
+        An unrecognized dtype is sized as 16 bits. An empty ``shape``
+        yields 0.
+        """
+        from math import prod
+        bits = {
+            "f16": 16, "bf16": 16, "f32": 32,
+            "i8": 8, "i16": 16, "i32": 32,
+            "u8": 8, "u16": 16, "u32": 32,
+        }.get(dtype, 16)
+        return prod(shape) * (bits // 8) if shape else 0
diff --git a/src/kernbench/runtime_api/bench_runner.py b/src/kernbench/runtime_api/bench_runner.py
index 4f478f7..bc80cc0 100644
--- a/src/kernbench/runtime_api/bench_runner.py
+++ b/src/kernbench/runtime_api/bench_runner.py
@@ -29,11 +29,10 @@ def run_bench(
     correlation_id: str = "bench0",
     completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
 ) -> BenchResult:
-    """
-    Minimal bench runner.
+    """Minimal bench runner.
 
     - topology: compiled topology object (opaque to runtime here)
-    - bench_fn: callable that receives RuntimeContext and submits requests
+    - bench_fn: callable ``run(torch)`` receiving a RuntimeContext
     - device: DeviceSelector ("all" or "sip:<N>")
     - engine_factory: builds sim_engine for given topology & device
     - completion_policy: how to determine overall completion/result
@@ -48,7 +47,6 @@ def run_bench(
     )
 
     bench_fn(ctx)
-
     ctx.wait_all()
 
     collected_traces = ctx._traces or None
diff --git a/src/kernbench/runtime_api/context.py b/src/kernbench/runtime_api/context.py
index b522ed5..3b2afc6 100644
--- a/src/kernbench/runtime_api/context.py
+++ b/src/kernbench/runtime_api/context.py
@@ -9,6 +9,39 @@ from kernbench.common.types import Completion, RequestHandle, SimEngine
 from .types import DeviceSelector
 
 
+def _world_size_from_spec(spec: dict | None) -> int:
+    """Compute world_size as sips × cubes_per_sip × pes_per_cube.
+
+    Every missing key defaults to 1, and an empty or absent ``corners``
+    list counts as one corner, so ``None`` or ``{}`` gives 1.
+    """
+    spec = spec or {}
+    sip_count = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+
+    mesh = spec.get("sip", {}).get("cube_mesh", {})
+    cubes_per_sip = int(mesh.get("w", 1)) * int(mesh.get("h", 1))
+
+    layout = spec.get("cube", {}).get("pe_layout", {})
+    corner_count = max(len(layout.get("corners", [])), 1)
+    pes_per_cube = int(layout.get("pe_per_corner", 1)) * corner_count
+
+    return sip_count * cubes_per_sip * pes_per_cube
+
+
+def _numpy_to_dtype_str(np_dtype) -> str:
+    """Translate a numpy dtype into the kernbench dtype string used by Tensor.
+
+    Accepts anything ``np.dtype()`` accepts. Raises ValueError for a dtype
+    outside the supported float16/float32 and 8/16/32-bit integer set.
+    """
+    import numpy as np
+
+    wanted = np.dtype(np_dtype)
+    supported = (
+        (np.float16, "f16"),
+        (np.float32, "f32"),
+        (np.int8, "i8"),
+        (np.int16, "i16"),
+        (np.int32, "i32"),
+        (np.uint8, "u8"),
+        (np.uint16, "u16"),
+        (np.uint32, "u32"),
+    )
+    label = next(
+        (name for np_type, name in supported if np.dtype(np_type) == wanted),
+        None,
+    )
+    if label is None:
+        raise ValueError(f"unsupported numpy dtype: {np_dtype!r}")
+    return label
+
+
 @dataclass
 class RuntimeContext:
     engine: SimEngine
@@ -23,6 +56,66 @@ class RuntimeContext:
     _tensor_counter: int = field(default=0, init=False)
     _traces: list[dict] = field(default_factory=list, init=False)
     _tensors: list[Any] = field(default_factory=list, init=False)
+    distributed: Any = field(default=None, init=False)  # DistributedContext for CCL benches
+    _ipcq_plan: dict = field(default_factory=dict, init=False)  # ADR-0023 install plan
+
+    def __post_init__(self) -> None:
+        """Attach a fresh DistributedContext bound back to this context."""
+        # Eagerly attach a DistributedContext so bench code can do
+        # ``dist = torch.distributed`` + ``dist.init_process_group(...)``
+        # without needing a separate launcher to install it.
+        # Imported here rather than at module top level.
+        from kernbench.runtime_api.distributed import DistributedContext
+        dc = DistributedContext()
+        dc._ctx_ref = self  # back-reference for AhbmCCLBackend to reach ctx.launch etc.
+        self.distributed = dc
+
+    def install_ipcq(
+        self,
+        algorithm: str | None = None,
+        ccl_yaml: str | None = None,
+        world_size_override: int | None = None,
+        rank_to_pe: list[tuple[int, int, int]] | None = None,
+    ) -> dict:
+        """Install IPCQ neighbor tables on all participating PEs (ADR-0023 D10).
+
+        Loads ``ccl.yaml`` (or the path provided), resolves the chosen
+        algorithm (or ``defaults.algorithm`` if None), and pushes per-PE
+        IpcqInitMsg into every PE_IPCQ component via the engine.
+
+        Args:
+            algorithm: name of the algorithm in ccl.yaml (or use defaults).
+            ccl_yaml: optional path to ccl.yaml.
+            world_size_override: if set, replace the algorithm's world_size.
+            rank_to_pe: optional per-rank placement, passed through
+                unchanged to the installer (presumably (sip, cube, pe)
+                triples indexed by rank — confirm against the installer).
+
+        Returns the install plan dict (rank → (sip,cube,pe), neighbor table).
+        The plan and the merged config are also stored on the context as
+        ``_ipcq_plan`` and ``_ipcq_config``.
+        """
+        import importlib
+        from kernbench.ccl.install import (
+            install_ipcq as _install,
+            load_ccl_config,
+            resolve_algorithm_config,
+        )
+
+        cfg = load_ccl_config(ccl_yaml)
+        merged = resolve_algorithm_config(cfg, algorithm)
+        if world_size_override is not None:
+            merged["world_size"] = world_size_override
+        elif "world_size" not in merged:
+            # Derive from topology.yaml when neither the algorithm entry
+            # nor ``defaults`` carries ``world_size`` (matches pytorch DDP
+            # where env vars determine ranks, not the ccl config file).
+            merged["world_size"] = _world_size_from_spec(self.spec)
+        # The algorithm module is optional: when it cannot be imported the
+        # installer is called with algo_module=None.
+        algo_module = None
+        try:
+            algo_module = importlib.import_module(merged["module"])
+        except ModuleNotFoundError:
+            pass
+        plan = _install(
+            self.engine, self.spec, merged,
+            algo_module=algo_module, rank_to_pe=rank_to_pe,
+        )
+        self._ipcq_plan = plan
+        self._ipcq_config = merged
+        return plan
 
     def __enter__(self):
         return self
@@ -258,6 +351,24 @@ class RuntimeContext:
         """Allocate a tensor in HBM without initialization (like torch.empty)."""
         return self._create_tensor(shape, dtype, name, pattern=None, dp=dp)
 
+    def from_numpy(self, arr: Any):
+        """Create a host-side tensor wrapping a numpy array.
+
+        Mirrors ``torch.from_numpy``. The returned tensor is NOT deployed
+        to any PE — it lives in an in-memory host staging buffer. Use
+        ``target.copy_(host_tensor)`` to scatter its contents into a
+        sharded, deployed tensor.
+
+        Args:
+            arr: array-like input; it is converted with
+                ``np.ascontiguousarray`` before being stored.
+
+        Returns:
+            A Tensor named "host" whose ``_host_buffer`` holds the
+            contiguous array.
+
+        Raises:
+            ValueError: the array's dtype has no kernbench dtype string.
+        """
+        import numpy as np
+        from kernbench.runtime_api.tensor import Tensor
+
+        arr_c = np.ascontiguousarray(arr)
+        dtype_str = _numpy_to_dtype_str(arr_c.dtype)
+        t = Tensor(shape=tuple(arr_c.shape), dtype=dtype_str, name="host")
+        t._host_buffer = arr_c
+        # None when the engine exposes no ``_memory_store`` attribute.
+        t._memory_store = getattr(self.engine, "_memory_store", None)
+        return t
+
     def _create_tensor(
         self,
         shape: tuple[int, ...],
@@ -418,13 +529,12 @@ class RuntimeContext:
             TensorArgShard,
         )
         from kernbench.runtime_api.tensor import Tensor
-        from kernbench.triton_emu.registry import register_kernel
+        from kernbench.triton_emu.registry import _kernels, register_kernel
 
-        # Register kernel (idempotent)
-        try:
-            register_kernel(kernel_name, kernel_fn)
-        except ValueError:
-            pass
+        # Register kernel (idempotent overwrite — last call wins).
+        # Tests can re-register the same kernel_name with a different
+        # function; the user's most recent launch must use the latest fn.
+        _kernels[kernel_name] = kernel_fn
 
         # Collect tensors and scalars
         tensor_args: list[Tensor] = []
@@ -506,6 +616,7 @@ class RuntimeContext:
 
         # Per-SIP kernel launch: each SIP gets TensorArgs with local va_base
         last_handle = None
+        _pending_handles: list[tuple[Any, int]] = []
         for sip_id in sorted(sip_set):
             sip_kernel_args: list = []
             sip_cube_set: set[int] = set()
@@ -566,10 +677,17 @@ class RuntimeContext:
                 target_cubes=target_cubes,
                 target_pe=target_pe,
             ))
+            # Defer wait until all SIPs are submitted (multi-SIP CCL needs
+            # all participating PEs to be live concurrently — waiting
+            # per-SIP would deadlock when ranks span SIP boundaries).
+            _pending_handles.append((h, sip_id))
+            last_handle = h
+
+        # Drain pending handles now that every SIP has a launch posted.
+        for h, sip_id in _pending_handles:
             self.wait(h, _meta={
                 "phase": "kernel", "name": kernel_name,
                 "sip": sip_id, "target_pe": target_pe,
             })
-            last_handle = h
 
         return last_handle
diff --git a/src/kernbench/runtime_api/distributed.py b/src/kernbench/runtime_api/distributed.py
new file mode 100644
index 0000000..e2a3231
--- /dev/null
+++ b/src/kernbench/runtime_api/distributed.py
@@ -0,0 +1,179 @@
+"""PyTorch-compatible distributed communication shim (ADR-0023 D11).
+
+Provides a ``torch.distributed``-like API whose public surface matches
+real PyTorch so that bench code looks identical to a DDP training script.
+
+Only the ``ahbm`` backend is implemented. It:
+
+1. Reads ``ccl.yaml`` to decide which collective algorithm to run.
+2. Derives world_size from the algorithm entry, the defaults section, or
+   from the topology spec (``system.sips.count × sip.cube_mesh × pe_layout``).
+3. At ``init_process_group`` time, eagerly installs the IPCQ neighbor
+   table once (one-time comm setup — mirrors NCCL communicator creation).
+4. On each ``all_reduce(tensor)`` call, reads per-shard metadata from the
+   tensor handle and dispatches ``torch.launch`` with the registered
+   kernel. The kernel performs inter-PE ring/tree/mesh CCL via IPCQ,
+   and Phase 2 DataExecutor replays math + copies from op_log so
+   MemoryStore is correct when ``all_reduce`` returns.
+
+Host bench code uses only real-PyTorch names:
+    dist.init_process_group, dist.is_initialized, dist.get_world_size,
+    dist.get_rank, dist.get_backend, dist.all_reduce, dist.barrier
+"""
+from __future__ import annotations
+
+import importlib
+from typing import Any
+
+
+class AhbmCCLBackend:
+    """Ahbm CCL backend — drives kernel-level collectives via IPCQ.
+
+    Construction loads the CCL config, imports the algorithm module,
+    resolves the world size and installs the IPCQ neighbor tables through
+    the bound runtime context.
+    """
+
+    def __init__(self, torch_ctx: Any) -> None:
+        """Bind to ``torch_ctx`` and perform the one-time IPCQ install.
+
+        Args:
+            torch_ctx: runtime context providing ``spec``, ``install_ipcq``
+                and ``launch``.
+        """
+        from kernbench.ccl.install import (
+            load_ccl_config,
+            resolve_algorithm_config,
+        )
+
+        self.ctx = torch_ctx
+        self._cfg_all = load_ccl_config()
+        self._merged = resolve_algorithm_config(self._cfg_all)
+        self._algo_module = importlib.import_module(self._merged["module"])
+        self._world_size = self._resolve_world_size()
+
+        # Eager IPCQ install — ``init_process_group`` time. Mirrors NCCL
+        # communicator creation: done once, reused across every subsequent
+        # collective call on the same process group.
+        self.ctx.install_ipcq(
+            algorithm=self._merged["algorithm"],
+            world_size_override=self._world_size,
+        )
+
+    def _resolve_world_size(self) -> int:
+        """Derive world_size (priority: algorithm override > defaults > topology).
+
+        Topology derivation:
+            sips × cubes_per_sip × pes_per_cube
+        """
+        if "world_size" in self._merged:
+            return int(self._merged["world_size"])
+        defaults = self._cfg_all.get("defaults", {})
+        if "world_size" in defaults:
+            return int(defaults["world_size"])
+        # Reuse the helper that install_ipcq applies, so both code paths
+        # derive the same number from the same spec. Imported locally:
+        # context.py itself imports this module inside a method.
+        from kernbench.runtime_api.context import _world_size_from_spec
+
+        return _world_size_from_spec(self.ctx.spec)
+
+    @property
+    def world_size(self) -> int:
+        """Number of ranks the IPCQ tables were installed for."""
+        return self._world_size
+
+    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
+        """Dispatch the configured CCL algorithm as a single kernel launch.
+
+        Raises if ``op != "sum"`` (current kernels only implement add
+        reduction) or if the tensor's shard count disagrees with the
+        world_size that was installed into PE_IPCQ.
+
+        Raises:
+            NotImplementedError: ``op`` is not "sum".
+            RuntimeError: the tensor is not deployed, or its shard count
+                differs from the installed world_size.
+        """
+        if op != "sum":
+            raise NotImplementedError(f"all_reduce op={op!r} not supported")
+        if tensor._handle is None:
+            raise RuntimeError(
+                f"Tensor '{tensor.name}' is not deployed (call torch.zeros "
+                "with a DPPolicy first)"
+            )
+        shards = tensor._handle.shards
+        if len(shards) != self._world_size:
+            raise RuntimeError(
+                f"all_reduce tensor has {len(shards)} shards but the "
+                f"ahbm backend was installed with world_size="
+                f"{self._world_size}; adjust the tensor's DPPolicy or "
+                "restart the process group"
+            )
+        # Per-rank element count is taken from the first shard.
+        n_elem = shards[0].nbytes // tensor.itemsize
+        kernel_fn = self._algo_module.kernel
+        kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)
+        self.ctx.launch(
+            self._merged["algorithm"], kernel_fn, tensor, *kernel_args,
+        )
+
+    def barrier(self) -> None:
+        """No-op kept for ``dist.barrier()`` API compatibility."""
+        # Single-driver model → no cross-process sync needed. Keeping the
+        # method so ``dist.barrier()`` is callable (pytorch-compat surface).
+        return None
+
+
+class DistributedContext:
+    """Facade exposing a ``torch.distributed``-shaped API.
+
+    The method names match real PyTorch so bench code reads like a DDP
+    training script. kernbench runs as one Python process, so
+    ``get_rank()`` is always 0, while ``get_world_size()`` reports the CCL
+    group size (number of PEs participating in the collective).
+    """
+
+    def __init__(self) -> None:
+        self._backend: AhbmCCLBackend | None = None
+
+    def init_process_group(
+        self,
+        backend: str = "ahbm",
+        world_size: int | None = None,
+        rank: int | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Create the default process group.
+
+        ``world_size`` and ``rank`` exist only for signature parity with
+        ``torch.distributed.init_process_group`` and are ignored: the ahbm
+        backend derives both from ``ccl.yaml`` + topology (comparable to
+        reading ``RANK``/``WORLD_SIZE`` env vars in real DDP).
+
+        Raises:
+            ValueError: ``backend`` is not "ahbm".
+            RuntimeError: no RuntimeContext has been bound to this object.
+        """
+        if backend != "ahbm":
+            raise ValueError(
+                f"Unsupported backend '{backend}'. Only 'ahbm' is supported."
+            )
+        # The owning RuntimeContext sets ``_ctx_ref`` after construction.
+        bound_ctx = getattr(self, "_ctx_ref", None)
+        if bound_ctx is None:
+            raise RuntimeError(
+                "DistributedContext not bound to a RuntimeContext"
+            )
+        self._backend = AhbmCCLBackend(torch_ctx=bound_ctx)
+
+    def is_initialized(self) -> bool:
+        """Report whether ``init_process_group`` has created a backend."""
+        return self._backend is not None
+
+    def get_world_size(self) -> int:
+        """Return the backend's world size."""
+        self._ensure_initialized()
+        return self._backend.world_size
+
+    def get_rank(self) -> int:
+        """Return 0 — single-driver kernbench has exactly one host rank."""
+        self._ensure_initialized()
+        return 0
+
+    def get_backend(self) -> str:
+        """Return the backend name, always "ahbm"."""
+        self._ensure_initialized()
+        return "ahbm"
+
+    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
+        """Forward to the backend's ``all_reduce``."""
+        self._ensure_initialized()
+        self._backend.all_reduce(tensor, op=op)
+
+    def barrier(self) -> None:
+        """Forward to the backend's ``barrier``."""
+        self._ensure_initialized()
+        self._backend.barrier()
+
+    def _ensure_initialized(self) -> None:
+        """Raise RuntimeError unless a backend has been created."""
+        if self._backend is not None:
+            return
+        raise RuntimeError(
+            "Default process group has not been initialized. "
+            "Call init_process_group(backend='ahbm') first."
+        )
diff --git a/src/kernbench/runtime_api/kernel.py b/src/kernbench/runtime_api/kernel.py
index acda736..27fe732 100644
--- a/src/kernbench/runtime_api/kernel.py
+++ b/src/kernbench/runtime_api/kernel.py
@@ -152,3 +152,30 @@ class MmuUnmapMsg:
     target_cubes: tuple[int, ...] | Literal["all"] = "all"
     target_pe: int | Literal["all"] = "all"
     msg_type: Literal["mmu_unmap"] = "mmu_unmap"
+
+
+@dataclass(frozen=True)
+class IpcqInitMsg:
+    """IPCQ neighbor table install (sideband fan-out, ADR-0023 D10/D12).
+
+    Backend issues this at ``init_process_group`` time to install per-PE
+    IPCQ neighbor tables. Each entry covers one direction (N/S/E/W) and
+    carries the peer's IpcqEndpoint plus this PE's own rx_buffer base
+    and a pre-wired SimPy Store for credit return fast path (D9).
+
+    Routing is similar to MmuMapMsg.
+    """
+
+    correlation_id: str
+    request_id: str
+    target_sips: tuple[int, ...] | Literal["all"] = "all"
+    target_cubes: tuple[int, ...] | Literal["all"] = "all"
+    target_pe: int | tuple[int, ...] | Literal["all"] = "all"
+    # entries: tuple[IpcqInitEntry, ...] — kept as tuple of plain objects to
+    # avoid a runtime import cycle (IpcqInitEntry lives in
+    # kernbench.common.ipcq_types).
+    entries: tuple = ()
+    backpressure_mode: str = "sleep"  # "poll" | "sleep"
+    buffer_kind: str = "tcm"          # "tcm" | "hbm" | "sram"
+    credit_size_bytes: int = 16
+    msg_type: Literal["ipcq_init"] = "ipcq_init"
diff --git a/src/kernbench/runtime_api/tensor.py b/src/kernbench/runtime_api/tensor.py
index 7fa40c6..05f86d2 100644
--- a/src/kernbench/runtime_api/tensor.py
+++ b/src/kernbench/runtime_api/tensor.py
@@ -146,6 +146,11 @@ class Tensor:
         self._handle: TensorHandle | None = None
         self._ctx_ref: weakref.ref | None = None  # set by RuntimeContext
         self._memory_store = None  # set by RuntimeContext when enable_data=True
+        # Host-side staging buffer for torch.from_numpy() results. A tensor
+        # with a non-None _host_buffer is NOT deployed to any PE — it lives
+        # only on the host. Use `target.copy_(host_tensor)` to scatter the
+        # data into a deployed, sharded target tensor.
+        self._host_buffer: np.ndarray | None = None
 
     def __del__(self) -> None:
         if self._ctx_ref is None or self._handle is None:
@@ -166,15 +171,85 @@ class Tensor:
 
     @property
     def data(self) -> np.ndarray:
-        """Tensor data as numpy array. Returns actual values when enable_data=True,
-        zeros placeholder otherwise (like an uninitialized tensor)."""
-        if self._memory_store is not None and self._handle is not None:
-            shard = self._handle.shards[0]
+        """Tensor data as numpy array.
+
+        Gathers all shards into a single full-shape array. Returns actual
+        values when enable_data=True, zeros placeholder otherwise (like an
+        uninitialized tensor). Alias of ``numpy()``.
+        """
+        return self.numpy()
+
+    def _shard_store_addr(self, shard: TensorShard) -> int:
+        """Return the MemoryStore address under which ``shard`` is kept.
+
+        The key is ``va_base + shard.offset_bytes`` whenever the handle
+        carries a truthy ``va_base`` (kernels address tensors by VA, which
+        PE_DMA's MMU translates to PA when a mapping exists). Without a
+        handle or ``va_base`` the shard's physical address ``shard.pa`` is
+        used instead (VA-less mode used by some legacy paths).
+        """
+        handle = self._handle
+        va_base = handle.va_base if handle else None
+        return va_base + shard.offset_bytes if va_base else shard.pa
+
+    def numpy(self) -> np.ndarray:
+        """Return a single numpy array gathered from all shards.
+
+        Mirrors ``torch.Tensor.numpy()``. A host tensor yields a copy of its
+        buffer; with no handle or MemoryStore the result is all zeros. Else
+        each shard's range is filled from "hbm"; absent/short data stays 0.
+        """
+        np_dtype = _numpy_dtype(self.dtype)
+        # Host-side tensor (created via torch.from_numpy) has no shards.
+        if self._host_buffer is not None:
+            return self._host_buffer.copy()
+        if self._handle is None or self._memory_store is None:
+            return np.zeros(self.shape, dtype=np_dtype)
+        flat = np.zeros(math.prod(self.shape), dtype=np_dtype)
+        for shard in self._handle.shards:
+            start = shard.offset_bytes // self.itemsize
+            count = shard.nbytes // self.itemsize
             try:
-                return self._memory_store.read("hbm", shard.pa, shape=self.shape, dtype=self.dtype)
+                piece = self._memory_store.read(
+                    "hbm", self._shard_store_addr(shard),
+                )
             except KeyError:
-                pass
-        return np.zeros(self.shape, dtype=_numpy_dtype(self.dtype))
+                continue
+            vals = np.asarray(piece, dtype=np_dtype).reshape(-1)[:count]
+            # Size the target by vals so a short piece cannot broadcast/raise.
+            flat[start : start + vals.size] = vals
+        return flat.reshape(self.shape)
+
+    def copy_(self, source: "Tensor") -> "Tensor":
+        """Overwrite this tensor's shards with the contents of ``source``.
+
+        Mirrors ``torch.Tensor.copy_()``. ``source.numpy()`` supplies the
+        full array (host buffer of a ``torch.from_numpy`` tensor, or the
+        gathered shards of a deployed one); it is cast to self's dtype,
+        flattened, and split by each shard's ``offset_bytes`` / ``nbytes``.
+
+        Raises RuntimeError when self is not deployed and ValueError on a
+        shape mismatch. Returns self.
+        """
+        handle, store = self._handle, self._memory_store
+        if handle is None or store is None:
+            raise RuntimeError(
+                f"Tensor '{self.name}' must be deployed before copy_()"
+            )
+        if source.shape != self.shape:
+            raise ValueError(
+                f"copy_ shape mismatch: self={self.shape} source={source.shape}"
+            )
+        target_dtype = _numpy_dtype(self.dtype)
+        values = source.numpy().astype(target_dtype, copy=False)
+        values = np.ascontiguousarray(values).reshape(-1)
+        for shard in handle.shards:
+            lo = shard.offset_bytes // self.itemsize
+            hi = lo + shard.nbytes // self.itemsize
+            store.write(
+                "hbm", self._shard_store_addr(shard), values[lo:hi].copy(),
+            )
+        return self
 
     @property
     def itemsize(self) -> int:
diff --git a/src/kernbench/sim_engine/data_executor.py b/src/kernbench/sim_engine/data_executor.py
index fe0be02..d52bebb 100644
--- a/src/kernbench/sim_engine/data_executor.py
+++ b/src/kernbench/sim_engine/data_executor.py
@@ -51,7 +51,42 @@ class DataExecutor:
             self._execute_math(op)
 
     def _execute_memory(self, op: OpRecord) -> None:
-        """Memory ops are already handled by Phase 1 MemoryStore. Skip."""
+        """Replay a recorded memory copy during Phase 2 (ADR-0020 + ADR-0023).
+
+        Only ``dma_write`` and ``ipcq_copy`` records are acted on; any other
+        memory op (e.g. ``dma_read``) is a no-op here. For those two ops the
+        payload is written to ``(dst_space, dst_addr)``:
+
+        - the ``snapshot`` stored in the record is used when it is not None,
+          so a source that another op mutated later cannot leak in;
+        - otherwise the payload is read from ``(src_space, src_addr)`` in
+          the MemoryStore (e.g. a math scratch output that only Phase 2's
+          math replay populates).
+
+        Records lacking any of the four src/dst fields (legacy entries) are
+        skipped, as are fallback reads that raise KeyError.
+        """
+        if op.op_name not in ("dma_write", "ipcq_copy"):
+            return
+        params = op.params
+        endpoints = [
+            params.get(key)
+            for key in ("src_space", "src_addr", "dst_space", "dst_addr")
+        ]
+        if any(value is None for value in endpoints):
+            return
+        src_space, src_addr, dst_space, dst_addr = endpoints
+        # Prefer the record-time snapshot over the store's current contents.
+        payload = params.get("snapshot")
+        if payload is None:
+            try:
+                payload = self.store.read(
+                    src_space, src_addr,
+                    shape=params.get("shape"), dtype=params.get("dtype"),
+                )
+            except KeyError:
+                return
+        self.store.write(dst_space, dst_addr, payload)
 
     def _execute_gemm(self, op: OpRecord) -> None:
         """Execute GEMM: out = a @ b."""
@@ -77,18 +112,35 @@ class DataExecutor:
         """Execute math op: unary, binary, or reduction."""
         p = op.params
         math_op = p.get("op", op.op_name)
-        space = p.get("addr_space", "tcm")
         dtype = p.get("dtype", "f32")
         input_addrs = p.get("input_addrs", [])
         input_shapes = p.get("input_shapes", [])
+        # Per-input space/dtype (ADR-0023 CCL accumulation): math ops can
+        # mix inputs from different MemoryStore spaces (e.g. acc in "hbm",
+        # recv slot in "tcm"). Fall back to legacy single-space mode when
+        # the per-input lists are absent.
+        input_spaces = p.get("input_spaces") or [p.get("addr_space", "tcm")] * len(input_addrs)
+        input_dtypes = p.get("input_dtypes") or [dtype] * len(input_addrs)
+        # Per-input data snapshots (ADR-0020 D6): captured at op_log
+        # record time. Phase 1 has correct values for slot/HBM addrs at
+        # that moment, which lets Phase 2 sidestep the slot-wraparound
+        # races where a later round overwrites a slot before this op
+        # runs in t_start order.
+        snapshots = p.get("input_snapshots") or [None] * len(input_addrs)
+        dst_space = p.get("dst_space", p.get("addr_space", "tcm"))
 
         inputs = []
-        for addr, shape in zip(input_addrs, input_shapes):
-            inputs.append(self.store.read(space, addr, shape=shape, dtype=dtype))
+        for addr, shape, space, idtype, snap in zip(
+            input_addrs, input_shapes, input_spaces, input_dtypes, snapshots
+        ):
+            if snap is not None:
+                inputs.append(snap)
+            else:
+                inputs.append(self.store.read(space, addr, shape=shape, dtype=idtype))
 
         result = _compute_math(math_op, inputs, p.get("axis"))
         if result is not None:
-            self.store.write(space, p["dst_addr"], result)
+            self.store.write(dst_space, p["dst_addr"], result)
 
     def verify(self, expected: dict[tuple[str, int], np.ndarray],
                rtol: float = 1e-3, atol: float = 1e-3) -> dict[str, bool]:
@@ -146,6 +198,14 @@ def _compute_math(op: str, inputs: list[np.ndarray], axis: int | None) -> np.nda
     if op == "min":
         return np.min(x, axis=axis, keepdims=True)
 
+    # Softmax (numerically stable)
+    if op == "softmax":
+        ax = axis if axis is not None else -1
+        x_max = np.max(x, axis=ax, keepdims=True)
+        e = np.exp(x - x_max)
+        s = np.sum(e, axis=ax, keepdims=True)
+        return e / s
+
     # Binary
     if len(inputs) >= 2:
         y = inputs[1]
@@ -157,9 +217,18 @@ def _compute_math(op: str, inputs: list[np.ndarray], axis: int | None) -> np.nda
             return x * y
         if op == "div":
             return x / y
+        if op == "maximum":
+            return np.maximum(x, y)
+        if op == "minimum":
+            return np.minimum(x, y)
 
     # Ternary
-    if op == "where" and len(inputs) >= 3:
-        return np.where(inputs[0], inputs[1], inputs[2])
+    if len(inputs) >= 3:
+        if op == "where":
+            return np.where(inputs[0], inputs[1], inputs[2])
+        if op == "fma":
+            return inputs[0] * inputs[1] + inputs[2]
+        if op == "clamp":
+            return np.minimum(np.maximum(inputs[0], inputs[1]), inputs[2])
 
     return None
diff --git a/src/kernbench/sim_engine/engine.py b/src/kernbench/sim_engine/engine.py
index eef7c46..0995397 100644
--- a/src/kernbench/sim_engine/engine.py
+++ b/src/kernbench/sim_engine/engine.py
@@ -51,8 +51,12 @@ class GraphEngine:
         if enable_data:
             from kernbench.sim_engine.memory_store import MemoryStore
             from kernbench.sim_engine.op_log import OpLogger
-            self._op_logger = OpLogger()
             self._memory_store = MemoryStore()
+            self._op_logger = OpLogger(memory_store=self._memory_store)
+        # Cursor for incremental Phase 2 replay (ADR-0020 D6).
+        # SimPy env.now is monotonic so newly logged records always sort
+        # to the tail; the cursor remains valid across waits.
+        self._data_cursor = 0
 
         ctx = ComponentContext(
             router=self._router,
@@ -147,11 +151,60 @@ class GraphEngine:
         self._env.process(self._process(str(handle), request, event))
         return handle
 
+    def _flush_data_phase(self) -> None:
+        """Run DataExecutor over op_log records not replayed yet.
+
+        ADR-0020 D6 Phase 2. No-op unless both the op logger and the
+        MemoryStore exist (data tracking enabled). Otherwise the records
+        from ``self._data_cursor`` onward are replayed and the cursor is
+        advanced to the end of the log, so callers reading MemoryStore
+        between launches observe compute-replayed tensor data.
+
+        The replay is incremental because Phase 2 is NOT idempotent: a
+        math op writes a TCM scratch addr, a later dma_write copies it
+        into HBM[X], and an even-later math op may read HBM[X]. A second
+        full pass would feed the first math op the already-overwritten
+        HBM[X] instead of its original input.
+        """
+        if self._op_logger is None or self._memory_store is None:
+            return
+        records = self._op_logger.records  # sorted by t_start (stable)
+        pending = records[self._data_cursor:]
+        if not pending:
+            return
+        from kernbench.sim_engine.data_executor import DataExecutor
+        DataExecutor(pending, self._memory_store).run()
+        self._data_cursor = len(records)
+
     def wait(self, handle: RequestHandle) -> None:
         key = str(handle)
         event = self._events[key]
         if not event.triggered:
-            self._env.run(until=event)
+            try:
+                self._env.run(until=event)
+            except (simpy.core.EmptySchedule, RuntimeError) as exc:
+                # SimPy raises EmptySchedule directly OR (in newer simpy)
+                # wraps it as a RuntimeError("No scheduled events left ...").
+                # Either case while our event is still pending → IPCQ deadlock.
+                msg = str(exc)
+                is_deadlock = (
+                    isinstance(exc, simpy.core.EmptySchedule)
+                    or "No scheduled events left" in msg
+                )
+                if not is_deadlock:
+                    raise
+                from kernbench.ccl.diagnostics import IpcqDeadlock, pointer_dump
+                dump = pointer_dump(self)
+                if dump.strip():
+                    raise IpcqDeadlock(
+                        "IPCQ deadlock: simulation schedule empty while "
+                        f"request {handle!r} is still pending.\n"
+                        f"Pointer state:\n{dump}"
+                    ) from None
+                raise
+        # ADR-0020: replay newly logged ops so the caller observes
+        # post-Phase-2 tensor state from MemoryStore.
+        self._flush_data_phase()
 
     def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
         return self._results[str(handle)]
diff --git a/src/kernbench/sim_engine/op_log.py b/src/kernbench/sim_engine/op_log.py
index bf0f5de..f0accd8 100644
--- a/src/kernbench/sim_engine/op_log.py
+++ b/src/kernbench/sim_engine/op_log.py
@@ -29,9 +29,13 @@ class OpLogger:
     Records are maintained in t_start stable ordering (insertion order).
     """
 
-    def __init__(self) -> None:
+    def __init__(self, memory_store: Any | None = None) -> None:
         self._records: list[OpRecord] = []
         self._pending: dict[int, dict[str, Any]] = {}  # msg id → partial record
+        # Optional MemoryStore reference. When set, math, dma_write and
+        # ipcq_copy records capture source data snapshots at record time so
+        # Phase 2 replay does not depend on slot/scratch addrs surviving.
+        self._memory_store = memory_store
 
     @property
     def records(self) -> list[OpRecord]:
@@ -53,6 +57,38 @@ class OpLogger:
         if pending is None:
             return
         op_kind, op_name, params = _extract_op_info(msg)
+        # Snapshot data at record time so Phase 2 replay sidesteps
+        # downstream mutations of source addrs (e.g. a tl.store that
+        # overwrites HBM after a load handle was sent, or a slot that
+        # gets reused on the next ring round).
+        if self._memory_store is not None:
+            if op_kind == "math":
+                snaps: list[Any] = []
+                for addr, shape, space, idtype in zip(
+                    params.get("input_addrs", []),
+                    params.get("input_shapes", []),
+                    params.get("input_spaces", []),
+                    params.get("input_dtypes", []),
+                ):
+                    try:
+                        arr = self._memory_store.read(
+                            space, addr, shape=shape, dtype=idtype,
+                        )
+                        snaps.append(arr.copy() if hasattr(arr, "copy") else arr)
+                    except Exception:
+                        snaps.append(None)
+                params["input_snapshots"] = snaps
+            elif op_name == "dma_write":
+                try:
+                    arr = self._memory_store.read(
+                        params["src_space"], params["src_addr"],
+                        shape=params.get("shape"), dtype=params.get("dtype"),
+                    )
+                    params["snapshot"] = (
+                        arr.copy() if hasattr(arr, "copy") else arr
+                    )
+                except Exception:
+                    params["snapshot"] = None
         self._records.append(OpRecord(
             t_start=pending["t_start"],
             t_end=t,
@@ -62,6 +98,45 @@ class OpLogger:
             params=params,
         ))
 
+    def record_copy(
+        self, t_start: float, t_end: float, component_id: str,
+        src_space: str, src_addr: int,
+        dst_space: str, dst_addr: int,
+        shape: tuple[int, ...], dtype: str, nbytes: int,
+    ) -> None:
+        """Append an ``ipcq_copy`` memory record for Phase 2 replay.
+
+        Per ADR-0023 + ADR-0020, used by PE_DMA at outbound (sender)
+        time. When a MemoryStore is attached, the source data at
+        ``(src_space, src_addr)`` is read and stored (copied when it has a
+        ``copy`` method) as the record's ``snapshot``, shielding replay
+        from later mutations of the source addr.
+
+        If no store is attached or the read/copy raises (e.g. a math
+        scratch output not materialized in Phase 1), ``snapshot`` is None
+        and Phase 2 reads the source from MemoryStore at replay time.
+        """
+        store = self._memory_store
+        snapshot = None
+        if store is not None:
+            try:
+                source = store.read(
+                    src_space, src_addr, shape=shape, dtype=dtype,
+                )
+                snapshot = source.copy() if hasattr(source, "copy") else source
+            except Exception:
+                snapshot = None
+        copy_params = {
+            "src_space": src_space, "src_addr": src_addr,
+            "dst_space": dst_space, "dst_addr": dst_addr,
+            "shape": shape, "dtype": dtype, "nbytes": nbytes,
+            "snapshot": snapshot,
+        }
+        self._records.append(OpRecord(
+            t_start=t_start, t_end=t_end, component_id=component_id,
+            op_kind="memory", op_name="ipcq_copy", params=copy_params,
+        ))
+
 
 def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
     """Extract op_kind, op_name, params from a data_op message."""
@@ -76,6 +151,11 @@ def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
         }
     if isinstance(msg, DmaWriteCmd):
         return "memory", "dma_write", {
+            "src_space": getattr(msg.handle, "space", "tcm"),
+            "src_addr": msg.handle.addr,
+            "shape": msg.handle.shape,
+            "dtype": msg.handle.dtype,
+            "dst_space": "hbm",
             "dst_addr": msg.dst_addr,
             "nbytes": msg.nbytes,
             "handle_id": msg.handle.id,
@@ -96,7 +176,10 @@ def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
         return "math", msg.op, {
             "input_addrs": [h.addr for h in msg.inputs],
             "input_shapes": [h.shape for h in msg.inputs],
+            "input_spaces": [getattr(h, "space", "tcm") for h in msg.inputs],
+            "input_dtypes": [h.dtype for h in msg.inputs],
             "dst_addr": msg.out.addr,
+            "dst_space": getattr(msg.out, "space", "tcm"),
             "shape_out": msg.out.shape,
             "dtype": msg.out.dtype,
             "axis": msg.axis,
diff --git a/src/kernbench/topology/builder.py b/src/kernbench/topology/builder.py
index ae18f3c..2337a1b 100644
--- a/src/kernbench/topology/builder.py
+++ b/src/kernbench/topology/builder.py
@@ -25,6 +25,7 @@ _PE_COMP_OFFSETS = {
     "pe_math": (0.0, 0.15),
     "pe_mmu": (0.15, -0.15),
     "pe_tcm": (0.3, 0.0),
+    "pe_ipcq": (-0.15, 0.15),
 }
 
 
@@ -698,6 +699,20 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
                 kind="pe_internal",
             ))
 
+    # PE_IPCQ edges (ADR-0023 D1, D9, D10)
+    ipcq_edges = [
+        ("pe_cpu",  "pe_ipcq", "cpu_to_ipcq_mm"),  # IpcqRequest
+        ("pe_ipcq", "pe_dma",  "ipcq_to_dma_mm"),  # IpcqDmaToken outbound
+        ("pe_dma",  "pe_ipcq", "dma_to_ipcq_mm"),  # IpcqMetaArrival inbound
+    ]
+    for src_c, dst_c, mm_key in ipcq_edges:
+        if mm_key in pe_links:
+            edges.append(Edge(
+                src=f"{pp}.{src_c}", dst=f"{pp}.{dst_c}",
+                distance_mm=pe_links[mm_key],
+                kind="pe_internal",
+            ))
+
 
 # ── Inter-cube / IO / system edges ──────────────────────────────────
 
@@ -765,7 +780,13 @@ def _add_io_to_cube_edges(
 def _add_system_to_io_edges(
     edges: list[Edge], sp: str, sip_spec: dict, system: dict,
 ) -> None:
-    """Add fabric switch → IO chiplet PCIe edges."""
+    """Add bidirectional fabric switch ↔ IO chiplet PCIe edges.
+
+    Both directions are needed:
+      switch → pcie_ep   for host→device traffic (memory writes, kernel launch)
+      pcie_ep → switch   for device-side outbound traffic (cross-SIP IPCQ
+                          send between PE_DMAs through the system switch).
+    """
     sw_id = "fabric.switch0"
     sys_link = system["links"]["io_ep_to_switch"]
     for inst in sip_spec["iochiplet"]["instances"]:
@@ -776,6 +797,12 @@ def _add_system_to_io_edges(
             bw_gbs=sys_link["bw_gbs_per_ep"],
             kind="pcie",
         ))
+        edges.append(Edge(
+            src=pcie_ep_id, dst=sw_id,
+            distance_mm=sys_link["distance_mm"],
+            bw_gbs=sys_link["bw_gbs_per_ep"],
+            kind="pcie",
+        ))
 
 
 # ── View builders ────────────────────────────────────────────────────
@@ -1113,13 +1140,14 @@ def _build_pe_view(spec: dict) -> ViewGraph:
         "pe_math": (7.0, 6.5),
         "pe_mmu": (4.0, 1.5),
         "pe_tcm": (10.0, 4.0),
+        "pe_ipcq": (4.0, 6.5),
     }
 
     nodes: dict[str, Node] = {}
     view_edges: list[Edge] = []
 
     for comp_name, comp_spec in pe_tmpl["components"].items():
-        px, py = positions[comp_name]
+        px, py = positions.get(comp_name, (1.0, 1.0))
         nodes[comp_name] = Node(
             id=comp_name, kind=comp_spec["kind"], impl=comp_spec["impl"],
             attrs=comp_spec["attrs"], pos_mm=(px, py),
diff --git a/src/kernbench/triton_emu/kernel_runner.py b/src/kernbench/triton_emu/kernel_runner.py
index 593733f..ca25ea1 100644
--- a/src/kernbench/triton_emu/kernel_runner.py
+++ b/src/kernbench/triton_emu/kernel_runner.py
@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any
 import simpy
 from greenlet import greenlet
 
+from kernbench.common.ipcq_types import IpcqRecvCmd, IpcqRequest, IpcqSendCmd, RecvFuture
 from kernbench.common.pe_commands import (
     CompletionHandle,
     CompositeCmd,
@@ -51,6 +52,9 @@ class KernelRunner:
         out_ports: dict[str, simpy.Store],
         store: MemoryStore | None = None,
         num_cubes: int = 1,
+        ipcq_id: str | None = None,
+        scratch_base: int = 0,
+        scratch_size: int = 1 << 20,
     ) -> None:
         self._pe_prefix = pe_prefix
         self._pe_idx = pe_idx
@@ -61,6 +65,13 @@ class KernelRunner:
         self._out_ports = out_ports
         self._store = store
         self._parent: greenlet | None = None
+        # IPCQ port id (ADR-0023); None selects "<pe_prefix>.pe_ipcq". Send/recv raise if unwired.
+        self._ipcq_id = ipcq_id or f"{pe_prefix}.pe_ipcq"
+        # PE-local scratch for compute output TensorHandles (ADR-0020 D3
+        # extension). The TLContext allocates from this pool when math/dot
+        # ops produce a result that may later be used as a send/store source.
+        self._scratch_base = scratch_base
+        self._scratch_size = scratch_size
 
     def run(
         self,
@@ -89,7 +100,10 @@ class KernelRunner:
             num_cubes=self._num_cubes,
             dispatch_cycles=0,
             runner=self,
+            scratch_base=self._scratch_base,
+            scratch_size=self._scratch_size,
         )
+        self._tl = tl  # exposed so switch_to_simpy can re-set on restore
 
         def _kernel_entry():
             TLContext._set_active(tl)  # type: ignore[attr-defined]
@@ -103,13 +117,20 @@ class KernelRunner:
         pending: dict[str, simpy.Event] = {}
         composite_results: list[dict] = []
 
+        # Helper: set our tl as active just before resuming the kernel.
+        # Multiple PE kernel runners share the same thread-local; without
+        # this, another runner's kernel may have left a different context.
+        def _switch_kernel(*args):
+            TLContext._set_active(tl)  # type: ignore[attr-defined]
+            return g.switch(*args)
+
         # Start kernel — first switch returns first command (or None if kernel is done)
-        cmd = g.switch()
+        cmd = _switch_kernel()
 
         while cmd is not None:
             if isinstance(cmd, PeCpuOverheadCmd):
                 yield env.timeout(cmd.cycles)
-                cmd = g.switch()
+                cmd = _switch_kernel()
 
             elif isinstance(cmd, WaitCmd):
                 if cmd.handle is not None:
@@ -120,7 +141,7 @@ class KernelRunner:
                     for evt in pending.values():
                         yield evt
                     pending.clear()
-                cmd = g.switch()
+                cmd = _switch_kernel()
 
             elif isinstance(cmd, DmaReadCmd):
                 # Dispatch DMA through SimPy components
@@ -141,10 +162,12 @@ class KernelRunner:
                         )
                     except KeyError:
                         pass
-                cmd = g.switch(data)
+                cmd = _switch_kernel(data)
 
             elif isinstance(cmd, DmaWriteCmd):
-                # Write to MemoryStore first (visibility = issue, ADR-0020 D3)
+                # Write to MemoryStore first (visibility = issue, ADR-0020 D3).
+                # When data is None (e.g. timing-only TensorHandle math result),
+                # this is a no-op; Phase 2 dma_write replay handles those.
                 if self._store is not None and cmd.handle.data is not None:
                     self._store.write("hbm", cmd.dst_addr, cmd.handle.data)
 
@@ -154,7 +177,7 @@ class KernelRunner:
                 )
                 yield self._out_ports[self._scheduler_id].put(pe_txn)
                 yield done_evt
-                cmd = g.switch()
+                cmd = _switch_kernel()
 
             elif isinstance(cmd, CompositeCmd):
                 # Non-blocking composite
@@ -165,7 +188,7 @@ class KernelRunner:
                 composite_results.append(pe_txn.result_data)
                 yield self._out_ports[self._scheduler_id].put(pe_txn)
                 pending[cmd.completion.id] = done_evt
-                cmd = g.switch()
+                cmd = _switch_kernel()
 
             elif isinstance(cmd, (GemmCmd, MathCmd)):
                 # Blocking compute command
@@ -175,7 +198,90 @@ class KernelRunner:
                 )
                 yield self._out_ports[self._scheduler_id].put(pe_txn)
                 yield done_evt
-                cmd = g.switch()
+                cmd = _switch_kernel()
+
+            elif isinstance(cmd, IpcqSendCmd):
+                # Forward IpcqRequest to PE_IPCQ, wait for done
+                if self._ipcq_id not in self._out_ports:
+                    raise RuntimeError(
+                        f"PE_IPCQ port {self._ipcq_id!r} not wired to runner"
+                    )
+                done_evt = env.event()
+                req = IpcqRequest(command=cmd, done=done_evt)
+                yield self._out_ports[self._ipcq_id].put(req)
+                yield done_evt
+                cmd = _switch_kernel()
+
+            elif isinstance(cmd, IpcqRecvCmd):
+                if self._ipcq_id not in self._out_ports:
+                    raise RuntimeError(
+                        f"PE_IPCQ port {self._ipcq_id!r} not wired to runner"
+                    )
+                done_evt = env.event()
+                req = IpcqRequest(command=cmd, done=done_evt)
+                yield self._out_ports[self._ipcq_id].put(req)
+                yield done_evt
+                # Read actual data from MemoryStore at the slot address
+                data = None
+                src_space = req.result_data.get("src_space", "tcm")
+                src_addr = req.result_data.get("src_addr", 0)
+                if self._store is not None:
+                    try:
+                        data = self._store.read(
+                            src_space, src_addr,
+                            shape=cmd.shape, dtype=cmd.dtype,
+                        )
+                    except KeyError:
+                        pass
+                # Build result dict for tl.recv to wrap in TensorHandle
+                result = {
+                    "data": data,
+                    "src_space": src_space,
+                    "src_addr": src_addr,
+                    "direction": req.result_data.get("direction", cmd.direction),
+                    "dtype": cmd.dtype,
+                    "shape": cmd.shape,
+                    "nbytes": req.result_data.get("nbytes", 0),
+                }
+                cmd = _switch_kernel(result)
+
+            elif isinstance(cmd, tuple) and len(cmd) == 2 and cmd[0] == "recv_async":
+                # Non-blocking recv: post the IpcqRequest now, store the
+                # event in the future, return None to kernel.
+                future: RecvFuture = cmd[1]
+                done_evt = env.event()
+                req = IpcqRequest(command=future.cmd, done=done_evt)
+                future.request = req
+                future.event = done_evt
+                yield self._out_ports[self._ipcq_id].put(req)
+                cmd = _switch_kernel(None)
+
+            elif isinstance(cmd, tuple) and len(cmd) == 2 and cmd[0] == "recv_wait":
+                future = cmd[1]
+                if not future.event.triggered:
+                    yield future.event
+                req = future.request
+                src_space = req.result_data.get("src_space", "tcm")
+                src_addr = req.result_data.get("src_addr", 0)
+                data = None
+                if self._store is not None:
+                    try:
+                        data = self._store.read(
+                            src_space, src_addr,
+                            shape=future.cmd.shape, dtype=future.cmd.dtype,
+                        )
+                    except KeyError:
+                        pass
+                result = {
+                    "data": data,
+                    "src_space": src_space,
+                    "src_addr": src_addr,
+                    "direction": req.result_data.get("direction", future.cmd.direction),
+                    "dtype": future.cmd.dtype,
+                    "shape": future.cmd.shape,
+                    "nbytes": req.result_data.get("nbytes", 0),
+                }
+                cmd = _switch_kernel(result)
 
             else:
                 # Unknown command — pass through as blocking
@@ -185,7 +291,7 @@ class KernelRunner:
                 )
                 yield self._out_ports[self._scheduler_id].put(pe_txn)
                 yield done_evt
-                cmd = g.switch()
+                cmd = _switch_kernel()
 
         # Wait remaining pending composites
         for evt in pending.values():
diff --git a/src/kernbench/triton_emu/tl_context.py b/src/kernbench/triton_emu/tl_context.py
index 3498a84..22e43d6 100644
--- a/src/kernbench/triton_emu/tl_context.py
+++ b/src/kernbench/triton_emu/tl_context.py
@@ -17,6 +17,7 @@ from __future__ import annotations
 import math
 from typing import Literal
 
+from kernbench.common.ipcq_types import IpcqRecvCmd, IpcqSendCmd, RecvFuture
 from kernbench.common.pe_commands import (
     CompletionHandle,
     CompositeCmd,
@@ -55,6 +56,8 @@ class TLContext:
         runner: Any = None,
         cube_id: int = 0,
         num_cubes: int = 1,
+        scratch_base: int = 0,
+        scratch_size: int = 1 << 20,  # 1 MiB per kernel invocation
     ) -> None:
         self._pe_id = pe_id
         self._num_programs = num_programs
@@ -65,6 +68,33 @@ class TLContext:
         self._handle_counter = 0
         self._completion_counter = 0
         self._runner = runner  # KernelRunner for greenlet mode (ADR-0020 D3)
+        # PE-local scratch allocator for math/compute output handles.
+        # Each binary/unary/reduction op auto-allocates a unique addr from
+        # this pool so the resulting TensorHandle can be the source of a
+        # later tl.send / tl.store. Cursor resets on every kernel invocation.
+        self._scratch_base = scratch_base
+        self._scratch_size = scratch_size
+        self._scratch_cursor = 0
+
+    def _scratch_alloc(self, nbytes: int) -> int:
+        """Reserve ``nbytes`` (rounded up to 16 B) of scratch; return its address.
+
+        Returns 0 when ``scratch_base`` is 0 (no pool configured). Raises
+        RuntimeError on overflow, leaving the cursor unchanged in that case.
+        """
+        if self._scratch_base == 0:
+            return 0
+        # 16-byte alignment; the cursor is committed only after the bounds check.
+        aligned = (nbytes + 15) & ~15
+        new_cursor = self._scratch_cursor + aligned
+        if new_cursor > self._scratch_size:
+            raise RuntimeError(
+                f"TLContext scratch overflow: requested {nbytes}B, "
+                f"used {new_cursor}/{self._scratch_size}B"
+            )
+        addr = self._scratch_base + self._scratch_cursor
+        self._scratch_cursor = new_cursor
+        return addr
 
     @property
     def commands(self) -> list[PeCommand]:
@@ -93,11 +123,30 @@ class TLContext:
 
     def _make_handle(
         self, addr: int, shape: tuple[int, ...], dtype: str,
+        space: str = "tcm",  # copied to the handle's ``space``; load() passes "hbm"
     ) -> TensorHandle:
         return TensorHandle(
             id=self._next_handle_id(),
             addr=addr, shape=shape, dtype=dtype,
             nbytes=self._nbytes(shape, dtype),
+            space=space,
+        )
+
+    def _make_compute_out(
+        self, shape: tuple[int, ...], dtype: str,
+    ) -> TensorHandle:
+        """Build the output handle for a compute op, backed by scratch.
+
+        The address is taken from ``_scratch_alloc`` and the space is fixed
+        to "tcm", so the result can be named by a later send/store. No
+        ``data`` is attached here.
+        """
+        size = self._nbytes(shape, dtype)
+        scratch_addr = self._scratch_alloc(size)
+        return TensorHandle(
+            id=self._next_handle_id(), addr=scratch_addr,
+            shape=shape, dtype=dtype,
+            nbytes=size, space="tcm",
         )
 
     # ── Reference (no DMA, metadata only) ────────────────────────
@@ -124,20 +173,26 @@ class TLContext:
     def load(
         self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
     ) -> TensorHandle:
-        """Load tensor from HBM to TCM. Returns TensorHandle.
+        """Load tensor from HBM. Returns TensorHandle pointing at HBM[ptr].
 
         In greenlet mode: returns TensorHandle with actual numpy data.
         In command-list mode: returns TensorHandle with data=None.
+
+        The returned handle's ``space`` is "hbm" so subsequent ops (math,
+        send, store) using this handle as a source resolve via MemoryStore
+        at ``(hbm, ptr)`` — which is where the load's underlying data
+        actually lives in Phase 2 storage.
         """
         self._emit_dispatch_overhead()
-        handle = self._make_handle(addr=ptr, shape=shape, dtype=dtype)
+        handle = self._make_handle(addr=ptr, shape=shape, dtype=dtype, space="hbm")
         cmd = DmaReadCmd(handle=handle, src_addr=ptr, nbytes=handle.nbytes)
         data = self._emit(cmd)
         if data is not None:
-            # Greenlet mode: attach real data to handle
+            # Greenlet mode: attach real data to handle (preserve space)
             return TensorHandle(
                 id=handle.id, addr=handle.addr, shape=handle.shape,
                 dtype=handle.dtype, nbytes=handle.nbytes, data=data,
+                space=handle.space,
             )
         return handle
 
@@ -162,7 +217,7 @@ class TLContext:
             raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}")
         out_shape = (*a.shape[:-2], m, n)
         out_dtype = a.dtype
-        out = self._make_handle(addr=0, shape=out_shape, dtype=out_dtype)
+        out = self._make_compute_out(shape=out_shape, dtype=out_dtype)
         self._emit_dispatch_overhead()
         self._emit(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
         return out
@@ -170,7 +225,7 @@ class TLContext:
     # ── MATH Engine: unary (blocking) ─────────────────────────────
 
     def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
-        out = self._make_handle(addr=0, shape=x.shape, dtype=x.dtype)
+        out = self._make_compute_out(shape=x.shape, dtype=x.dtype)
         self._emit_dispatch_overhead()
         self._emit(MathCmd(op=op, inputs=(x,), out=out))
         return out
@@ -203,7 +258,7 @@ class TLContext:
     ) -> TensorHandle:
         out_shape = list(x.shape)
         out_shape[axis] = 1
-        out = self._make_handle(addr=0, shape=tuple(out_shape), dtype=x.dtype)
+        out = self._make_compute_out(shape=tuple(out_shape), dtype=x.dtype)
         self._emit_dispatch_overhead()
         self._emit(MathCmd(op=op, inputs=(x,), out=out, axis=axis))
         return out
@@ -222,7 +277,7 @@ class TLContext:
     def _binary_math(
         self, op: str, a: TensorHandle, b: TensorHandle,
     ) -> TensorHandle:
-        out = self._make_handle(addr=0, shape=a.shape, dtype=a.dtype)
+        out = self._make_compute_out(shape=a.shape, dtype=a.dtype)
         self._emit_dispatch_overhead()
         self._emit(MathCmd(op=op, inputs=(a, b), out=out))
         return out
@@ -230,15 +285,67 @@ class TLContext:
     def where(
         self, cond: TensorHandle, a: TensorHandle, b: TensorHandle,
     ) -> TensorHandle:
-        out = self._make_handle(addr=0, shape=a.shape, dtype=a.dtype)
+        out = self._make_compute_out(shape=a.shape, dtype=a.dtype)
         self._emit_dispatch_overhead()
         self._emit(MathCmd(op="where", inputs=(cond, a, b), out=out))
         return out
 
+    def maximum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
+        """Element-wise maximum of ``a`` and ``b`` (real Triton: tl.maximum)."""
+        return self._binary_math(op="maximum", a=a, b=b)
+
+    def minimum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
+        """Element-wise minimum of ``a`` and ``b`` (real Triton: tl.minimum)."""
+        return self._binary_math(op="minimum", a=a, b=b)
+
+    def fma(
+        self, a: TensorHandle, b: TensorHandle, c: TensorHandle,
+    ) -> TensorHandle:
+        """Emit one "fma" MathCmd for a * b + c (real Triton: tl.fma)."""
+        result = self._make_compute_out(shape=a.shape, dtype=a.dtype)
+        self._emit_dispatch_overhead()
+        self._emit(MathCmd(op="fma", inputs=(a, b, c), out=result))
+        return result
+
+    def clamp(
+        self,
+        x: TensorHandle,
+        min: TensorHandle,
+        max: TensorHandle,
+    ) -> TensorHandle:
+        """Emit one "clamp" MathCmd over (x, min, max) (real Triton: tl.clamp)."""
+        clamped = self._make_compute_out(shape=x.shape, dtype=x.dtype)
+        self._emit_dispatch_overhead()
+        self._emit(MathCmd(op="clamp", inputs=(x, min, max), out=clamped))
+        return clamped
+
+    def softmax(self, x: TensorHandle, axis: int = -1) -> TensorHandle:
+        """Softmax of ``x`` along ``axis`` (real Triton: tl.softmax).
+
+        Emitted as one MathCmd with op="softmax" rather than as separate
+        max/exp/sum/div commands; the output handle copies the shape and
+        dtype of ``x``.
+        """
+        result = self._make_compute_out(shape=x.shape, dtype=x.dtype)
+        self._emit_dispatch_overhead()
+        self._emit(MathCmd(op="softmax", inputs=(x,), out=result, axis=axis))
+        return result
+
+    # ── Scalar helpers (real Triton: tl.cdiv etc.) ────────────────
+
+    @staticmethod
+    def cdiv(a: int, b: int) -> int:
+        """Return ceil(a / b) on ints; scalar only, no MathCmd (tl.cdiv).
+
+        Arguments are coerced with ``int()``; ``b == 0`` raises ZeroDivisionError.
+        """
+        quotient, remainder = divmod(int(a), int(b))
+        return quotient + (1 if remainder else 0)
+
     # ── Index / Scalar (PE_CPU, no engine) ────────────────────────
 
     def program_id(self, axis: int = 0) -> int:
-        """Return program instance index.
+        """Return program instance index (ADR-0022).
 
         axis=0: local PE id within cube.
         axis=1: cube id.
@@ -248,7 +355,7 @@ class TLContext:
         return self._pe_id
 
     def num_programs(self, axis: int = 0) -> int:
-        """Return total number of program instances.
+        """Return total number of program instances (ADR-0022).
 
         axis=0: num PEs per cube.
         axis=1: num cubes.
@@ -284,6 +391,119 @@ class TLContext:
             dtype=x.dtype, nbytes=x.nbytes, data=x.data,
         )
 
+    # ── IPCQ (CCL) collective primitives (ADR-0023 D4) ────────────
+
+    def send(
+        self,
+        dir: str,
+        src: TensorHandle | None = None,
+        *,
+        src_addr: int | None = None,
+        nbytes: int | None = None,
+        shape: tuple[int, ...] | None = None,
+        dtype: str = "f16",
+        space: str = "tcm",
+    ) -> None:
+        """Emit an IpcqSendCmd towards the peer in direction ``dir``.
+
+        Either pass a TensorHandle as ``src`` (its addr/nbytes/shape/dtype,
+        and ``space`` when it has one, replace the keyword values), or give
+        ``src_addr``/``nbytes``/``shape`` directly with optional dtype/space.
+
+        Raises:
+            ValueError: src_addr, nbytes or shape is still None afterwards.
+        """
+        if src is not None:
+            src_addr, nbytes = src.addr, src.nbytes
+            shape, dtype = src.shape, src.dtype
+            space = getattr(src, "space", space)
+        if any(value is None for value in (src_addr, nbytes, shape)):
+            raise ValueError("tl.send: provide either a TensorHandle or src_addr/nbytes/shape")
+        self._emit_dispatch_overhead()
+        self._emit(IpcqSendCmd(
+            direction=dir,
+            src_addr=src_addr,
+            src_space=space,
+            nbytes=nbytes,
+            shape=shape,
+            dtype=dtype,
+            handle_id=self._next_handle_id(),
+        ))
+
+    def recv(
+        self,
+        dir: str | None = None,
+        shape: tuple[int, ...] = (),
+        dtype: str = "f16",
+        space: str = "tcm",
+        dst_addr: int | None = None,
+        dst_space: str | None = None,
+    ) -> TensorHandle:
+        """Receive a tensor from a peer by emitting one IpcqRecvCmd.
+
+        Args:
+            dir: direction to receive from (e.g. "W"), or None, which the
+                original documentation describes as round-robin.
+            shape: shape recorded on the command and the returned handle.
+            dtype: dtype recorded on the command and the returned handle.
+            space: currently not read by this method.
+            dst_addr: destination address for "copy_to_dst" mode.
+            dst_space: destination space for "copy_to_dst" mode. That mode
+                is selected only when *both* dst values are given;
+                otherwise the command keeps its default recv mode.
+
+        Returns:
+            If the emit produces a dict, a TensorHandle built from its
+            ``src_addr``/``src_space``/``data`` entries (defaults 0, "tcm",
+            None); otherwise a placeholder handle at address 0.
+        """
+        self._emit_dispatch_overhead()
+        # Extra command fields are only added for the copy-to-destination mode.
+        copy_args = {}
+        if dst_addr is not None and dst_space is not None:
+            copy_args = {
+                "recv_mode": "copy_to_dst",
+                "dst_addr": dst_addr,
+                "dst_space": dst_space,
+            }
+        cmd = IpcqRecvCmd(
+            direction=dir, shape=shape, dtype=dtype,
+            handle_id=self._next_handle_id(), **copy_args,
+        )
+        reply = self._emit(cmd)
+        if not isinstance(reply, dict):
+            # No reply dict: hand back a metadata-only placeholder.
+            return self._make_handle(addr=0, shape=shape, dtype=dtype)
+        slot_addr = int(reply.get("src_addr", 0))
+        slot_space = str(reply.get("src_space", "tcm"))
+        payload = reply.get("data")
+        return TensorHandle(
+            id=self._next_handle_id(),
+            addr=slot_addr, shape=shape, dtype=dtype,
+            nbytes=self._nbytes(shape, dtype),
+            data=payload,
+            space=slot_space,
+        )
+
+    def recv_async(
+        self,
+        dir: str,
+        shape: tuple[int, ...] = (),
+        dtype: str = "f16",
+    ) -> "RecvFuture":
+        """Create a non-blocking recv and return its future for ``tl.wait``."""
+        self._emit_dispatch_overhead()
+        pending = RecvFuture(cmd=IpcqRecvCmd(
+            direction=dir,
+            shape=shape,
+            dtype=dtype,
+            handle_id=self._next_handle_id(),
+            blocking=False,
+        ))
+        if self._runner is not None:  # without a runner nothing is handed over
+            self._runner.switch_to_simpy(("recv_async", pending))
+        return pending
+
     # ── Composite + Control ───────────────────────────────────────
 
     def composite(
@@ -316,9 +536,40 @@ class TLContext:
         ))
         return completion
 
-    def wait(self, handle: CompletionHandle | None = None) -> None:
-        """Wait for a specific composite or all pending composites."""
+    def wait(self, handle: "CompletionHandle | RecvFuture | None" = None) -> Any:
+        """Wait on a composite completion or on a ``recv_async`` future.
+
+        A ``RecvFuture`` requires runner mode and returns the received
+        ``TensorHandle``, which is cached on the future so a second wait
+        returns it again. Any other argument emits a ``WaitCmd`` -> None.
+        """
+        if isinstance(handle, RecvFuture):
+            if not handle.resolved:
+                if self._runner is None:
+                    raise RuntimeError(
+                        "tl.wait(RecvFuture) requires runner mode (greenlet)"
+                    )
+                # Hand control to the runner; it replies with the recv result dict.
+                reply = self._runner.switch_to_simpy(("recv_wait", handle))
+                slot_addr = int(reply.get("src_addr", 0))
+                slot_space = str(reply.get("src_space", "tcm"))
+                payload = reply.get("data")
+                recv_cmd = handle.cmd
+                handle.result = TensorHandle(
+                    id=self._next_handle_id(),
+                    addr=slot_addr,
+                    shape=recv_cmd.shape,
+                    dtype=recv_cmd.dtype,
+                    nbytes=self._nbytes(recv_cmd.shape, recv_cmd.dtype),
+                    data=payload,
+                    space=slot_space,
+                )
+                handle.resolved = True
+            return handle.result
+
+        # Not a recv future: composite wait, as before this change.
         self._emit(WaitCmd(handle=handle))
+        return None
 
     def cycles(self, n: int) -> None:
         """Declare PE_CPU scalar execution overhead (cycles)."""
diff --git a/tests/test_ccl_allreduce_matrix.py b/tests/test_ccl_allreduce_matrix.py
new file mode 100644
index 0000000..e19c4ba
--- /dev/null
+++ b/tests/test_ccl_allreduce_matrix.py
@@ -0,0 +1,142 @@
+"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench.
+
+Each parametrized case writes a tmp ``ccl.yaml`` overlay that selects a
+specific (algorithm, world_size, buffer_kind, n_elem) combination, then
+runs the bench via the CLI and asserts the printed line reports all
+ranks OK.
+
+This single test file replaces the per-variant bench tests
+(test_ccl_allreduce_e2e, test_ccl_mesh_allreduce, test_ccl_tree_allreduce,
+test_ccl_multicube, test_ccl_multisip).
+"""
+from __future__ import annotations
+
+import os
+import textwrap
+
+import pytest
+
+import kernbench.cli.main as cli_main
+
+
+CCL_YAML_TEMPLATE = textwrap.dedent("""\
+    defaults:
+      algorithm: {algorithm}
+      buffer_kind: {buffer_kind}
+      backpressure: sleep
+      n_slots: 4
+      slot_size: 4096
+      vc_chunk_size: 256
+      ipcq_credit_size_bytes: 16
+
+    algorithms:
+      {algorithm}:
+        module: {module}
+        topology: {topology}
+        buffer_kind: {buffer_kind}
+{world_size_line}{n_elem_line}
+""")
+
+
+def _write_ccl_yaml(
+    tmp_path,
+    *,
+    algorithm: str,
+    module: str,
+    topology: str,
+    buffer_kind: str = "tcm",
+    world_size: int | None = None,
+    n_elem: int | None = None,
+) -> str:
+    """Render CCL_YAML_TEMPLATE into ``tmp_path/ccl.yaml``; return the dir."""
+    # Optional keys become whole 8-space-indented lines, or vanish when None.
+    optional_lines = {
+        "world_size_line":
+            "" if world_size is None else f"        world_size: {world_size}\n",
+        "n_elem_line":
+            "" if n_elem is None else f"        n_elem: {n_elem}\n",
+    }
+    rendered = CCL_YAML_TEMPLATE.format(
+        algorithm=algorithm, module=module, topology=topology,
+        buffer_kind=buffer_kind, **optional_lines,
+    )
+    (tmp_path / "ccl.yaml").write_text(rendered)
+    return str(tmp_path)
+
+
+CASES = [
+    # algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws
+    pytest.param(
+        "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce",
+        "ring_1d", "tcm", None, 8, 256,
+        id="ring_full_system_tcm",
+    ),
+    pytest.param(
+        "ring_allreduce_hbm", "kernbench.ccl.algorithms.ring_allreduce",
+        "ring_1d", "hbm", None, 8, 256,
+        id="ring_full_system_hbm",
+    ),
+    pytest.param(
+        "ring_allreduce_sram", "kernbench.ccl.algorithms.ring_allreduce",
+        "ring_1d", "sram", None, 8, 256,
+        id="ring_full_system_sram",
+    ),
+    pytest.param(
+        "ring_allreduce_8", "kernbench.ccl.algorithms.ring_allreduce",
+        "ring_1d", "tcm", 8, 32, 8,
+        id="ring_single_cube",
+    ),
+    pytest.param(
+        "ring_allreduce_16", "kernbench.ccl.algorithms.ring_allreduce",
+        "ring_1d", "tcm", 16, 16, 16,
+        id="ring_multi_cube",
+    ),
+    pytest.param(
+        "mesh_allreduce_4", "kernbench.ccl.algorithms.mesh_allreduce",
+        "mesh_2d", "tcm", 4, 16, 4,
+        id="mesh_2x2",
+    ),
+    pytest.param(
+        "tree_allreduce_7", "kernbench.ccl.algorithms.tree_allreduce",
+        "tree_binary", "tcm", 7, 16, 7,
+        id="tree_binary_7",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "algorithm,module,topology,buffer_kind,world_size,n_elem,expected_ws",
+    CASES,
+)
+def test_ccl_allreduce_matrix(
+    tmp_path, capsys, monkeypatch,
+    algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws,
+):
+    """Run the unified bench under a per-case ccl.yaml; stdout must hold the
+    "<algorithm> (ws=N): N OK" summary for the expected N and no FAIL."""
+    # Computed before the chdir below so a relative __file__ still resolves.
+    repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+    topology_file = os.path.join(repo_root, "topology.yaml")
+    overlay = {
+        "algorithm": algorithm, "module": module,
+        "topology": topology, "buffer_kind": buffer_kind,
+        "world_size": world_size, "n_elem": n_elem,
+    }
+    # The CLI runs from the directory that holds the generated ccl.yaml.
+    monkeypatch.chdir(_write_ccl_yaml(tmp_path, **overlay))
+
+    argv = [
+        "run",
+        "--topology", topology_file,
+        "--bench", "ccl_allreduce",
+        "--verify-data",
+    ]
+    exit_code = cli_main.main(argv)
+    assert exit_code == 0
+    out = capsys.readouterr().out
+    assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}"
+    summary = f"{algorithm} (ws={expected_ws}): {expected_ws} OK"
+    assert summary in out, (
+        f"expected '{summary}' "
+        f"in output:\n{out}"
+    )
diff --git a/tests/test_ccl_deadlock_detection.py b/tests/test_ccl_deadlock_detection.py
new file mode 100644
index 0000000..9dbb133
--- /dev/null
+++ b/tests/test_ccl_deadlock_detection.py
@@ -0,0 +1,125 @@
+"""Tests for IPCQ deadlock detection (ADR-0023 D14 F3)."""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+import simpy
+
+from kernbench.ccl import diagnostics
+from kernbench.common.ipcq_types import (
+    IpcqEndpoint,
+    IpcqInitEntry,
+    IpcqRecvCmd,
+    IpcqRequest,
+)
+from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
+from kernbench.runtime_api.kernel import IpcqInitMsg
+from kernbench.topology.types import Node
+
+
+@dataclass
+class _FakeTxn:  # test stand-in for the object put on the component's "host" port
+    request: Any  # payload; _make_isolated_pe_ipcq passes an IpcqInitMsg
+    done: simpy.Event  # event the helper runs the env until
+    result_data: dict[str, Any] = field(default_factory=dict)
+
+
+def _make_isolated_pe_ipcq(env):
+    """Start a lone PeIpcqComponent and feed it one IpcqInitMsg ("W" entry)."""
+    ipcq = PeIpcqComponent(Node(
+        id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq",
+        impl="builtin.pe_ipcq", attrs={}, pos_mm=None,
+    ), ctx=None)
+    ipcq.in_ports["host"] = simpy.Store(env)
+    ipcq.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env)
+    ipcq.start(env)
+
+    west_entry = IpcqInitEntry(
+        direction="W",
+        peer=IpcqEndpoint(
+            sip=0, cube=0, pe=1, buffer_kind="tcm",
+            rx_base_pa=0x10_000, rx_base_va=0,
+            n_slots=4, slot_size=4096,
+        ),
+        my_rx_base_pa=0x40_000, my_rx_base_va=0,
+        n_slots=4, slot_size=4096,
+        peer_credit_store=simpy.Store(env),
+    )
+    init_done = env.event()
+    ipcq.in_ports["host"].put(_FakeTxn(
+        request=IpcqInitMsg(
+            correlation_id="t", request_id="t",
+            target_sips=(0,), target_cubes=(0,), target_pe=0,
+            entries=(west_entry,),
+            backpressure_mode="sleep", buffer_kind="tcm", credit_size_bytes=16,
+        ),
+        done=init_done,
+    ))
+    env.run(until=init_done)
+    return ipcq
+
+
+def test_pointer_dump_includes_blocked_state():
+    """The pointer dump still lists the queue while a recv is blocked."""
+    env = simpy.Environment()
+    ipcq = _make_isolated_pe_ipcq(env)
+
+    # No data has arrived, so this recv is still pending at t=10.
+    blocked = IpcqRequest(
+        command=IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r1"),
+        done=env.event(),
+    )
+    ipcq.in_ports["host"].put(blocked)
+    env.run(until=10)
+    assert not blocked.done.triggered
+    # pointer_dump is handed a stub that exposes nothing but ``_components``.
+    class FakeEngine:
+        _components = {"sip0.cube0.pe0.pe_ipcq": ipcq}
+
+    dump = diagnostics.pointer_dump(FakeEngine())
+    for fragment in ("my_tail=0", "peer_head_cache=0"):
+        assert fragment in dump
+
+
+def test_deadlock_detection_recv_without_send():
+    """Every PE only posts a recv from "W", so no data is ever delivered.
+
+    The bench run is expected to raise ``IpcqDeadlock``.
+    """
+    from kernbench.ccl.diagnostics import IpcqDeadlock
+    from kernbench.policy.placement.dp import DPPolicy
+    from kernbench.runtime_api.bench_runner import run_bench
+    from kernbench.runtime_api.types import resolve_device
+    from kernbench.sim_engine.engine import GraphEngine
+    from kernbench.topology.builder import resolve_topology
+
+    n_elem = world_size = 8
+
+    def deadlock_kernel(t_ptr, n_elem, tl):
+        tl.recv(dir="W", shape=(n_elem,), dtype="f16")  # no matching send exists
+
+    def run(torch):
+        torch.install_ipcq(
+            algorithm="ring_allreduce_tcm", world_size_override=world_size,
+        )
+        placement = DPPolicy(
+            sip="replicate", cube="replicate", pe="column_wise",
+            num_sips=1, num_cubes=1,
+        )
+        a = torch.zeros(
+            (1, world_size * n_elem), dtype="f16", dp=placement, name="dl_in",
+        )
+        torch.launch("dl", deadlock_kernel, a, n_elem)
+
+    def make_engine(t, d):
+        return GraphEngine(getattr(t, "topology_obj", t), enable_data=True)
+
+    topo = resolve_topology("topology.yaml")
+    with pytest.raises(IpcqDeadlock):
+        run_bench(
+            topology=topo, bench_fn=run,
+            device=resolve_device("all"),
+            engine_factory=make_engine,
+        )
diff --git a/tests/test_ccl_diagnostics.py b/tests/test_ccl_diagnostics.py
new file mode 100644
index 0000000..5d949fb
--- /dev/null
+++ b/tests/test_ccl_diagnostics.py
@@ -0,0 +1,70 @@
+"""Tests for CCL diagnostics: trace + pointer dump (ADR-0023 D14)."""
+from __future__ import annotations
+
+import os
+
+from kernbench.ccl import diagnostics
+
+
+# ── trace toggle ─────────────────────────────────────────────────────
+
+
+def test_trace_disabled_by_default(monkeypatch):
+    monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)  # fine if already unset
+    diagnostics.reload_trace_setting()  # presumably re-reads the env var — confirm
+    assert diagnostics.trace_enabled() is False
+
+
+def test_trace_enabled_via_env(monkeypatch):
+    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
+    diagnostics.reload_trace_setting()  # NOTE(review): not reloaded on teardown; may leak "on"
+    assert diagnostics.trace_enabled() is True
+
+
+def test_trace_record_send(monkeypatch, capsys):
+    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
+    diagnostics.reload_trace_setting()
+    try:
+        diagnostics.log_send(t_ns=100.0, sender="sip0.cube0.pe0",
+                             direction="E", nbytes=64, sender_seq=0)
+        out = capsys.readouterr().out
+        assert all(s in out for s in ("send", "sip0.cube0.pe0", "dir=E")), out
+    finally:  # undo the enabled setting even when an assertion fails
+        monkeypatch.delenv("KERNBENCH_CCL_TRACE")
+        diagnostics.reload_trace_setting()
+
+
+def test_trace_record_recv(monkeypatch, capsys):
+    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
+    diagnostics.reload_trace_setting()
+    try:
+        diagnostics.log_recv(t_ns=200.0, receiver="sip0.cube0.pe1", direction="W", nbytes=64)
+        out = capsys.readouterr().out
+        assert all(s in out for s in ("recv", "sip0.cube0.pe1")), out
+    finally:  # undo the enabled setting even when an assertion fails
+        monkeypatch.delenv("KERNBENCH_CCL_TRACE")
+        diagnostics.reload_trace_setting()
+
+
+# ── pointer dump ────────────────────────────────────────────────────
+
+
+def test_pointer_dump_format():
+    """After install_ipcq, the dump names pe0 and shows E/W pointer fields."""
+    from kernbench.sim_engine.engine import GraphEngine
+    from kernbench.topology.builder import resolve_topology
+    from kernbench.ccl.install import (
+        install_ipcq, load_ccl_config, resolve_algorithm_config,
+    )
+
+    topo = resolve_topology("topology.yaml").topology_obj
+    engine = GraphEngine(topo, enable_data=True)
+    ring_cfg = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
+    install_ipcq(engine, topo.spec, ring_cfg)
+
+    dump = diagnostics.pointer_dump(engine)
+    expected_fragments = (
+        "sip0.cube0.pe0", "E:", "W:", "my_head=", "peer_tail_cache=",
+    )
+    for fragment in expected_fragments:
+        assert fragment in dump
diff --git a/tests/test_ccl_framework.py b/tests/test_ccl_framework.py
new file mode 100644
index 0000000..93dfa9c
--- /dev/null
+++ b/tests/test_ccl_framework.py
@@ -0,0 +1,62 @@
+"""Tests for the torch.distributed-compat facade (ADR-0023 D11).
+
+These tests verify the public API surface of ``DistributedContext`` +
+``AhbmCCLBackend``. End-to-end correctness of the allreduce itself is
+covered by tests/test_ccl_allreduce_matrix.py.
+"""
+from __future__ import annotations
+
+from kernbench.runtime_api.distributed import AhbmCCLBackend, DistributedContext
+
+
+def test_init_process_group_requires_ctx_ref():
+    """Using DistributedContext without RuntimeContext binding should fail."""
+    dist = DistributedContext()
+    # Not bound to a RuntimeContext → init should raise.
+    try:
+        dist.init_process_group(backend="ahbm")
+    except RuntimeError:
+        return
+    raise AssertionError("expected RuntimeError")  # not stripped by ``python -O``
+
+
+def test_init_process_group_rejects_unknown_backend():
+    """Unknown backend raises ValueError (matches pytorch behavior)."""
+    dist = DistributedContext()
+    dist._ctx_ref = object()  # dummy; won't be reached before the check
+    try:
+        dist.init_process_group(backend="nccl")
+    except ValueError:
+        return
+    raise AssertionError("expected ValueError")  # not stripped by ``python -O``
+
+
+def test_distributed_pytorch_compat_surface():
+    """Public attributes of DistributedContext are limited to an allow-list."""
+    allowed = frozenset((
+        "init_process_group",
+        "is_initialized",
+        "get_world_size",
+        "get_rank",
+        "get_backend",
+        "all_reduce",
+        "barrier",
+    ))
+
+    # Names with a leading underscore count as private and are not checked.
+    public = [n for n in dir(DistributedContext()) if not n.startswith("_")]
+    # One assert per attribute so the failure message names the offender.
+    for attr in public:
+        assert attr in allowed, (
+            f"DistributedContext exposes non-pytorch API: {attr!r}"
+        )
+
+
+def test_backend_class_surface():
+    """AhbmCCLBackend publicly exposes at least all_reduce, barrier, world_size."""
+    required = ("all_reduce", "barrier", "world_size")
+    public = {m for m in dir(AhbmCCLBackend) if not m.startswith("_")}
+    # Only a lower bound is checked here: additional public names on the
+    # class are not rejected by this test.
+    for name in required:
+        assert name in public
diff --git a/tests/test_ccl_hello_world_guide.py b/tests/test_ccl_hello_world_guide.py
new file mode 100644
index 0000000..68bc017
--- /dev/null
+++ b/tests/test_ccl_hello_world_guide.py
@@ -0,0 +1,81 @@
+"""Validate the hello-world example from docs/ccl-author-guide.md.
+
+This is the simplest possible CCL kernel — each PE sends its tile E
+and receives a tile from W. After running, each rank's slice should
+contain the data of the previous rank.
+"""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.ccl.algorithms import hello_send
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def test_hello_send_4_ranks_mock():
+    """Mock 4-rank ring: each rank ends up with the previous rank's tile."""
+    world_size, n_elem = 4, 8
+    # Rank r starts with a tile filled with the value r + 1.
+    inputs = [
+        np.full((n_elem,), float(rank + 1), dtype=np.float16)
+        for rank in range(world_size)
+    ]
+    outputs = run_kernel_in_mock(
+        kernel_fn=hello_send.kernel, world_size=world_size, topology="ring_1d",
+        inputs=inputs, kernel_args=(n_elem,),
+    )
+    for rank in range(world_size):
+        expected = inputs[(rank - 1) % world_size]
+        got = outputs[rank]
+        assert np.array_equal(got, expected), f"rank {rank}: got {got}"
+
+
+def test_hello_send_via_simpy_runner():
+    """Same exchange as the mock test, driven through run_bench + GraphEngine."""
+    from kernbench.policy.placement.dp import DPPolicy
+    from kernbench.runtime_api.bench_runner import run_bench
+    from kernbench.runtime_api.types import resolve_device
+    from kernbench.sim_engine.engine import GraphEngine
+    from kernbench.topology.builder import resolve_topology
+
+    topo = resolve_topology("topology.yaml")
+    n_elem = 8
+    world_size = 8
+
+    def run(torch):
+        # World size for this hello test is 8 (one cube). ccl.yaml no
+        # longer carries a default world_size — pass it explicitly.
+        torch.install_ipcq(
+            algorithm="ring_allreduce_tcm", world_size_override=world_size,
+        )
+        a = torch.zeros(
+            (1, world_size * n_elem), dtype="f16",
+            dp=DPPolicy(
+                sip="replicate", cube="replicate", pe="column_wise",
+                num_sips=1, num_cubes=1,
+            ),
+            name="hello_in",
+        )
+        store = torch.engine.memory_store
+        base = a._handle.va_base or a._handle.shards[0].pa
+        nbytes = n_elem * 2  # one f16 tile per rank, 2 bytes per element
+        for r in range(world_size):
+            store.write("hbm", base + r * nbytes,
+                        np.full((n_elem,), float(r + 1), dtype=np.float16))
+
+        torch.launch("hello_send", hello_send.kernel, a, n_elem)
+
+        # Each rank should hold the previous rank's data after the round
+        for r in range(world_size):
+            arr = store.read("hbm", base + r * nbytes, shape=(n_elem,), dtype="f16")
+            prev_value = float(((r - 1) % world_size) + 1)
+            assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}"
+
+    result = run_bench(
+        topology=topo, bench_fn=run,
+        device=resolve_device("all"),
+        engine_factory=lambda t, d: GraphEngine(
+            getattr(t, "topology_obj", t), enable_data=True
+        ),
+    )
+    assert result.completion.ok
diff --git a/tests/test_ccl_helpers.py b/tests/test_ccl_helpers.py
new file mode 100644
index 0000000..a15cb9c
--- /dev/null
+++ b/tests/test_ccl_helpers.py
@@ -0,0 +1,68 @@
+"""Tests for CCL algorithm-author helpers (ADR-0023 D15)."""
+from __future__ import annotations
+
+import pytest
+
+from kernbench.ccl.helpers import (
+    Chunk,
+    chunked,
+    ring_step,
+    tree_step,
+)
+
+
+# ── chunked ──────────────────────────────────────────────────────────
+
+
+def test_chunked_basic():
+    """Splitting 64 f16 elements four ways yields contiguous 32-byte chunks."""
+    chunks = chunked(base_addr=0x1000, n_chunks=4, n_elem=64, dtype="f16")
+    assert len(chunks) == 4
+    # 64 / 4 = 16 elements per chunk, 32 bytes each, so the stride is 0x20.
+    expected_addrs = (0x1000, 0x1020, 0x1040, 0x1060)
+    for index, addr in enumerate(expected_addrs):
+        assert chunks[index] == Chunk(addr=addr, n_elem=16, nbytes=32)
+
+
+def test_chunked_f32():  # f32 chunks: byte size and stride use 4 bytes per element
+    chunks = chunked(base_addr=0x100, n_chunks=2, n_elem=8, dtype="f32")
+    assert chunks[0].nbytes == 16  # 4 elem × 4 bytes
+    assert chunks[1].addr == 0x100 + 16  # second chunk starts 16 bytes after the base
+
+
+def test_chunked_uneven_raises():  # an uneven split is expected to be rejected
+    with pytest.raises(ValueError):
+        chunked(base_addr=0x100, n_chunks=3, n_elem=10, dtype="f16")  # 10 % 3 != 0
+
+
+# ── ring_step ────────────────────────────────────────────────────────
+
+
+def test_ring_step_4_ranks():
+    """Reduce-scatter ring schedule: ``(send, recv)`` chunk indices per step."""
+    # At step s, rank r sends chunk (r - s) and receives chunk (r - s - 1),
+    # both taken modulo world_size.  Keys below are (rank, step).
+    cases = {(0, 0): (0, 3), (0, 1): (3, 2), (1, 0): (1, 0), (2, 0): (2, 1)}
+    for (rank, step), expected in cases.items():
+        assert ring_step(rank=rank, step=step, world_size=4) == expected
+
+
+# ── tree_step ────────────────────────────────────────────────────────
+
+
+def test_tree_step_root():
+    """Rank 0 of a 7-rank tree has no parent and children 1 and 2."""
+    root = tree_step(rank=0, world_size=7)
+    assert root["parent"] is None and root["children"] == [1, 2]
+
+
+def test_tree_step_internal():
+    """Rank 1 of a 7-rank tree has parent 0 and children 3 and 4."""
+    node = tree_step(rank=1, world_size=7)
+    assert node["parent"] == 0 and node["children"] == [3, 4]
+
+
+def test_tree_step_leaf():
+    """Rank 4 of a 7-rank tree has parent 1 and no children."""
+    leaf = tree_step(rank=4, world_size=7)
+    assert leaf["parent"] == 1 and leaf["children"] == []
diff --git a/tests/test_ccl_install.py b/tests/test_ccl_install.py
new file mode 100644
index 0000000..738611e
--- /dev/null
+++ b/tests/test_ccl_install.py
@@ -0,0 +1,100 @@
+"""Tests for CCL backend install (ADR-0023 D10/D11)."""
+from __future__ import annotations
+
+from kernbench.ccl.install import (
+    install_ipcq,
+    linear_rank_to_pe,
+    load_ccl_config,
+    resolve_algorithm_config,
+)
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import resolve_topology
+
+
+def _engine():
+    topology = resolve_topology("topology.yaml").topology_obj
+    return (GraphEngine(topology, enable_data=True), topology)  # (engine, topology)
+
+
+def test_load_ccl_config():
+    config = load_ccl_config()
+    for section in ("defaults", "algorithms"):  # both top-level keys must be present
+        assert section in config
+
+
+def test_resolve_algorithm_config_default():
+    cfg = load_ccl_config()
+    merged = resolve_algorithm_config(cfg)
+    assert merged["algorithm"] == cfg["defaults"]["algorithm"]
+    # ccl.yaml no longer carries defaults.world_size — backend derives
+    # it from topology.yaml at install time. The field is therefore
+    # optional here: it may be absent, but if present it must be >= 1.
+    assert "world_size" not in merged or merged["world_size"] >= 1
+
+
+def test_resolve_algorithm_config_override():
+    """Per-algorithm keys win; unspecified keys fall back to ``defaults``."""
+    config = load_ccl_config()
+    resolved = resolve_algorithm_config(config, name="ring_allreduce_hbm")
+    assert resolved["algorithm"] == "ring_allreduce_hbm"
+    assert resolved["buffer_kind"] == "hbm"  # set by the algorithm entry
+    assert resolved["n_slots"] == config["defaults"]["n_slots"]  # inherited
+
+
+def test_linear_rank_to_pe():
+    """Ranks 0 and 7 map to PEs 0 and 7 of cube 0 in SIP 0."""
+    _, topo = _engine()
+    spec = topo.spec
+    # Both ranks fall in cube 0 of SIP 0.
+    assert linear_rank_to_pe(0, spec) == (0, 0, 0)
+    assert linear_rank_to_pe(7, spec) == (0, 0, 7)
+    # Sanity check on the topology itself: it must provide at least the
+    # eight PEs that the two lookups above rely on.
+    mesh = spec["sip"]["cube_mesh"]
+    layout = spec["cube"]["pe_layout"]
+    cubes_per_sip = mesh["w"] * mesh["h"]
+    pes_per_cube = layout["pe_per_corner"] * len(layout["corners"])
+    total = spec["system"]["sips"]["count"] * cubes_per_sip * pes_per_cube
+    assert total >= 8
+
+
+def test_install_ipcq_neighbors_correct():
+    """An 8-rank ring install wires rank 0's E/W queue pairs to ranks 1 and 7."""
+    engine, topo = _engine()
+    merged = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
+    # Force a single-cube 8-rank install for the assertions below.
+    merged["world_size"] = 8
+    plan = install_ipcq(engine, topo.spec, merged)
+
+    assert plan["world_size"] == 8
+    assert plan["buffer_kind"] == "tcm"
+
+    # Every rank's neighbor entry must name both ring directions.
+    for neighbors in plan["neighbor_table"].values():
+        assert "E" in neighbors
+        assert "W" in neighbors
+
+    # Inspect the queue pairs installed on rank 0's PE_IPCQ component.
+    rank0_ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
+    east = rank0_ipcq.queue_pairs["E"]
+    west = rank0_ipcq.queue_pairs["W"]
+    assert east["peer"].pe == 1   # rank 0's E neighbor is rank 1
+    assert west["peer"].pe == 7   # rank 0's W neighbor is rank 7
+    # The E and W receive bases must differ.
+    assert east["my_rx_base_pa"] != west["my_rx_base_pa"]
+
+
+def test_install_ipcq_credit_stores_wired():
+    """Rank 0's E-side credit store is rank 1's credit inbox (same object)."""
+    engine, topo = _engine()
+    merged = resolve_algorithm_config(load_ccl_config(), name="ring_allreduce_tcm")
+    merged["world_size"] = 8
+    install_ipcq(engine, topo.spec, merged)
+
+    # rank 0 (pe0) sends E to rank 1 (pe1), so rank 0's peer_credit_store
+    # for E must be the very same object as rank 1's credit_inbox.
+    components = engine._components
+    sender = components["sip0.cube0.pe0.pe_ipcq"]
+    receiver = components["sip0.cube0.pe1.pe_ipcq"]
+
+    assert sender.queue_pairs["E"]["peer_credit_store"] is receiver.credit_inbox
diff --git a/tests/test_ccl_mock_runtime.py b/tests/test_ccl_mock_runtime.py
new file mode 100644
index 0000000..b39f7a2
--- /dev/null
+++ b/tests/test_ccl_mock_runtime.py
@@ -0,0 +1,83 @@
+"""Tests for the mock CCL runtime (ADR-0023 D15)."""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.ccl.algorithms import ring_allreduce
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def test_ring_allreduce_4_ranks():
+    """Run the ring all-reduce kernel under the mock runtime, no SimPy."""
+    world_size, n_elem = 4, 8
+    # Rank r contributes a tile filled with r + 1, so every reduced
+    # element equals 1 + 2 + 3 + 4 = 10.
+    inputs = [
+        np.full((n_elem,), float(rank + 1), dtype=np.float16)
+        for rank in range(world_size)
+    ]
+    expected = sum(inputs)
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=ring_allreduce.kernel, world_size=world_size,
+        topology="ring_1d", inputs=inputs,
+        kernel_args=(n_elem, 4),
+    )
+
+    assert len(outputs) == 4
+    for rank in range(world_size):
+        assert np.allclose(outputs[rank], expected)
+
+
+def test_ring_allreduce_8_ranks():
+    """Same ring all-reduce check with 8 ranks and 16-element tiles."""
+    world_size, n_elem = 8, 16
+    inputs = [
+        np.full((n_elem,), float(rank + 1), dtype=np.float16)
+        for rank in range(world_size)
+    ]
+    expected = sum(inputs)  # 1 + 2 + ... + 8 = 36 in every element
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=ring_allreduce.kernel, world_size=world_size,
+        topology="ring_1d", inputs=inputs,
+        kernel_args=(n_elem, 8),
+    )
+
+    for rank in range(world_size):
+        assert np.allclose(outputs[rank], expected)
+
+
+def test_ring_allreduce_random_data():
+    """Ring all-reduce on seeded random f16 tiles, compared with 1e-2 tolerances."""
+    world_size, n_elem = 4, 32
+    rng = np.random.default_rng(42)  # fixed seed keeps the inputs reproducible
+    inputs = [
+        rng.standard_normal(n_elem).astype(np.float16) for _ in range(world_size)
+    ]
+    expected = sum(inputs)
+    outputs = run_kernel_in_mock(
+        kernel_fn=ring_allreduce.kernel, world_size=world_size,
+        topology="ring_1d", inputs=inputs,
+        kernel_args=(n_elem, 4),
+    )
+    for rank in range(world_size):
+        assert np.allclose(outputs[rank], expected, rtol=1e-2, atol=1e-2)
+
+
+def test_mock_runtime_invalid_direction_raises():
+    """A kernel that uses an unsupported direction should raise."""
+    import pytest
+
+    def bad_kernel(t_ptr, n_elem, tl):  # sends "N" although ring_1d is used below
+        tl.send(dir="N", src_addr=0, nbytes=2, shape=(1,), dtype="f16", space="hbm")
+
+    inputs = [np.array([1.0], dtype=np.float16) for _ in range(2)]
+    with pytest.raises(Exception):  # NOTE(review): broad — any error passes; narrow it
+        run_kernel_in_mock(
+            kernel_fn=bad_kernel,
+            world_size=2,
+            topology="ring_1d",
+            inputs=inputs,
+            kernel_args=(1,),
+        )
diff --git a/tests/test_ccl_performance.py b/tests/test_ccl_performance.py
new file mode 100644
index 0000000..05ba8a4
--- /dev/null
+++ b/tests/test_ccl_performance.py
@@ -0,0 +1,134 @@
+"""CCL performance validation tests (ADR-0023 D13 T5).
+
+Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench
+under different ``ccl.yaml`` algorithm choices:
+
+  - All buffer kinds finish in non-zero simulated time.
+  - Latency stays under a loose 100 ms sanity bound for small tiles.
+
+These are sanity checks on the model itself, not on absolute numbers.
+"""
+from __future__ import annotations
+
+import importlib
+import os
+from contextlib import contextmanager
+
+import pytest
+
+from kernbench.runtime_api.bench_runner import run_bench
+from kernbench.runtime_api.types import resolve_device
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import resolve_topology
+
+
+def _engine_factory(topology, device):  # ``device`` is accepted but not used here
+    return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True)
+
+
+@contextmanager
+def _ccl_yaml_override(algorithm: str, world_size: int | None = None):
+    """Yield a temporary ``ccl.yaml`` while the CWD points at its directory.
+
+    A fresh temporary directory receives a ``ccl.yaml`` whose
+    ``defaults.algorithm`` is ``algorithm``; the process working directory
+    is switched to that directory for the duration of the ``with`` block
+    and always restored afterwards, before the directory is removed.
+
+    The file declares the three ring all-reduce variants (``tcm`` /
+    ``hbm`` / ``sram``), each exactly once.  A given ``world_size`` is
+    attached to the selected algorithm's entry only; a name outside the
+    three variants then gets its own entry backed by the ring all-reduce
+    module with a ``tcm`` buffer.
+
+    Args:
+        algorithm: Name written to ``defaults.algorithm``.
+        world_size: Optional ``world_size`` for that algorithm's entry.
+
+    Yields:
+        Path of the generated ``ccl.yaml``.
+    """
+    import tempfile
+    ring_module = "kernbench.ccl.algorithms.ring_allreduce"
+    # Entry name -> buffer_kind; insertion order is the order in the file.
+    buffer_kinds = {
+        "ring_allreduce_tcm": "tcm",
+        "ring_allreduce_hbm": "hbm",
+        "ring_allreduce_sram": "sram",
+    }
+    if world_size is not None:
+        # Names outside the ring variants keep their tcm fallback entry.
+        buffer_kinds.setdefault(algorithm, "tcm")
+
+    lines = [
+        "defaults:",
+        f"  algorithm: {algorithm}",
+        "  buffer_kind: tcm",
+        "  backpressure: sleep",
+        "  n_slots: 4",
+        "  slot_size: 4096",
+        "  vc_chunk_size: 256",
+        "  ipcq_credit_size_bytes: 16",
+        "",
+        "algorithms:",
+    ]
+    for name, kind in buffer_kinds.items():
+        lines += [
+            f"  {name}:",
+            f"    module: {ring_module}",
+            "    topology: ring_1d",
+            f"    buffer_kind: {kind}",
+        ]
+        if world_size is not None and name == algorithm:
+            lines.append(f"    world_size: {world_size}")
+
+    with tempfile.TemporaryDirectory() as tmp:
+        path = os.path.join(tmp, "ccl.yaml")
+        with open(path, "w", encoding="utf-8") as f:
+            f.write("\n".join(lines) + "\n")
+        old_cwd = os.getcwd()
+        os.chdir(tmp)
+        try:
+            yield path
+        finally:
+            os.chdir(old_cwd)
+
+
+def _run_unified(algorithm: str, world_size: int | None = None) -> float:
+    """Run the unified ccl_allreduce bench under a ccl.yaml override and
+    return the ``total_ns`` of the last kernel-phase trace."""
+    # topology.yaml is addressed relative to this file because the override
+    # changes the working directory.
+    topo_path = os.path.join(os.path.dirname(__file__), "..", "topology.yaml")
+    with _ccl_yaml_override(algorithm, world_size):
+        topo = resolve_topology(topo_path)
+        bench_mod = importlib.import_module("benches.ccl_allreduce")
+        result = run_bench(
+            topology=topo, bench_fn=bench_mod.run,
+            device=resolve_device("all"),
+            engine_factory=_engine_factory,
+        )
+    assert result.completion.ok, f"{algorithm} did not complete"
+    kernel_traces = [
+        tr for tr in (result.traces or []) if tr.get("phase") == "kernel"
+    ]
+    assert kernel_traces, f"{algorithm} produced no kernel trace"
+    return float(kernel_traces[-1].get("total_ns", 0.0))
+
+
+@pytest.mark.parametrize(
+    "algorithm",
+    ["ring_allreduce_tcm", "ring_allreduce_hbm", "ring_allreduce_sram"],
+)
+def test_ccl_latency_positive(algorithm):
+    """Each ring all-reduce buffer kind (tcm, hbm, sram) must report a
+    strictly positive simulated kernel latency."""
+    total_ns = _run_unified(algorithm)
+    assert total_ns > 0
+
+
+def test_ccl_latency_under_reasonable_bound():
+    """Sanity bound: ring all-reduce (tile=32 f16) must finish in under
+    100 ms simulated. Small tiles are overhead-dominated, so keep it loose."""
+    ns = _run_unified("ring_allreduce_tcm")
+    assert ns < 100_000_000  # < 100 ms simulated — very loose bound
diff --git a/tests/test_ccl_round_robin_recv.py b/tests/test_ccl_round_robin_recv.py
new file mode 100644
index 0000000..8b98dfa
--- /dev/null
+++ b/tests/test_ccl_round_robin_recv.py
@@ -0,0 +1,48 @@
+"""Test that tl.recv() (no direction) works under the mock runtime
+and the SimPy PE_IPCQ component (ADR-0023 D4 weak fairness)."""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def kernel_round_robin(t_ptr, n_elem, tl):
+    """Accumulating kernel whose receives all use direction-less ``tl.recv``.
+
+    For ``world_size - 1`` steps each PE sends its current tile E, receives
+    one tile via round-robin, adds it to its running sum and forwards it.
+    Uses TensorHandle math (PE_MATH) so Phase 2 produces correct HBM
+    contents under SimPy + op_log replay."""
+    rank = tl.program_id(axis=0)
+    n_ranks = tl.num_programs(axis=0)
+    tile_addr = t_ptr + rank * (n_elem * 2)  # f16 tiles: 2 bytes per element
+    total = tl.load(tile_addr, shape=(n_elem,), dtype="f16")
+    outgoing = total
+    for _ in range(n_ranks - 1):
+        tl.send(dir="E", src=outgoing)
+        # No direction → round-robin
+        incoming = tl.recv(shape=(n_elem,), dtype="f16")
+        total = total + incoming
+        outgoing = incoming  # forward the received tile on the next step
+
+    tl.store(tile_addr, total)
+
+
+def test_round_robin_recv_mock_runtime():
+    """Four ranks on ``ring_1d`` must all end up holding the element-wise sum."""
+    world_size, n_elem = 4, 8
+    inputs = [
+        np.full((n_elem,), float(rank + 1), dtype=np.float16)
+        for rank in range(world_size)
+    ]
+    expected = sum(inputs)  # 1 + 2 + 3 + 4 = 10 in every element
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel_round_robin, world_size=world_size,
+        topology="ring_1d", inputs=inputs,
+        kernel_args=(n_elem,),
+    )
+
+    for rank in range(world_size):
+        assert np.allclose(outputs[rank], expected)
diff --git a/tests/test_ccl_strict_mode.py b/tests/test_ccl_strict_mode.py
new file mode 100644
index 0000000..d145b02
--- /dev/null
+++ b/tests/test_ccl_strict_mode.py
@@ -0,0 +1,140 @@
+"""Tests for IPCQ strict shape/dtype validation (ADR-0023 D14 F2)."""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+import simpy
+
+from kernbench.common.ipcq_types import (
+    IpcqDmaToken,
+    IpcqEndpoint,
+    IpcqInitEntry,
+    IpcqInvalidDirection,
+    IpcqMetaArrival,
+    IpcqRecvCmd,
+    IpcqRequest,
+    IpcqSendCmd,
+)
+from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
+from kernbench.runtime_api.kernel import IpcqInitMsg
+from kernbench.topology.types import Node
+
+
+# ── helpers (smaller copy of test_pe_ipcq fixtures) ────────────────
+
+
+@dataclass
+class _FakeTxn:  # minimal transaction stand-in that _make() puts on the host port
+    request: Any  # message carried by the transaction (an IpcqInitMsg in _make)
+    done: simpy.Event  # event _make() runs the environment until
+    result_data: dict[str, Any] = field(default_factory=dict)
+
+
+def _make(env, strict: bool = True):  # returns a started, initialised PeIpcqComponent
+    node = Node(
+        id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq",
+        impl="builtin.pe_ipcq",
+        attrs={"strict_validation": strict},  # the switch these tests exercise
+        pos_mm=None,
+    )
+    comp = PeIpcqComponent(node, ctx=None)
+    comp.in_ports["host"] = simpy.Store(env)  # tests inject messages through this port
+    comp.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env)
+    comp.start(env)
+
+    peer_credit = simpy.Store(env)
+    ep = IpcqEndpoint(  # the single peer: pe1 of the same cube
+        sip=0, cube=0, pe=1, buffer_kind="tcm",
+        rx_base_pa=0x10_000, rx_base_va=0,
+        n_slots=4, slot_size=4096,
+    )
+    init_msg = IpcqInitMsg(
+        correlation_id="t", request_id="t",
+        target_sips=(0,), target_cubes=(0,), target_pe=0,
+        entries=(IpcqInitEntry(  # only a "W" queue pair is configured
+            direction="W", peer=ep,
+            my_rx_base_pa=0x40_000, my_rx_base_va=0,
+            n_slots=4, slot_size=4096,
+            peer_credit_store=peer_credit,
+        ),),
+        backpressure_mode="sleep",
+        buffer_kind="tcm",
+        credit_size_bytes=16,
+    )
+    done = env.event()
+    comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=done))
+    env.run(until=done)  # advance the simulation until the init transaction is done
+    return comp
+
+
+# ── F2 tests ─────────────────────────────────────────────────────────
+
+
+def test_strict_mode_dtype_mismatch_raises():
+    env = simpy.Environment()
+    comp = _make(env, strict=True)
+
+    # Deliver arrival metadata announcing an f32 tile before any recv is posted.
+    fake_token = IpcqDmaToken(
+        src_addr=0, src_space="tcm",
+        dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"],
+        nbytes=64, handle_id="x",
+        shape=(8,), dtype="f32",  # mismatched
+        sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=1, src_direction="E",
+    )
+    comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token))
+    env.run(until=5)  # let the component consume the arrival first
+
+    # The recv asks for f16 with the same shape, so only the dtype differs;
+    # strict mode must surface that as a ValueError mentioning "dtype".
+    recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r")
+    req = IpcqRequest(command=recv_cmd, done=env.event())
+    comp.in_ports["host"].put(req)
+    with pytest.raises(ValueError, match="dtype"):
+        env.run(until=req.done)
+
+
+def test_strict_mode_shape_mismatch_raises():  # same dtype, different shape
+    env = simpy.Environment()
+    comp = _make(env, strict=True)
+
+    fake_token = IpcqDmaToken(
+        src_addr=0, src_space="tcm",
+        dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"],
+        nbytes=64, handle_id="x",
+        shape=(16,), dtype="f16",  # wrong shape
+        sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=1, src_direction="E",
+    )
+    comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token))
+    env.run(until=5)  # let the component consume the arrival first
+
+    recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r")
+    req = IpcqRequest(command=recv_cmd, done=env.event())
+    comp.in_ports["host"].put(req)
+    with pytest.raises(ValueError, match="shape"):  # error text must mention "shape"
+        env.run(until=req.done)
+
+
+def test_non_strict_mode_silently_accepts():  # strict_validation=False: no error
+    env = simpy.Environment()
+    comp = _make(env, strict=False)
+
+    fake_token = IpcqDmaToken(
+        src_addr=0, src_space="tcm",
+        dst_addr=0x40_000, dst_endpoint=comp._queue_pairs["W"]["peer"],
+        nbytes=64, handle_id="x",
+        shape=(16,), dtype="f32",  # both wrong
+        sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=1, src_direction="E",
+    )
+    comp.in_ports["host"].put(IpcqMetaArrival(token=fake_token))
+    env.run(until=5)  # let the component consume the arrival first
+
+    recv_cmd = IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r")
+    req = IpcqRequest(command=recv_cmd, done=env.event())
+    comp.in_ports["host"].put(req)
+    env.run(until=req.done)  # must complete despite the shape and dtype mismatch
+    assert req.done.triggered  # no exception
diff --git a/tests/test_ccl_topologies.py b/tests/test_ccl_topologies.py
new file mode 100644
index 0000000..42dcd43
--- /dev/null
+++ b/tests/test_ccl_topologies.py
@@ -0,0 +1,164 @@
+"""Tests for CCL builtin topology generators (ADR-0023 D11)."""
+import pytest
+
+from kernbench.ccl.topologies import (
+    mesh_2d,
+    none,
+    resolve_topology,
+    ring_1d,
+    ring_1d_unidir,
+    tree_binary,
+)
+
+
+# ── ring_1d ──────────────────────────────────────────────────────────
+
+
+def test_ring_1d_4_ranks():
+    """On a 4-rank ring, E is rank + 1 and W is rank - 1, both modulo 4."""
+    # Expands to {"E": 1, "W": 3}, {"E": 2, "W": 0}, {"E": 3, "W": 1}, {"E": 0, "W": 2}.
+    for rank in range(4):
+        assert ring_1d(rank, 4) == {"E": (rank + 1) % 4, "W": (rank - 1) % 4}
+
+
+def test_ring_1d_2_ranks():  # with two ranks, E and W name the same peer
+    assert ring_1d(0, 2) == {"E": 1, "W": 1}
+    assert ring_1d(1, 2) == {"E": 0, "W": 0}
+
+
+# ── ring_1d_unidir ───────────────────────────────────────────────────
+
+
+def test_ring_1d_unidir():  # only an E neighbor is reported, wrapping at the end
+    assert ring_1d_unidir(0, 4) == {"E": 1}
+    assert ring_1d_unidir(3, 4) == {"E": 0}  # last rank wraps to rank 0
+
+
+# ── mesh_2d ──────────────────────────────────────────────────────────
+
+
+def test_mesh_2d_2x2():
+    """2x2 wrap-around mesh: N == S and E == W for every rank."""
+    # Rank layout:  0 1
+    #               2 3
+    # Per rank, in rank order: (N/S neighbor, E/W neighbor).
+    wrapped = [(2, 1), (3, 0), (0, 3), (1, 2)]
+    for rank, (ns, ew) in enumerate(wrapped):
+        assert mesh_2d(rank, 4) == {"N": ns, "S": ns, "E": ew, "W": ew}
+
+
+def test_mesh_2d_4x4():
+    """Rank 5 of a 4x4 mesh (rank = row * 4 + col) sits at row 1, col 1."""
+    neighbors = mesh_2d(5, 16)
+    assert neighbors["N"] == 1   # row 0, col 1
+    assert neighbors["S"] == 9   # row 2, col 1
+    assert neighbors["W"] == 4   # row 1, col 0
+    assert neighbors["E"] == 6   # row 1, col 2
+
+
+def test_mesh_2d_non_square_raises():  # a non-square world size must be rejected
+    with pytest.raises(ValueError):
+        mesh_2d(0, 5)  # 5 is not a perfect square
+
+
+# ── tree_binary ──────────────────────────────────────────────────────
+
+
+def test_tree_binary_root():
+    """Rank 0 of a 7-rank tree has children 1 and 2 and no ``parent`` key."""
+    root = tree_binary(0, 7)
+    assert "parent" not in root
+    assert (root["child_left"], root["child_right"]) == (1, 2)
+
+
+def test_tree_binary_internal():
+    """Rank 1 of a 7-rank tree hangs off rank 0 and parents ranks 3 and 4."""
+    node = tree_binary(1, 7)
+    assert node["parent"] == 0
+    assert (node["child_left"], node["child_right"]) == (3, 4)
+
+
+def test_tree_binary_leaf():
+    """Rank 6 of a 7-rank tree is a leaf under rank 2: no child keys at all."""
+    leaf = tree_binary(6, 7)
+    assert leaf["parent"] == 2
+    assert not any(key in leaf for key in ("child_left", "child_right"))
+
+
+# ── none ─────────────────────────────────────────────────────────────
+
+
+def test_none_returns_empty():  # the "none" topology reports no neighbors
+    assert none(0, 4) == {}
+    assert none(3, 7) == {}
+
+
+# ── resolve_topology ─────────────────────────────────────────────────
+
+
+def test_resolve_topology_builtin():  # a builtin name resolves to a callable
+    fn = resolve_topology("ring_1d")
+    assert fn(0, 4) == {"E": 1, "W": 3}  # same result as calling ring_1d(0, 4)
+
+
+def test_resolve_topology_unknown_raises():  # unknown names must be rejected
+    with pytest.raises(ValueError):
+        resolve_topology("nonsense")
+
+
+def test_resolve_topology_with_neighbors_override_pattern_a():
+    """Pattern A: the module's neighbors() edits the builtin map and returns it."""
+    class FakeModule:
+        @staticmethod
+        def neighbors(rank, world_size, neighbor_map):
+            if rank % 2:  # odd ranks drop their W link in place
+                neighbor_map.pop("W", None)
+            return neighbor_map
+
+    resolver = resolve_topology("ring_1d", algo_module=FakeModule)
+    assert resolver(0, 4) == {"E": 1, "W": 3}  # even rank keeps both links
+    assert resolver(1, 4) == {"E": 2}  # W removed
+
+
+def test_resolve_topology_with_neighbors_override_pattern_b():
+    """Pattern B: the module's neighbors() returns a brand-new dict."""
+    class FakeModule:
+        @staticmethod
+        def neighbors(rank, world_size, neighbor_map):
+            return dict(E=(rank + 2) % world_size)  # builtin map is ignored
+
+    resolver = resolve_topology("ring_1d", algo_module=FakeModule)
+    assert resolver(0, 4) == {"E": 2}
+    assert resolver(3, 4) == {"E": 1}
+
+
+def test_resolve_topology_with_neighbors_override_pattern_c_none():
+    """Algorithm module's neighbors() returns None → builtin used as-is."""
+    class FakeModule:
+        @staticmethod
+        def neighbors(rank, world_size, neighbor_map):
+            return None  # no replacement map is supplied
+
+    fn = resolve_topology("ring_1d", algo_module=FakeModule)
+    assert fn(0, 4) == {"E": 1, "W": 3}  # identical to plain ring_1d(0, 4)
+
+
+def test_resolve_topology_none_with_neighbors_override():
+    """With topology "none", neighbors() receives {} and supplies the whole map."""
+    class FakeModule:
+        @staticmethod
+        def neighbors(rank, world_size, neighbor_map):
+            assert neighbor_map == {}  # builtin returned empty
+            return dict(E=(rank + 1) % world_size)
+
+    resolver = resolve_topology("none", algo_module=FakeModule)
+    assert resolver(0, 4) == {"E": 1}
+
+
+def test_resolve_topology_module_without_neighbors():
+    """A module that defines no neighbors() leaves the builtin map unchanged."""
+    class FakeModule:
+        """Deliberately empty: no ``neighbors`` attribute."""
+
+    resolver = resolve_topology("ring_1d", algo_module=FakeModule)
+    assert resolver(0, 4) == {"E": 1, "W": 3}
diff --git a/tests/test_cross_sip_routing.py b/tests/test_cross_sip_routing.py
new file mode 100644
index 0000000..4d0ba86
--- /dev/null
+++ b/tests/test_cross_sip_routing.py
@@ -0,0 +1,73 @@
+"""Cross-SIP PE_DMA routing tests (ADR-0023, topology v2).
+
+Verifies that PE_DMA in one SIP can route to PE_DMA in another SIP via
+the bidirectional pcie_ep ↔ fabric.switch0 path. Required for IPCQ
+multi-SIP collectives.
+"""
+from __future__ import annotations
+
+import pytest
+
+from kernbench.policy.routing.router import PathRouter, RoutingError
+from kernbench.topology.builder import resolve_topology
+
+
+def _topo():  # topology object resolved from "topology.yaml"
+    return resolve_topology("topology.yaml").topology_obj
+
+
+# ── New edge ────────────────────────────────────────────────────────
+
+
+def test_pcie_ep_to_switch_edge_exists():
+    """The reverse pcie_ep → switch edge must exist for outbound traffic."""
+    edges = {(edge.src, edge.dst) for edge in _topo().edges}
+    for sip in ("sip0", "sip1"):
+        # Outbound direction: endpoint -> fabric switch.
+        assert (f"{sip}.io0.pcie_ep", "fabric.switch0") in edges
+
+
+def test_existing_switch_to_pcie_ep_still_present():
+    """Host→device path must remain intact (regression)."""
+    edges = {(edge.src, edge.dst) for edge in _topo().edges}
+    for sip in ("sip0", "sip1"):
+        # Inbound direction: fabric switch -> endpoint.
+        assert ("fabric.switch0", f"{sip}.io0.pcie_ep") in edges
+
+
+# ── Cross-SIP path ──────────────────────────────────────────────────
+
+
+def test_router_finds_cross_sip_pe_dma_path():
+    """sip0 pe0 reaches sip1 pe0's pe_dma, and the route crosses the switch."""
+    router = PathRouter(_topo())
+    path = router.find_path("sip0.cube0.pe0", "sip1.cube0.pe0.pe_dma")
+    assert len(path) > 0
+    # The route starts at the source PE's own pe_dma node.
+    assert (path[0], path[-1]) == ("sip0.cube0.pe0.pe_dma", "sip1.cube0.pe0.pe_dma")
+    assert "fabric.switch0" in path
+
+
+def test_router_finds_cross_sip_far_pe_path():
+    """Last cube of sip0 → first cube of sip1."""
+    router = PathRouter(_topo())
+    src, dst = "sip0.cube15.pe7", "sip1.cube0.pe0.pe_dma"
+    route = router.find_path(src, dst)
+    assert "fabric.switch0" in route
+
+
+# ── Regression: intra-SIP routing unchanged ─────────────────────────
+
+
+def test_router_intra_sip_path_unchanged():
+    """Routing between two PEs of the same cube must stay off the fabric switch."""
+    router = PathRouter(_topo())
+    route = router.find_path("sip0.cube0.pe0", "sip0.cube0.pe1.pe_dma")
+    assert "fabric.switch0" not in route  # should not detour through switch
+
+
+def test_router_intra_cube_path_unchanged():
+    """A PE reaching its own cube's hbm_ctrl must stay off the fabric switch."""
+    router = PathRouter(_topo())
+    route = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl")
+    assert "fabric.switch0" not in route
diff --git a/tests/test_data_executor.py b/tests/test_data_executor.py
index 0204ba8..02bb022 100644
--- a/tests/test_data_executor.py
+++ b/tests/test_data_executor.py
@@ -58,6 +58,69 @@ def test_math_exp():
     assert np.allclose(result, np.exp(x))
 
 
+def test_math_extra_ops():
+    """Phase 2 replay of tl.maximum/minimum/fma/clamp on three f32 vectors."""
+    store = MemoryStore()
+    a = np.array([1.0, 5.0, 3.0], dtype=np.float32)
+    b = np.array([4.0, 2.0, 6.0], dtype=np.float32)
+    c = np.array([0.5, 0.5, 0.5], dtype=np.float32)
+    for addr, values in ((0x0, a), (0x100, b), (0x200, c)):
+        store.write("tcm", addr, values)
+
+    def _math(name, op, dst, inputs, axis=None):
+        # ``inputs`` is a list of (address, shape) pairs; all are f32 in tcm.
+        return OpRecord(
+            t_start=float(dst), t_end=float(dst) + 1.0,
+            component_id="pe_math", op_kind="math", op_name=name,
+            params={
+                "op": op,
+                "input_addrs": [addr for addr, _ in inputs],
+                "input_shapes": [shape for _, shape in inputs],
+                "input_spaces": ["tcm"] * len(inputs),
+                "input_dtypes": ["f32"] * len(inputs),
+                "dst_addr": dst, "dst_space": "tcm",
+                "shape_out": (3,), "dtype": "f32", "axis": axis,
+            },
+        )
+
+    in_a, in_b, in_c = (0x0, (3,)), (0x100, (3,)), (0x200, (3,))
+    ops = [
+        _math("maximum", "maximum", 0x300, [in_a, in_b]),
+        _math("minimum", "minimum", 0x400, [in_a, in_b]),
+        _math("fma",     "fma",     0x500, [in_a, in_b, in_c]),
+        _math("clamp",   "clamp",   0x600, [in_a, in_c, in_b]),
+    ]
+    DataExecutor(ops, store).run()
+
+    # Expected: fma is a * b + c; clamp bounds a below by c and above by b.
+    assert np.array_equal(store.read("tcm", 0x300), np.maximum(a, b))
+    assert np.array_equal(store.read("tcm", 0x400), np.minimum(a, b))
+    assert np.array_equal(store.read("tcm", 0x500), a * b + c)
+    assert np.array_equal(store.read("tcm", 0x600), np.minimum(np.maximum(a, c), b))
+
+
+def test_math_softmax():
+    """Phase 2 replay of a softmax along the last axis of a (2, 3) f32 tile."""
+    store = MemoryStore()
+    x = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]], dtype=np.float32)
+    store.write("tcm", 0x0, x)
+    params = {
+        "op": "softmax",
+        "input_addrs": [0x0], "input_shapes": [(2, 3)],
+        "input_spaces": ["tcm"], "input_dtypes": ["f32"],
+        "dst_addr": 0x100, "dst_space": "tcm",
+        "shape_out": (2, 3), "dtype": "f32", "axis": -1,
+    }
+    record = OpRecord(
+        t_start=0.0, t_end=1.0, component_id="pe_math",
+        op_kind="math", op_name="softmax", params=params,
+    )
+    DataExecutor([record], store).run()
+    shifted = np.exp(x - x.max(axis=-1, keepdims=True))  # row max subtracted first
+    expected = shifted / shifted.sum(axis=-1, keepdims=True)
+    assert np.allclose(store.read("tcm", 0x100), expected)
+
+
 def test_math_add():
     store = MemoryStore()
     a = np.array([1.0, 2.0], dtype=np.float32)
diff --git a/tests/test_ipcq_types.py b/tests/test_ipcq_types.py
new file mode 100644
index 0000000..647837b
--- /dev/null
+++ b/tests/test_ipcq_types.py
@@ -0,0 +1,169 @@
+"""Tests for IPCQ type schemas (ADR-0023 D2.5, D12, D14 F1)."""
+import pytest
+
+from kernbench.common.ipcq_types import (
+    IpcqCreditMetadata,
+    IpcqDmaToken,
+    IpcqEndpoint,
+    IpcqInitEntry,
+    IpcqInvalidDirection,
+    IpcqMetaArrival,
+    IpcqRecvCmd,
+    IpcqSendCmd,
+)
+from kernbench.runtime_api.kernel import IpcqInitMsg
+
+
+# ── IpcqEndpoint ─────────────────────────────────────────────────────
+
+
+def test_ipcq_endpoint_basic():
+    """Constructor keyword arguments are readable back as attributes."""
+    fields = dict(
+        sip=0, cube=0, pe=1, buffer_kind="tcm",
+        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
+    )
+    endpoint = IpcqEndpoint(**fields)
+    assert endpoint.sip == 0
+    assert endpoint.buffer_kind == "tcm"
+    assert endpoint.n_slots == 8
+
+
+def test_ipcq_endpoint_frozen():
+    """Assigning to a field of a frozen IpcqEndpoint must be rejected."""
+    ep = IpcqEndpoint(sip=0, cube=0, pe=1, buffer_kind="tcm", n_slots=8,
+                      slot_size=4096, rx_base_pa=0x1000, rx_base_va=0)
+    # FrozenInstanceError subclasses AttributeError; Exception was too broad.
+    with pytest.raises(AttributeError):
+        ep.sip = 99  # type: ignore
+
+
+# ── IpcqDmaToken ─────────────────────────────────────────────────────
+
+
+def test_ipcq_dma_token():
+    """A DMA token keeps its size and endpoint and reports data_op as True."""
+    destination = IpcqEndpoint(
+        sip=0, cube=0, pe=1, buffer_kind="tcm",
+        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
+    )
+    token = IpcqDmaToken(
+        src_addr=0x500, src_space="tcm", nbytes=128, handle_id="h1",
+        dst_addr=0x1000, dst_endpoint=destination, sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
+    )
+    assert token.nbytes == 128
+    assert token.dst_endpoint.buffer_kind == "tcm"
+    # data_op is never passed in above, so the True comes from the type itself.
+    assert token.data_op is True
+
+
+# ── IpcqCreditMetadata ───────────────────────────────────────────────
+
+
+def test_ipcq_credit_metadata():
+    """Credit metadata exposes the sequence number and direction it was given."""
+    credit = IpcqCreditMetadata(consumer_seq=3, src_direction="W",
+                                src_sip=0, src_cube=0, src_pe=1)
+    assert credit.consumer_seq == 3
+    assert credit.src_direction == "W"
+
+
+def test_ipcq_credit_metadata_frozen():
+    """Assigning to a field of a frozen IpcqCreditMetadata must be rejected."""
+    cm = IpcqCreditMetadata(consumer_seq=3, src_direction="W",
+                            src_sip=0, src_cube=0, src_pe=1)
+    with pytest.raises(AttributeError):  # base class of FrozenInstanceError
+        cm.consumer_seq = 99  # type: ignore
+
+
+# ── IpcqMetaArrival ──────────────────────────────────────────────────
+
+
+def test_ipcq_meta_arrival():
+    """IpcqMetaArrival carries its token, whose fields stay reachable."""
+    destination = IpcqEndpoint(
+        sip=0, cube=0, pe=1, buffer_kind="tcm",
+        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
+    )
+    token = IpcqDmaToken(
+        src_addr=0x500, src_space="tcm", nbytes=128, handle_id="h1",
+        dst_addr=0x1000, dst_endpoint=destination, sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
+    )
+    arrival = IpcqMetaArrival(token=token)
+    wrapped = arrival.token
+    assert wrapped.sender_seq == 0
+    assert wrapped.src_direction == "E"
+
+
+# ── IpcqSendCmd / IpcqRecvCmd ────────────────────────────────────────
+
+
+def test_ipcq_send_cmd():
+    """A send command keeps its direction and reports data_op as True."""
+    send = IpcqSendCmd(
+        handle_id="s1", direction="E", shape=(8, 8), dtype="f16",
+        src_addr=0x100, src_space="tcm", nbytes=64,
+    )
+    assert send.direction == "E" and send.data_op is True
+
+
+def test_ipcq_recv_cmd_default_return_slot():
+    """Without recv_mode/dst_addr, the command reports "return_slot" and 0."""
+    recv = IpcqRecvCmd(handle_id="r1", direction="W", dtype="f16", shape=(8, 8))
+    assert (recv.recv_mode, recv.dst_addr) == ("return_slot", 0)
+
+
+def test_ipcq_recv_cmd_round_robin():
+    recv = IpcqRecvCmd(handle_id="r2", direction=None, dtype="f16", shape=(8, 8))
+    assert recv.direction is None  # None is kept as given, not replaced
+
+
+def test_ipcq_recv_cmd_copy_to_dst():
+    """An explicit copy_to_dst mode and destination address are preserved."""
+    recv = IpcqRecvCmd(
+        handle_id="r3", direction="W", shape=(8, 8), dtype="f16",
+        recv_mode="copy_to_dst", dst_addr=0x2000, dst_space="hbm",
+    )
+    assert recv.recv_mode == "copy_to_dst"
+    assert recv.dst_addr == 0x2000
+
+
+# ── IpcqInvalidDirection ─────────────────────────────────────────────
+
+
+def test_ipcq_invalid_direction():
+    with pytest.raises(IpcqInvalidDirection):  # only checks the type can be raised
+        raise IpcqInvalidDirection("direction 'X' not installed")
+
+
+# ── IpcqInitEntry / IpcqInitMsg ──────────────────────────────────────
+
+
+def test_ipcq_init_entry_and_msg():
+    """An init message keeps its entries (and their peers) and credit size."""
+    import simpy
+
+    credits = simpy.Store(simpy.Environment())
+    peer = IpcqEndpoint(
+        sip=0, cube=0, pe=1, buffer_kind="tcm",
+        rx_base_pa=0x1000, rx_base_va=0, n_slots=8, slot_size=4096,
+    )
+    init_entry = IpcqInitEntry(
+        direction="E", peer=peer, peer_credit_store=credits,
+        my_rx_base_pa=0x2000, my_rx_base_va=0,
+        n_slots=8, slot_size=4096,
+    )
+    message = IpcqInitMsg(
+        correlation_id="c1", request_id="r1",
+        target_sips=(0,), target_cubes=(0,), target_pe=0,
+        entries=(init_entry,),
+        backpressure_mode="sleep",
+        buffer_kind="tcm",
+        credit_size_bytes=16,
+    )
+    first = message.entries[0]
+    assert first.direction == "E"
+    assert first.peer.sip == 0
+    assert message.credit_size_bytes == 16
diff --git a/tests/test_pe_dma_ipcq.py b/tests/test_pe_dma_ipcq.py
new file mode 100644
index 0000000..7ce30da
--- /dev/null
+++ b/tests/test_pe_dma_ipcq.py
@@ -0,0 +1,206 @@
+"""Tests for PE_DMA IPCQ handling (ADR-0023 D8 + D9 atomic).
+
+PE_DMA gains two new behaviors:
+  1. Outbound: when it receives an IpcqDmaToken from local PE_IPCQ, it
+     forwards it through the fabric (next-hop port) toward the peer
+     PE_DMA.
+  2. Inbound: when it receives a Transaction wrapping an IpcqDmaToken,
+     it performs MemoryStore.write at dst_endpoint.buffer_kind/dst_addr
+     and forwards IpcqMetaArrival(token) to local PE_IPCQ — both in the
+     SAME SimPy step (I6 MUST).
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import numpy as np
+import simpy
+
+from kernbench.common.ipcq_types import (
+    IpcqDmaToken,
+    IpcqEndpoint,
+    IpcqMetaArrival,
+)
+from kernbench.components.builtin.pe_dma import PeDmaComponent
+from kernbench.sim_engine.memory_store import MemoryStore
+from kernbench.sim_engine.transaction import Transaction
+from kernbench.topology.types import Node
+
+
+# ── Mock context ─────────────────────────────────────────────────────
+
+
+@dataclass
+class _MockResolver:  # empty placeholder; no attributes or methods
+    pass
+
+
+@dataclass
+class _MockRouter:
+    """Router stub: every path is [src, "fake_router", dst], i.e. two hops."""
+
+    def find_path(self, src: str, dst: str) -> list[str]:
+        return [src, "fake_router", dst]  # same shape for any src/dst pair
+
+
+@dataclass
+class _MockCtx:  # stand-in for the ctx object handed to PeDmaComponent
+    router: Any = field(default_factory=_MockRouter)
+    resolver: Any = field(default_factory=_MockResolver)
+    memory_store: Any = None  # filled from _make_pe_dma's `store` argument
+    edge_map: dict = field(default_factory=dict)
+    spec: dict = field(default_factory=dict)
+    op_logger: Any = None
+
+    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
+        return 0.0  # always zero, whatever the path or size
+
+    def get_shared_resource(self, env, key, capacity=1):
+        return simpy.Resource(env, capacity=capacity)  # new Resource per call; key unused
+
+
+def _make_pe_dma(
+    env: simpy.Environment, pe_prefix: str, store: MemoryStore | None = None,
+) -> PeDmaComponent:
+    """Build and start a PeDmaComponent whose ports are plain SimPy stores.
+
+    Wired ports: in "host"; out "fake_router" and "<pe_prefix>.pe_ipcq".
+    """
+    dma_node = Node(
+        id=f"{pe_prefix}.pe_dma", kind="pe_dma", impl="builtin.pe_dma",
+        attrs={}, pos_mm=None,
+    )
+    dma = PeDmaComponent(dma_node, ctx=_MockCtx(memory_store=store))
+    dma.in_ports["host"] = simpy.Store(env)
+    for out_name in ("fake_router", f"{pe_prefix}.pe_ipcq"):
+        dma.out_ports[out_name] = simpy.Store(env)
+    dma.start(env)
+    return dma
+
+
+def _make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm") -> IpcqEndpoint:
+    """Return a 4-slot endpoint with rx_base_pa=0x10_000 and slot_size=4096."""
+    location = dict(sip=sip, cube=cube, pe=pe, buffer_kind=buffer_kind)
+    return IpcqEndpoint(
+        rx_base_pa=0x10_000, rx_base_va=0, n_slots=4, slot_size=4096,
+        **location,
+    )
+
+
+# ── Outbound: PE_IPCQ → PE_DMA → fabric ──────────────────────────────
+
+
+def test_outbound_forwards_token_through_fabric():
+    """A token posted on "host" leaves via "fake_router" inside a Transaction."""
+    env = simpy.Environment()
+    mem = MemoryStore()
+    mem.write("tcm", 0x500, np.arange(16, dtype=np.float16))
+
+    sender = _make_pe_dma(env, "sip0.cube0.pe0", store=mem)
+
+    outbound = IpcqDmaToken(
+        src_addr=0x500, src_space="tcm",
+        dst_addr=0x10_000, dst_endpoint=_make_endpoint(pe=1),
+        nbytes=32, handle_id="t1",
+        shape=(16,), dtype="f16",
+        sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
+    )
+    sender.in_ports["host"].put(outbound)
+    env.run(until=10)
+
+    # Exactly one item must reach the fabric port: a Transaction whose
+    # request is the IPCQ token, with its dst_addr unchanged.
+    forwarded = sender.out_ports["fake_router"].items
+    assert len(forwarded) == 1
+    wrapped = forwarded[0]
+    assert isinstance(wrapped, Transaction)
+    assert isinstance(wrapped.request, IpcqDmaToken)
+    assert wrapped.request.dst_addr == 0x10_000
+
+
+# ── Inbound: PE_DMA → MemoryStore.write + IpcqMetaArrival forward ───
+
+
+def test_inbound_writes_memory_and_forwards_metadata_atomically():
+    """An inbound token's payload lands at dst_addr and PE_IPCQ is notified."""
+    env = simpy.Environment()
+    mem = MemoryStore()
+
+    # Source data, written up front as if by the sending side.
+    payload = np.arange(16, dtype=np.float16) + 100
+    mem.write("tcm", 0x500, payload)
+
+    receiver = _make_pe_dma(env, "sip0.cube0.pe1", store=mem)
+
+    inbound = IpcqDmaToken(
+        src_addr=0x500, src_space="tcm",
+        dst_addr=0x10_000,
+        dst_endpoint=_make_endpoint(sip=0, cube=0, pe=1, buffer_kind="tcm"),
+        nbytes=32, handle_id="t1",
+        shape=(16,), dtype="f16",
+        sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=0, src_direction="E",
+    )
+
+    # path[step] names this PE_DMA, the last entry of the transaction's path.
+    finished = env.event()
+    receiver.in_ports["host"].put(Transaction(
+        request=inbound, path=["fake_router", "sip0.cube0.pe1.pe_dma"],
+        step=1, nbytes=32, done=finished,
+    ))
+    env.run(until=finished)
+
+    # 1. The payload is now readable at the token's dst_addr.
+    landed = mem.read("tcm", 0x10_000, shape=(16,), dtype="f16")
+    assert np.array_equal(landed, payload)
+
+    # 2. Exactly one IpcqMetaArrival was forwarded to the PE_IPCQ port.
+    notified = receiver.out_ports["sip0.cube0.pe1.pe_ipcq"].items
+    assert len(notified) == 1
+    meta = notified[0]
+    assert isinstance(meta, IpcqMetaArrival)
+    assert meta.token.sender_seq == 0
+    assert meta.token.src_pe == 0
+
+
+def test_inbound_no_yield_between_write_and_metadata_forward():
+    """Soft check: inbound IPCQ tokens delivered back to back keep the
+    order of their MemoryStore writes and IpcqMetaArrival forwards (no
+    interleaving from extraneous yields).
+    """
+    env = simpy.Environment()
+    mem = MemoryStore()
+
+    # Source k holds the values 10*k .. 10*k + 7 at 0x500 + k * 0x100.
+    for k in range(3):
+        mem.write("tcm", 0x500 + k * 0x100, np.arange(8, dtype=np.float16) + k * 10)
+
+    receiver = _make_pe_dma(env, "sip0.cube0.pe1", store=mem)
+    peer = _make_endpoint(sip=0, cube=0, pe=1)
+
+    for seq in range(3):
+        finished = env.event()
+        inbound = IpcqDmaToken(
+            src_addr=0x500 + seq * 0x100, src_space="tcm",
+            dst_addr=0x10_000 + seq * 0x100, dst_endpoint=peer,
+            nbytes=16, handle_id=f"t{seq}",
+            shape=(8,), dtype="f16",
+            sender_seq=seq,
+            src_sip=0, src_cube=0, src_pe=0, src_direction="E",
+        )
+        receiver.in_ports["host"].put(Transaction(
+            request=inbound, path=["fake_router", "sip0.cube0.pe1.pe_dma"],
+            step=1, nbytes=16, done=finished,
+        ))
+        # Each transaction is run to completion before the next is posted.
+        env.run(until=finished)
+
+    # Arrivals reached the PE_IPCQ port in posting order ...
+    notified = receiver.out_ports["sip0.cube0.pe1.pe_ipcq"].items
+    assert [m.token.sender_seq for m in notified] == [0, 1, 2]
+    # ... and each destination starts with the matching source value.
+    for k in range(3):
+        landed = mem.read("tcm", 0x10_000 + k * 0x100, shape=(8,), dtype="f16")
+        assert landed[0] == k * 10
diff --git a/tests/test_pe_ipcq.py b/tests/test_pe_ipcq.py
new file mode 100644
index 0000000..b339315
--- /dev/null
+++ b/tests/test_pe_ipcq.py
@@ -0,0 +1,317 @@
+"""Tests for PE_IPCQ component (ADR-0023 D1, D2, D9, D14).
+
+These tests use a mock setup: PeIpcqComponent is instantiated directly,
+its in_ports/out_ports are wired to plain SimPy Stores, and IpcqInitMsg
+is delivered via a simple dummy transaction wrapper. PE_DMA is mocked
+as a Store that we drain manually.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import pytest
+import simpy
+
+from kernbench.common.ipcq_types import (
+    IpcqCreditMetadata,
+    IpcqDmaToken,
+    IpcqEndpoint,
+    IpcqInitEntry,
+    IpcqInvalidDirection,
+    IpcqMetaArrival,
+    IpcqRecvCmd,
+    IpcqRequest,
+    IpcqSendCmd,
+)
+from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
+from kernbench.runtime_api.kernel import IpcqInitMsg
+from kernbench.topology.types import Node
+
+
+# ── Fakes / fixtures ─────────────────────────────────────────────────
+
+
+@dataclass
+class _FakeTxn:  # dummy transaction wrapper used to deliver IpcqInitMsg
+    request: Any  # the wrapped message
+    done: simpy.Event  # event the test runs the environment until
+    result_data: dict[str, Any] = field(default_factory=dict)
+
+
+def _make_pe_ipcq(env: simpy.Environment, pe_prefix: str = "sip0.cube0.pe0") -> PeIpcqComponent:
+    """Build and start a PeIpcqComponent wired to plain SimPy stores.
+
+    Ports installed on the returned component:
+      - in_ports["host"]: where tests post messages for the component
+      - out_ports[f"{pe_prefix}.pe_dma"]: collects outgoing IpcqDmaToken
+        objects; nothing consumes it, so tests inspect ``.items`` directly
+
+    The component is created with ``ctx=None`` and ``start(env)`` has
+    already been called on it.
+    """
+    ipcq_node = Node(
+        id=f"{pe_prefix}.pe_ipcq", kind="pe_ipcq", impl="builtin.pe_ipcq",
+        attrs={}, pos_mm=None,
+    )
+    ipcq = PeIpcqComponent(ipcq_node, ctx=None)
+    ipcq.in_ports["host"] = simpy.Store(env)
+    ipcq.out_ports[f"{pe_prefix}.pe_dma"] = simpy.Store(env)
+    ipcq.start(env)
+    return ipcq
+
+
+def _install_two_neighbors(env: simpy.Environment, comp: PeIpcqComponent) -> tuple[simpy.Store, simpy.Store]:
+    """Deliver an IpcqInitMsg that installs an "E" and a "W" queue pair.
+
+    "E" peers with pe=1 (peer rx_base_pa 0x10_000, my_rx_base_pa 0x30_000);
+    "W" peers with pe=2 (peer rx_base_pa 0x20_000, my_rx_base_pa 0x40_000).
+    Runs ``env`` until the init transaction's ``done`` event fires.
+
+    Returns (peer_e_credit_store, peer_w_credit_store): the stores handed
+    to the component as each entry's ``peer_credit_store``.
+    """
+    credit_stores = {"E": simpy.Store(env), "W": simpy.Store(env)}
+    # direction -> (peer pe, peer rx_base_pa, my_rx_base_pa)
+    layout = {
+        "E": (1, 0x10_000, 0x30_000),
+        "W": (2, 0x20_000, 0x40_000),
+    }
+
+    entries = tuple(
+        IpcqInitEntry(
+            direction=direction,
+            peer=IpcqEndpoint(
+                sip=0, cube=0, pe=peer_pe,
+                buffer_kind="tcm",
+                rx_base_pa=peer_rx, rx_base_va=0,
+                n_slots=4, slot_size=4096,
+            ),
+            my_rx_base_pa=my_rx, my_rx_base_va=0,
+            n_slots=4, slot_size=4096,
+            peer_credit_store=credit_stores[direction],
+        )
+        for direction, (peer_pe, peer_rx, my_rx) in layout.items()
+    )
+    init_msg = IpcqInitMsg(
+        correlation_id="t", request_id="t",
+        target_sips=(0,), target_cubes=(0,), target_pe=0,
+        entries=entries,
+        backpressure_mode="sleep",
+        buffer_kind="tcm",
+        credit_size_bytes=16,
+    )
+
+    # Post the message and run until its done event fires.
+    installed = env.event()
+    comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=installed))
+    env.run(until=installed)
+    return credit_stores["E"], credit_stores["W"]
+
+
+# ── send: forward token to PE_DMA ────────────────────────────────────
+
+
+def test_send_forwards_token_to_pe_dma():
+    """One send on "E" produces one IpcqDmaToken aimed at the peer's first slot."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    sent = env.event()
+    send = IpcqSendCmd(
+        direction="E", src_addr=0x500, src_space="tcm",
+        nbytes=128, shape=(8, 8), dtype="f16", handle_id="s1",
+    )
+    ipcq.in_ports["host"].put(IpcqRequest(command=send, done=sent))
+    env.run(until=sent)
+
+    # The mock PE_DMA store has no consumer, so the token is still queued.
+    queued = ipcq.out_ports["sip0.cube0.pe0.pe_dma"].items
+    assert len(queued) == 1
+    (token,) = queued
+    assert isinstance(token, IpcqDmaToken)
+    assert token.dst_addr == 0x10_000  # peer.rx_base_pa + 0
+    assert (token.nbytes, token.sender_seq) == (128, 0)
+    assert token.src_direction == "E"
+
+
+def test_send_advances_my_head_and_slot_addresses():
+    """Consecutive sends get increasing sender_seq and consecutive peer slots."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    for n in range(3):
+        sent = env.event()
+        send = IpcqSendCmd(
+            direction="E", src_addr=0x500 + n, src_space="tcm",
+            nbytes=64, shape=(8,), dtype="f16", handle_id=f"s{n}",
+        )
+        ipcq.in_ports["host"].put(IpcqRequest(command=send, done=sent))
+        env.run(until=sent)
+
+    queued = ipcq.out_ports["sip0.cube0.pe0.pe_dma"].items
+    assert [t.sender_seq for t in queued] == [0, 1, 2]
+    # Each slot starts slot_size (4096 == 0x1000) past the previous one,
+    # beginning at the peer's rx_base_pa (0x10_000).
+    assert [t.dst_addr for t in queued] == [0x10_000, 0x11_000, 0x12_000]
+
+
+def test_send_invalid_direction_raises():
+    """Sending on "N", which was never installed, raises IpcqInvalidDirection."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)  # installs only "E" and "W"
+
+    bad_send = IpcqSendCmd(
+        direction="N", src_addr=0x100, src_space="tcm",
+        nbytes=64, shape=(8,), dtype="f16", handle_id="s_bad",
+    )
+    rejected = env.event()
+    ipcq.in_ports["host"].put(IpcqRequest(command=bad_send, done=rejected))
+    with pytest.raises(IpcqInvalidDirection):
+        env.run(until=rejected)
+
+
+# ── recv: wait for data and return slot address ─────────────────────
+
+
+def test_recv_waits_until_metadata_arrives():
+    """recv on "W" stays pending until an IpcqMetaArrival is posted."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    pending = IpcqRequest(
+        command=IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r1"),
+        done=env.event(),
+    )
+    ipcq.in_ports["host"].put(pending)
+
+    # Nothing has arrived yet, so letting time pass must not complete it.
+    env.run(until=10)
+    assert not pending.done.triggered
+
+    # Metadata as it would arrive from the "W" peer (pe=2, sending on its "E").
+    arrived = IpcqDmaToken(
+        src_addr=0, src_space="tcm", nbytes=64, handle_id="x",
+        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
+        shape=(8,), dtype="f16", sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=2, src_direction="E",
+    )
+    ipcq.in_ports["host"].put(IpcqMetaArrival(token=arrived))
+    env.run(until=pending.done)
+
+    result = pending.result_data
+    assert result["src_addr"] == 0x40_000  # my_rx_base_pa for W
+    assert result["direction"] == "W"
+
+
+def test_recv_returns_immediately_if_data_already_present():
+    """A recv posted after the metadata already arrived completes with that slot."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    # Post the arrival and run the simulation before any recv exists.
+    early = IpcqDmaToken(
+        src_addr=0, src_space="tcm", nbytes=64, handle_id="x",
+        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
+        shape=(8,), dtype="f16", sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=2, src_direction="E",
+    )
+    ipcq.in_ports["host"].put(IpcqMetaArrival(token=early))
+    env.run(until=5)
+
+    late_recv = IpcqRequest(
+        command=IpcqRecvCmd(direction="W", shape=(8,), dtype="f16", handle_id="r1"),
+        done=env.event(),
+    )
+    ipcq.in_ports["host"].put(late_recv)
+    env.run(until=late_recv.done)
+
+    # 0x40_000 is the my_rx_base_pa installed for "W".
+    assert late_recv.result_data["src_addr"] == 0x40_000
+
+
+def test_recv_round_robin_picks_arrived_direction():
+    """With direction=None, recv resolves to the direction that has data ("W")."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    # Only "W" gets metadata; nothing is posted for "E".
+    only_w = IpcqDmaToken(
+        src_addr=0, src_space="tcm", nbytes=64, handle_id="x",
+        dst_addr=0x40_000, dst_endpoint=ipcq._queue_pairs["W"]["peer"],
+        shape=(8,), dtype="f16", sender_seq=0,
+        src_sip=0, src_cube=0, src_pe=2, src_direction="E",
+    )
+    ipcq.in_ports["host"].put(IpcqMetaArrival(token=only_w))
+    env.run(until=5)
+
+    # recv() with no direction → round-robin
+    any_recv = IpcqRequest(
+        command=IpcqRecvCmd(direction=None, shape=(8,), dtype="f16", handle_id="r_rr"),
+        done=env.event(),
+    )
+    ipcq.in_ports["host"].put(any_recv)
+    env.run(until=any_recv.done)
+
+    chosen = any_recv.result_data["direction"]
+    assert chosen == "W"
+
+
+# ── backpressure: send blocks when full ──────────────────────────────
+
+
+def test_send_blocks_when_peer_slot_full():
+    """The fifth send into a 4-slot peer stalls until a credit comes back."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    def _post_send(handle_id):
+        # Queue one "E" send and return its request without running env.
+        send = IpcqSendCmd(
+            direction="E", src_addr=0x500, src_space="tcm",
+            nbytes=64, shape=(8,), dtype="f16", handle_id=handle_id,
+        )
+        request = IpcqRequest(command=send, done=env.event())
+        ipcq.in_ports["host"].put(request)
+        return request
+
+    # n_slots = 4: the first four sends each run to completion.
+    for i in range(4):
+        env.run(until=_post_send(f"s{i}").done)
+
+    # The fifth send must still be pending after time has passed.
+    blocked = _post_send("s5")
+    env.run(until=20)
+    assert not blocked.done.triggered
+
+    # Credit from the "E" peer (pe=1; "W" is the direction in the peer's
+    # view) reporting consumer_seq=1, i.e. slot 0 has been consumed.
+    freed = IpcqCreditMetadata(
+        consumer_seq=1, src_sip=0, src_cube=0, src_pe=1, src_direction="W",
+    )
+    ipcq.credit_inbox.put(freed)
+    env.run(until=blocked.done)
+    assert blocked.done.triggered
+
+
+# ── Init test ────────────────────────────────────────────────────────
+
+
+def test_init_installs_neighbors():
+    """Init creates the "E"/"W" queue pairs with their peers and zeroed counters."""
+    env = simpy.Environment()
+    ipcq = _make_pe_ipcq(env)
+    _install_two_neighbors(env, ipcq)
+
+    pairs = ipcq._queue_pairs
+    assert "E" in pairs and "W" in pairs
+    assert pairs["E"]["peer"].pe == 1
+    assert pairs["W"]["peer"].pe == 2
+    assert (pairs["E"]["my_head"], pairs["E"]["peer_tail_cache"]) == (0, 0)
diff --git a/tests/test_recv_copy_to_dst.py b/tests/test_recv_copy_to_dst.py
new file mode 100644
index 0000000..c4388dc
--- /dev/null
+++ b/tests/test_recv_copy_to_dst.py
@@ -0,0 +1,80 @@
+"""Tests for recv_mode='copy_to_dst' (ADR-0023 D9.5)."""
+from __future__ import annotations
+
+import numpy as np
+
+
+def test_recv_copy_to_dst_via_simpy_runner():
+    """Run a ring kernel that uses tl.recv(..., dst_addr=..., dst_space=...).
+
+    Every rank sends its tile "E" and receives from "W" into its own HBM
+    slot; afterwards slot r must hold the tile written for rank (r - 1) % 8.
+    """
+    from kernbench.policy.placement.dp import DPPolicy
+    from kernbench.runtime_api.bench_runner import run_bench
+    from kernbench.runtime_api.types import resolve_device
+    from kernbench.sim_engine.engine import GraphEngine
+    from kernbench.topology.builder import resolve_topology
+    from kernbench.common.pe_commands import TensorHandle
+
+    def kernel(t_ptr, n_elem, dst_buf_addr, tl):
+        rank = tl.program_id(axis=0)
+        ws = tl.num_programs(axis=0)  # NOTE(review): unused; call kept as-is
+        nbytes = n_elem * 2  # 2 bytes per f16 element
+        # Each PE sends own data, then recv into a custom dst slot
+        current = TensorHandle(
+            id="loc", addr=t_ptr + rank * nbytes,
+            shape=(n_elem,), dtype="f16",
+            nbytes=nbytes, data=None, space="hbm",
+        )
+        tl.send(dir="E", src=current)
+        # copy_to_dst: move into a per-rank scratch HBM addr
+        recv = tl.recv(
+            dir="W", shape=(n_elem,), dtype="f16",
+            dst_addr=dst_buf_addr + rank * nbytes,
+            dst_space="hbm",
+        )
+        # Sanity: recv handle should now point to our dst addr
+        assert recv.addr == dst_buf_addr + rank * nbytes
+        assert recv.space == "hbm"
+
+    topo = resolve_topology("topology.yaml")
+
+    def run(torch):
+        torch.install_ipcq(
+            algorithm="ring_allreduce_tcm", world_size_override=8,
+        )
+        a = torch.zeros(
+            (1, 8 * 8),
+            dtype="f16",
+            dp=DPPolicy(
+                sip="replicate", cube="replicate", pe="column_wise",
+                num_sips=1, num_cubes=1,
+            ),
+            name="copy_in",
+        )
+        store = torch.engine.memory_store
+        base = a._handle.va_base or a._handle.shards[0].pa
+        nbytes = 8 * 2
+        for r in range(8):
+            store.write("hbm", base + r * nbytes,
+                        np.full((8,), float(r + 1), dtype=np.float16))
+
+        # Use a separate dst region (synthetic addresses)
+        dst_buf = 0xC0FFEE_0000
+        torch.launch("ring_allreduce_tcm", kernel, a, 8, dst_buf)
+
+        # After the kernel, dst_buf + r*16 should contain rank (r-1)%8's data
+        for r in range(8):
+            arr = store.read("hbm", dst_buf + r * nbytes, shape=(8,), dtype="f16")
+            expected = float(((r - 1) % 8) + 1)
+            assert np.allclose(arr, expected), f"rank {r}: got {arr}, expected {expected}"
+
+    result = run_bench(
+        topology=topo, bench_fn=run,
+        device=resolve_device("all"),
+        engine_factory=lambda t, d: GraphEngine(
+            getattr(t, "topology_obj", t), enable_data=True
+        ),
+    )
+    assert result.completion.ok
diff --git a/tests/test_runtime_api_tensor.py b/tests/test_runtime_api_tensor.py
new file mode 100644
index 0000000..b06eac0
--- /dev/null
+++ b/tests/test_runtime_api_tensor.py
@@ -0,0 +1,136 @@
+"""Tests for the pytorch-compat Tensor API extensions.
+
+Covers the new ``torch.from_numpy`` factory and ``Tensor.numpy``,
+``Tensor.copy_`` methods used by the unified ``ccl_allreduce`` bench.
+"""
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from kernbench.policy.placement.dp import DPPolicy
+from kernbench.runtime_api.bench_runner import run_bench
+from kernbench.runtime_api.types import resolve_device
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import resolve_topology
+
+
+def _engine_factory(topology, device):  # `device` is accepted but not used
+    return GraphEngine(getattr(topology, "topology_obj", topology), enable_data=True)
+
+
+def _run_with(bench_body):
+    """Run ``bench_body`` via run_bench on topology.yaml with device "all".
+
+    NOTE(review): callers ignore the result; confirm run_bench re-raises
+    assertion failures from ``bench_body`` instead of recording them.
+    """
+    return run_bench(topology=resolve_topology("topology.yaml"), bench_fn=bench_body,
+                     device=resolve_device("all"), engine_factory=_engine_factory)
+
+
+# ── from_numpy ──────────────────────────────────────────────────────
+
+
+def test_from_numpy_creates_host_tensor():
+    """torch.from_numpy wraps an array as a host-only kernbench Tensor:
+    shape/dtype match, numpy() round-trips, and no handle is attached."""
+
+    def body(torch):
+        source = np.arange(8, dtype=np.float16).reshape(1, 8)
+        host = torch.from_numpy(source)
+        # Metadata mirrors the source array ...
+        assert (host.shape, host.dtype) == ((1, 8), "f16")
+        # ... and numpy() hands the same values back.
+        assert np.array_equal(host.numpy(), source)
+        # No deploy → no real shards.
+        assert host._handle is None
+        # Submit a no-op so run_bench has at least one handle.
+        single_pe = DPPolicy(
+            sip="replicate", cube="replicate", pe="replicate",
+            num_sips=1, num_cubes=1, num_pes=1,
+        )
+        torch.zeros((1, 8), dtype="f16", dp=single_pe, name="dummy")
+
+    _run_with(body)
+
+
+# ── single-PE replicated tensor ─────────────────────────────────────
+
+
+def test_copy_and_numpy_single_pe():
+    """On a single-PE (fully replicated) tensor, copy_ followed by numpy()
+    returns exactly the array that was copied in."""
+
+    def body(torch):
+        single_pe = DPPolicy(sip="replicate", cube="replicate", pe="replicate",
+                             num_sips=1, num_cubes=1, num_pes=1)
+        tensor = torch.zeros((1, 16), dtype="f16", dp=single_pe, name="t")
+        values = np.arange(16, dtype=np.float16).reshape(1, 16)
+        tensor.copy_(torch.from_numpy(values))
+        readback = tensor.numpy()
+        assert readback.shape == (1, 16)
+        assert np.array_equal(readback, values)
+
+    _run_with(body)
+
+
+# ── multi-PE column-wise sharding (1 cube) ──────────────────────────
+
+
+def test_copy_and_numpy_multi_pe_column_wise():
+    """copy_ shards column-wise over 8 PEs of one cube; numpy() reassembles."""
+
+    def body(torch):
+        n_pe = 8
+        width = n_pe * 4
+        column_wise = DPPolicy(sip="replicate", cube="replicate", pe="column_wise",
+                               num_sips=1, num_cubes=1, num_pes=n_pe)
+        tensor = torch.zeros((1, width), dtype="f16", dp=column_wise, name="t")
+        values = np.arange(width, dtype=np.float16).reshape(1, width)
+        tensor.copy_(torch.from_numpy(values))
+        readback = tensor.numpy()
+        assert readback.shape == (1, width)
+        assert np.array_equal(readback, values)
+        assert len(tensor._handle.shards) == n_pe  # there really were 8 shards
+
+    _run_with(body)
+
+
+# ── multi-cube sharding ─────────────────────────────────────────────
+
+
+def test_copy_and_numpy_multi_cube():
+    """copy_ across 2 cubes must yield 16 shards; numpy() reassembles them."""
+
+    def body(torch):
+        n_cubes, n_pe_per_cube = 2, 8
+        total = n_cubes * n_pe_per_cube  # 16
+        width = total * 4
+        two_cubes = DPPolicy(sip="replicate", cube="column_wise", pe="column_wise",
+                             num_sips=1, num_cubes=n_cubes)
+        tensor = torch.zeros((1, width), dtype="f16", dp=two_cubes, name="t")
+        values = np.arange(width, dtype=np.float16).reshape(1, width)
+        tensor.copy_(torch.from_numpy(values))
+        readback = tensor.numpy()
+        assert np.array_equal(readback, values)
+        assert len(tensor._handle.shards) == total
+
+    _run_with(body)
+
+
+# ── shape mismatch raises ───────────────────────────────────────────
+
+
+def test_copy_shape_mismatch_raises():
+    """copy_ from a (1, 16) source into a (1, 8) tensor raises ValueError."""
+
+    def body(torch):
+        single_pe = DPPolicy(sip="replicate", cube="replicate", pe="replicate",
+                             num_sips=1, num_cubes=1, num_pes=1)
+        narrow = torch.zeros((1, 8), dtype="f16", dp=single_pe, name="t")
+        wide_src = np.zeros((1, 16), dtype=np.float16)
+        with pytest.raises(ValueError, match="copy_ shape mismatch"):
+            narrow.copy_(torch.from_numpy(wide_src))
+
+    _run_with(body)
diff --git a/tests/test_tl_ipcq_api.py b/tests/test_tl_ipcq_api.py
new file mode 100644
index 0000000..f1e708f
--- /dev/null
+++ b/tests/test_tl_ipcq_api.py
@@ -0,0 +1,95 @@
+"""Tests for tl.send / tl.recv API (ADR-0023 D4 + D9.5)."""
+from __future__ import annotations
+
+from typing import Any
+
+import simpy
+from greenlet import greenlet
+
+from kernbench.common.ipcq_types import (
+    IpcqRecvCmd,
+    IpcqRequest,
+    IpcqSendCmd,
+)
+from kernbench.triton_emu.tl_context import TLContext
+
+
+# ── Command-list mode (no runner) ────────────────────────────────────
+
+
+def test_tl_send_command_list_mode():
+    """Without a runner, tl.send appends exactly one IpcqSendCmd."""
+    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0)
+    ctx.send(dir="E", src_addr=0x500, nbytes=64, shape=(8,), dtype="f16")
+    recorded = [cmd for cmd in ctx.commands if isinstance(cmd, IpcqSendCmd)]
+    assert len(recorded) == 1
+    (send_cmd,) = recorded
+    assert (send_cmd.direction, send_cmd.src_addr) == ("E", 0x500)
+    assert send_cmd.nbytes == 64
+
+
+def test_tl_recv_command_list_mode():
+    """Without a runner, tl.recv records one IpcqRecvCmd and returns a handle."""
+    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0)
+    placeholder = ctx.recv(dir="W", shape=(8,), dtype="f16")
+    recorded = [cmd for cmd in ctx.commands if isinstance(cmd, IpcqRecvCmd)]
+    assert len(recorded) == 1
+    (recv_cmd,) = recorded
+    assert recv_cmd.direction == "W"
+    # No data moves in command-list mode (that only happens under SimPy);
+    # the returned TensorHandle is a placeholder with the requested shape/dtype.
+    assert (placeholder.shape, placeholder.dtype) == ((8,), "f16")
+
+
+def test_tl_recv_round_robin_no_dir():
+    """Omitting ``dir`` leaves the recorded IpcqRecvCmd.direction as None."""
+    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0)
+    ctx.recv(shape=(8,), dtype="f16")
+    recv_cmds = [cmd for cmd in ctx.commands if isinstance(cmd, IpcqRecvCmd)]
+    assert recv_cmds[0].direction is None
+
+
+# ── Runner mode (greenlet) ──────────────────────────────────────────
+
+
+class _StubRunner:
+    """Fake runner: logs each command and answers receives with a canned slot."""
+
+    def __init__(self) -> None:
+        self.received: list[Any] = []
+
+    def switch_to_simpy(self, cmd: Any) -> Any:
+        """Record ``cmd``; return a slot dict for IpcqRecvCmd, otherwise None."""
+        self.received.append(cmd)
+        # Sends and unrecognised commands get no reply payload.
+        if isinstance(cmd, IpcqSendCmd) or not isinstance(cmd, IpcqRecvCmd):
+            return None
+        # Canned slot; direction falls back to "E" when the command's is unset.
+        return dict(
+            data=None,
+            src_space="tcm",
+            src_addr=0xABCD,
+            direction=cmd.direction or "E",
+            dtype=cmd.dtype,
+            shape=cmd.shape,
+            nbytes=16,
+        )
+
+
+def test_tl_send_runner_mode():
+    """With a runner attached, tl.send hands one IpcqSendCmd to the runner."""
+    stub = _StubRunner()
+    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0, runner=stub)
+    ctx.send(dir="E", src_addr=0x500, nbytes=64, shape=(8,), dtype="f16")
+    assert len(stub.received) == 1 and isinstance(stub.received[0], IpcqSendCmd)
+
+
+def test_tl_recv_runner_mode_returns_handle_with_slot_addr():
+    """In runner mode, tl.recv's handle takes its addr from the runner's slot."""
+    stub = _StubRunner()
+    ctx = TLContext(pe_id=0, num_programs=4, dispatch_cycles=0, runner=stub)
+    handle = ctx.recv(dir="W", shape=(8,), dtype="f16")
+    assert isinstance(stub.received[0], IpcqRecvCmd)
+    # 0xABCD is the src_addr that _StubRunner puts in its canned slot.
+    assert handle.addr == 0xABCD
+    assert (handle.shape, handle.dtype) == ((8,), "f16")
diff --git a/tests/test_tl_recv_async.py b/tests/test_tl_recv_async.py
new file mode 100644
index 0000000..37aae56
--- /dev/null
+++ b/tests/test_tl_recv_async.py
@@ -0,0 +1,106 @@
+"""Tests for tl.recv_async + tl.wait (ADR-0023 D4)."""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def kernel_async_recv(t_ptr, n_elem, tl):
+    """Exercise the non-blocking receive path on every PE.
+
+    Each round posts recv_async first, then sends, then waits. Accumulation
+    uses TensorHandle math (PE_MATH) so Phase 2 yields correct HBM contents."""
+    rank = tl.program_id(axis=0)
+    world_size = tl.num_programs(axis=0)
+    tile_bytes = n_elem * 2  # f16 elements are 2 bytes each
+    my_addr = t_ptr + rank * tile_bytes
+
+    total = outgoing = tl.load(my_addr, shape=(n_elem,), dtype="f16")
+    for _ in range(world_size - 1):
+        pending = tl.recv_async(dir="W", shape=(n_elem,), dtype="f16")
+        tl.send(dir="E", src=outgoing)
+        incoming = tl.wait(pending)
+        total = total + incoming
+        # Relay what just arrived from W on to E in the next round.
+        outgoing = incoming
+
+    tl.store(my_addr, total)
+
+
+def test_recv_async_mock_runtime():
+    """Every rank of a 4-PE ring_1d mock run ends with the sum of all inputs."""
+    n_elem = 8
+    world_size = 4
+    # Rank r starts with a tile filled with r + 1.
+    inputs = [
+        np.full((n_elem,), float(rank + 1), dtype=np.float16)
+        for rank in range(world_size)
+    ]
+    expected = sum(inputs)
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel_async_recv, world_size=world_size,
+        topology="ring_1d", inputs=inputs, kernel_args=(n_elem,),
+    )
+    for rank in range(world_size):
+        assert np.allclose(outputs[rank], expected)
+
+
+def test_recv_async_simpy_runner():
+    """Run the async kernel through the real SimPy stack via the
+    install_ipcq + launch path.
+
+    Seeds rank r's HBM tile with r + 1, launches, and expects 36 everywhere.
+    """
+    from kernbench.policy.placement.dp import DPPolicy
+    from kernbench.runtime_api.bench_runner import run_bench
+    from kernbench.runtime_api.types import resolve_device
+    from kernbench.sim_engine.engine import GraphEngine
+    from kernbench.topology.builder import resolve_topology
+
+    world_size = 8
+    n_elem = 8
+    nbytes = n_elem * 2  # f16 tile size in bytes
+    expected = float(sum(range(1, world_size + 1)))  # 36
+
+    # Re-use the standard 8-PE bench skeleton but swap in the async kernel.
+    topo = resolve_topology("topology.yaml")
+
+    def run(torch):
+        # Called for its side effect; the returned plan is not needed here.
+        torch.install_ipcq(
+            algorithm="ring_allreduce_tcm", world_size_override=world_size,
+        )
+        a = torch.zeros(
+            (1, world_size * n_elem),
+            dtype="f16",
+            dp=DPPolicy(
+                sip="replicate", cube="replicate", pe="column_wise",
+                num_sips=1, num_cubes=1,
+            ),
+            name="async_in",
+        )
+        store = torch.engine.memory_store
+        base = a._handle.va_base or a._handle.shards[0].pa
+        for r in range(world_size):
+            store.write("hbm", base + r * nbytes,
+                        np.full((n_elem,), float(r + 1), dtype=np.float16))
+
+        torch.launch("ring_allreduce_tcm", kernel_async_recv, a, n_elem)
+
+        for r in range(world_size):
+            got = store.read(
+                "hbm", base + r * nbytes, shape=(n_elem,), dtype="f16",
+            )
+            assert np.allclose(got, expected, rtol=1e-2, atol=1e-2), \
+                f"rank {r}: got {got}, expected {expected}"
+
+    result = run_bench(
+        topology=topo, bench_fn=run,
+        device=resolve_device("all"),
+        engine_factory=lambda t, d: GraphEngine(
+            getattr(t, "topology_obj", t), enable_data=True
+        ),
+    )
+    assert result.completion.ok
diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py
index ae849aa..0ef2348 100644
--- a/tests/test_topology_compile.py
+++ b/tests/test_topology_compile.py
@@ -19,16 +19,19 @@ def test_full_graph_node_count():
     # + 2 SIPs x (1 IO x 23 io_nodes
     #            + 16 cubes x (32 routers + 1 hbm_ctrl + 1 m_cpu + 1 sram
     #                          + 20 ucie (4 ports x (1 port + 4 conn))
-    #                          + 8 PEs x 8 pe_comps))  (ADR-0021: +pe_fetch_store)
+    #                          + 8 PEs x 9 pe_comps))  (ADR-0023: +pe_ipcq)
     #   IO: pcie_ep + io_cpu + noc + 4 io_ucie_ports + 4*4 io_ucie_conn = 23
-    #   cube: 32 + 3 + 20 + 64 = 119
-    # = 1 + 2*(23 + 16*119) = 1 + 2*(23+1904) = 1 + 3854 = 3855
-    assert len(g.nodes) == 3855
+    #   cube: 32 + 3 + 20 + 72 = 127
+    # = 1 + 2*(23 + 16*127) = 1 + 2*(23+2032) = 1 + 4110 = 4111
+    assert len(g.nodes) == 4111
 
 
 def test_full_graph_edge_count():
     g = _graph()
-    assert len(g.edges) == 12922  # ADR-0021: +pe_fetch_store + chaining edges
+    # ADR-0023: +3 IPCQ edges per PE (cpu→ipcq, ipcq→dma, dma→ipcq)
+    # 2 SIPs × 16 cubes × 8 PEs × 3 = 768 new edges
+    # Cross-SIP routing: +1 reverse pcie_ep→switch edge per SIP = +2
+    assert len(g.edges) == 13692
 
 
 # -- Full graph: specific nodes exist -----------------------------------------
@@ -287,7 +290,7 @@ def test_pe_view_has_all_components():
     v = _graph().pe_view
     assert set(v.nodes.keys()) == {
         "pe_cpu", "pe_scheduler", "pe_dma", "pe_fetch_store",
-        "pe_gemm", "pe_math", "pe_mmu", "pe_tcm",
+        "pe_gemm", "pe_math", "pe_mmu", "pe_tcm", "pe_ipcq",
     }
 
 
diff --git a/tests/test_topology_load.py b/tests/test_topology_load.py
index 82f2859..93c2c6d 100644
--- a/tests/test_topology_load.py
+++ b/tests/test_topology_load.py
@@ -24,7 +24,7 @@ def test_pe_template_components():
     comps = spec["cube"]["pe_template"]["components"]
     assert set(comps.keys()) == {
         "pe_cpu", "pe_scheduler", "pe_dma", "pe_fetch_store",
-        "pe_gemm", "pe_math", "pe_mmu", "pe_tcm",
+        "pe_gemm", "pe_math", "pe_mmu", "pe_tcm", "pe_ipcq",
     }
 
 
diff --git a/tests/test_triton_emu.py b/tests/test_triton_emu.py
index 77b4568..160ec4b 100644
--- a/tests/test_triton_emu.py
+++ b/tests/test_triton_emu.py
@@ -87,6 +87,37 @@ def test_tl_math_unary_ops():
     assert ops == ["exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin"]
 
 
+def test_tl_math_extra_ops():
+    """tl.maximum/minimum/fma/clamp/softmax record MathCmds; tl.cdiv rounds up."""
+    tl = _ctx()
+    a, b, c = (
+        tl.load(addr, shape=(8, 8), dtype="f16")
+        for addr in (0x1000, 0x2000, 0x3000)
+    )
+
+    tl.maximum(a, b)
+    tl.minimum(a, b)
+    tl.fma(a, b, c)
+    tl.clamp(a, b, c)
+    tl.softmax(a, axis=1)
+
+    recorded = [cmd for cmd in tl.commands if isinstance(cmd, MathCmd)]
+    assert [cmd.op for cmd in recorded] == [
+        "maximum", "minimum", "fma", "clamp", "softmax",
+    ]
+    _, _, fma_cmd, clamp_cmd, softmax_cmd = recorded
+    # The two ternary ops must each carry three inputs.
+    assert len(fma_cmd.inputs) == 3
+    assert len(clamp_cmd.inputs) == 3
+    # softmax keeps the axis it was called with.
+    assert softmax_cmd.axis == 1
+
+    # cdiv is a scalar helper, not a tensor op
+    from kernbench.triton_emu.tl_context import TLContext
+    for num, den, want in ((10, 3, 4), (9, 3, 3), (0, 4, 0)):
+        assert TLContext.cdiv(num, den) == want
+
+
 # ── 5. a + b, a * b → MathCmd ────────────────────────────────────
 
 
diff --git a/topology.yaml b/topology.yaml
index ad936cc..c56b385 100644
--- a/topology.yaml
+++ b/topology.yaml
@@ -67,7 +67,8 @@ cube:
       pe_math:        { kind: pe_math,        impl: builtin.pe_math,        attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
       pe_fetch_store: { kind: pe_fetch_store, impl: builtin.pe_fetch_store, attrs: { overhead_ns: 0.0 } }
       pe_mmu:         { kind: pe_mmu,         impl: builtin.pe_mmu,         attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
-      pe_tcm:         { kind: pe_tcm,         impl: builtin.pe_tcm,         attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0 } }
+      pe_tcm:         { kind: pe_tcm,         impl: builtin.pe_tcm,         attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0, kernel_scratch_mb: 1 } }
+      pe_ipcq:        { kind: pe_ipcq,        impl: builtin.pe_ipcq,        attrs: { overhead_ns: 0.0 } }
     links:
       pe_cpu_to_scheduler_mm:  0.5
       scheduler_to_dma_mm:     0.5
@@ -88,6 +89,9 @@ cube:
       gemm_to_tcm_mm:          0.5
       math_to_tcm_bw_gbs:      512.0
       math_to_tcm_mm:          0.5
+      cpu_to_ipcq_mm:          0.5     # PE_CPU → PE_IPCQ (ADR-0023)
+      ipcq_to_dma_mm:          0.0     # PE_IPCQ → PE_DMA token forwarding (ADR-0023)
+      dma_to_ipcq_mm:          0.0     # PE_DMA → PE_IPCQ metadata arrival (ADR-0023)
 
   memory_map:
     hbm_total_gb_per_cube: 48