Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

New intercube allreduce kernel replacing the old flat ring algorithms. Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then inter-SIP exchange on root cube, then broadcast back. Supports ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce. New files: - src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel) - src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip) - tests/test_allreduce_multidevice.py (config-driven, 3 topologies) - tests/test_distributed_intercube_allreduce.py (full distributed path) - tests/test_intercube_sfr_config.py (SFR wiring verification) Modified: - distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip - topologies.py: added torus_2d, mesh_2d_no_wrap - install.py: global_E/W/N/S in _OPPOSITE_DIR - topology.yaml: added system.sips.topology - ccl.yaml: single intercube_allreduce algorithm - benches/ccl_allreduce.py: row_wise cube-mesh tensor layout Removed old flat-ring algorithms and their tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Refactor ccl_allreduce bench: rank=SIP only, remove rank=PE legacy path
2026-04-16 17:33:42 -07:00 · 2026-04-14 16:45:27 -07:00 · 2026-04-14 16:31:13 -07:00 · 2026-04-14 14:13:26 -07:00 · 2026-04-14 13:02:19 -07:00 · 2026-04-14 12:46:33 -07:00
175 changed files with 24289 additions and 2706 deletions
@@ -29,3 +29,4 @@ build/

 # Logs
 *.log
+.claude/
@@ -104,7 +104,7 @@ The simulator MUST accept multiple topologies (YAML / JSON / dict), varying:
 - SIP count,
 - CUBE count per SIP,
 - PE count per CUBE,
- on-chip fabric structure (e.g., mesh / NoC / XBAR),
+- on-chip fabric structure (e.g., mesh / NoC router grid),
 - IO chiplets and interconnects,
 - link bandwidth, latency, and capacity parameters.

@@ -119,8 +119,7 @@ Given a topology:

 All components MUST be replaceable behind stable interfaces, including:

- routers and fabrics (NoC, bridges, switches),
- XBAR-like selectors,
+- routers and fabrics (NoC router mesh, switches),
 - DMA engines and queues,
 - memory controllers and services (HBM, TCM, queues),
 - management and control processors (modeled components).
@@ -226,7 +225,7 @@ No implicit translation or hidden latency is allowed.

 ### 2.1 Graph Execution Model

- Nodes represent modeled components (PE blocks, XBAR, NoC, bridges,
+- Nodes represent modeled components (PE blocks, NoC routers,
  HBM controllers, IO components, etc.).
 - Directed edges represent interconnect links with latency and bandwidth attributes.
 - Execution model:
@@ -0,0 +1,103 @@
+"""CCL all-reduce bench (ADR-0024 + ADR-0027).
+
+Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
+tensor sharded row-wise across the cube mesh (pe0 per cube). After
+``dist.all_reduce(op="sum")`` every cube on every rank must hold
+``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
+
+Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
+(SIP count → world_size, cube_mesh → N_CUBES).
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
+from kernbench.policy.placement.dp import DPPolicy
+
+DEFAULT_N_ELEM = 8
+
+
+@dataclass(frozen=True)
+class _BenchCfg:
+    algorithm: str
+    n_elem: int
+    n_cubes: int
+    world_size: int
+
+
+def _resolve_cfg(torch) -> _BenchCfg:
+    """Read ccl.yaml + topology once at host side."""
+    merged = resolve_algorithm_config(load_ccl_config())
+    ws = torch.distributed.get_world_size()
+    spec = torch.spec or {}
+    n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+    if ws != n_sips:
+        raise RuntimeError(
+            f"ccl_allreduce bench requires world_size == topology SIP count "
+            f"(world_size={ws}, n_sips={n_sips})."
+        )
+    cm = spec.get("sip", {}).get("cube_mesh", {})
+    n_cubes = int(cm.get("w", 4)) * int(cm.get("h", 4))
+    return _BenchCfg(
+        algorithm=merged["algorithm"],
+        n_elem=int(merged.get("n_elem", DEFAULT_N_ELEM)),
+        n_cubes=n_cubes,
+        world_size=ws,
+    )
+
+
+def _rank_dp(n_cubes: int) -> DPPolicy:
+    return DPPolicy(cube="row_wise", pe="replicate", num_cubes=n_cubes, num_pes=1)
+
+
+def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
+    """Allocate this rank's ``(n_cubes, n_elem)`` tensor on its SIP."""
+    return torch.zeros(
+        (cfg.n_cubes, cfg.n_elem), dtype="f16",
+        dp=_rank_dp(cfg.n_cubes), name=f"ccl_in_r{rank}",
+    )
+
+
+def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
+    """Fill all cubes with the scalar ``rank + 1``."""
+    arr = np.full((cfg.n_cubes, cfg.n_elem), float(rank + 1), dtype=np.float16)
+    tensor.copy_(torch.from_numpy(arr))
+
+
+def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
+    """Single-line pass/fail printer (rank 0 only)."""
+    expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))
+    ok = True
+    for cube_id in range(cfg.n_cubes):
+        if not np.allclose(result[cube_id], expected, rtol=1e-1, atol=1e-1):
+            ok = False
+            break
+    if ok:
+        total = cfg.world_size * cfg.n_cubes
+        print(f"  {cfg.algorithm} (ws={cfg.world_size}): {total} OK")
+        return
+    got = float(result.reshape(-1).mean())
+    print(
+        f"  [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
+        f"got mean={got:.3f}, expected={expected:.3f}"
+    )
+
+
+def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
+    torch.ahbm.set_device(rank)
+    tensor = _allocate_rank_tensor(torch, rank, cfg)
+    _init_with_rank_value(torch, tensor, rank, cfg)
+    torch.distributed.all_reduce(tensor, op="sum")
+    if rank == 0:
+        _report(tensor.numpy(), cfg)
+
+
+def run(torch) -> None:
+    torch.distributed.init_process_group(backend="ahbm")
+    cfg = _resolve_cfg(torch)
+    torch.multiprocessing.spawn(
+        _worker, args=(cfg, torch), nprocs=cfg.world_size,
+    )
@@ -3,7 +3,7 @@
 Full host-to-PE pipeline:
  Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM

-Single PE: num_sips=1, num_cubes=1, num_pes=1 via DPPolicy override.
+Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
 Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
 per-tile DMA internally.

@@ -30,7 +30,7 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
 def run(torch):
    """Run the single-PE GEMM benchmark."""
    dp = DPPolicy(cube="replicate", pe="replicate",
-                  num_sips=1, num_cubes=1, num_pes=1)
+                  num_cubes=1, num_pes=1)

    a   = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
    b   = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
@@ -72,12 +72,16 @@ def run(torch):
    K = GPT3_D_MODEL
    N = COLS_PER_PE

-    # X: replicated across all PEs
+    # ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
+    # ADR-0024 launcher calls this bench once per SIP (each worker via
+    # torch.ahbm.set_device(rank)); here the policy describes only the
+    # cube × PE layout within a single SIP.
+    # X: replicated across all PEs within the SIP
    dp_replicate = DPPolicy(cube="replicate", pe="replicate",
-                            num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
-    # W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs
+                            num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
+    # W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
    dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
-                          num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
+                          num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)

    x     = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
    wq    = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
@@ -9,29 +9,32 @@ from kernbench.runtime_api.context import RuntimeContext
 BenchFn = Callable[[RuntimeContext], Any]


+def _load_module(bench_id: str):
+    bench_id = bench_id.strip()
+    if not bench_id:
+        raise ValueError("Bench id is empty.")
+    module_path = f"benches.{bench_id}"
+    try:
+        return importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        raise ValueError(
+            f"Unknown bench '{bench_id}'. Expected module {module_path}.py"
+        ) from e
+
+
 def resolve_bench(bench_id: str) -> BenchFn:
-    """
-    Resolve a bench id into a callable bench function.
+    """Resolve a bench id into its ``run(torch)`` callable.

    Expected layout (repo root):
        benches/<bench_id>.py
            def run(torch: RuntimeContext) -> Any
    """
-    bench_id = bench_id.strip()
-    if not bench_id:
-        raise ValueError("Bench id is empty.")
-
-    module_path = f"benches.{bench_id}"
-
-    try:
-        mod = importlib.import_module(module_path)
-    except ModuleNotFoundError as e:
-        raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e
-
+    mod = _load_module(bench_id)
    run_fn = getattr(mod, "run", None)
    if run_fn is None:
-        raise ValueError(f"Bench module {module_path} must define a 'run(torch)' function.")
+        raise ValueError(
+            f"Bench module benches.{bench_id} must define 'run(torch)'."
+        )
    if not callable(run_fn):
-        raise ValueError(f"'run' in {module_path} is not callable.")
-
+        raise ValueError(f"'run' in benches.{bench_id} is not callable.")
    return run_fn
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy

 # GEMM dimensions: (M, K) x (K, N) → (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) for fast regression. The test verifies the full
+# host→PE pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
 DTYPE = "f16"


@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy

 # GEMM dimensions: (M, K) x (K, N) -> (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) for fast regression. The test verifies the multi-PE
+# fan-out pipeline, not large-matrix throughput.
+M, K, N = 32, 64, 32
 DTYPE = "f16"


@@ -1,7 +1,7 @@
 """VA offset verification benchmark.

 Verifies that Triton-style base_ptr + pid * stride addressing works correctly
-with full TP sharding (sip/cube/pe all column_wise). Each PE loads its own
+with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
 block from a sharded tensor and stores it back.

 The kernel uses standard Triton patterns:
@@ -28,7 +28,7 @@ def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):

 def run(torch):
    """Run the VA offset verification benchmark with full TP sharding."""
-    dp = DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
+    dp = DPPolicy(cube="column_wise", pe="column_wise")
    src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
    dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")

@@ -0,0 +1,45 @@
+# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
+#
+# Loaded by AhbmCCLBackend at init_process_group time.
+# defaults.algorithm chooses which kernel + topology is installed
+# into PE_IPCQ neighbor tables. Host code is unaware of these settings.
+
+defaults:
+  # Algorithm to run for this benchmark execution.
+  algorithm: intercube_allreduce
+
+  # IPCQ ring buffer location.
+  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
+  #   hbm  — PE-local HBM (large, slower DMA latency)
+  #   sram — Cube-shared SRAM (medium, cube-internal contention)
+  buffer_kind: tcm
+
+  # Backpressure mode.
+  #   poll  — spin-loop polling of cached peer pointers
+  #   sleep — yield SimPy event, wake on credit return
+  backpressure: sleep
+
+  # Ring depth: number of slots per (direction, tx|rx) buffer.
+  n_slots: 4
+
+  # Slot size in bytes (must hold one tile worth of data).
+  slot_size: 4096
+
+  # PE_DMA virtual channel chunk size (D8).
+  vc_chunk_size: 256
+
+  # Credit return fast path message size (D9).
+  ipcq_credit_size_bytes: 16
+
+algorithms:
+  # ── intercube all-reduce (pe0-only, cube mesh + inter-SIP) ──
+  # Reduces across the 4×4 cube mesh within each SIP, then inter-SIP
+  # exchange on root cube, then broadcast back. SIP topology is read
+  # from topology.yaml → system.sips.topology. Kernel auto-selects
+  # ring / torus / mesh inter-SIP exchange pattern.
+  intercube_allreduce:
+    module: kernbench.ccl.algorithms.intercube_allreduce
+    topology: none
+    buffer_kind: tcm
+    n_elem: 8
+    root_cube: 15
@@ -2,6 +2,10 @@
 # Maps impl names (used in topology.yaml) to Python class paths.
 # Format: impl_name: module.path:ClassName
 #
+# Naming convention:
+#   builtin.<name>  — built-in implementations
+#   custom.<name>   — user-defined implementations
+#
 # ── Adding custom components ──────────────────────────────────────────
 #
 # 1. Create your implementation in:
@@ -10,44 +14,43 @@
 #    Your class must inherit from ComponentBase (or PeEngineBase for PE engines).
 #
 # 2. Register it below under "Custom" with a unique impl name:
-#      my_pe_cpu_v2: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
+#      custom.my_pe_cpu: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
 #
 # 3. Reference it in topology.yaml:
-#      pe_cpu: { kind: pe_cpu, impl: my_pe_cpu_v2, attrs: { ... } }
+#      pe_cpu: { kind: pe_cpu, impl: custom.my_pe_cpu, attrs: { ... } }
 #
 # 4. Add unit tests in:
 #      tests/custom/test_<your_component>.py
 #
 # External packages also work — use the full module path:
-#      fast_gemm_v1: my_team.accel.fast_gemm:FastGemmComponent
+#      custom.fast_gemm: my_team.accel.fast_gemm:FastGemmComponent
 # ──────────────────────────────────────────────────────────────────────

 components:
  # Infrastructure
-  forwarding_v1:  kernbench.components.builtin.forwarding:TransitComponent
-  switch_v1:      kernbench.components.builtin.forwarding:TransitComponent
-  noc_v1:         kernbench.components.builtin.forwarding:TransitComponent
-  ucie_v1:        kernbench.components.builtin.forwarding:TransitComponent
-  noc_2d_mesh_v1: kernbench.components.builtin.noc:TwoDMeshNocComponent
-  xbar_v1:        kernbench.components.builtin.xbar:PositionAwareXbarComponent
+  builtin.forwarding:  kernbench.components.builtin.forwarding:TransitComponent
+  builtin.switch:      kernbench.components.builtin.forwarding:TransitComponent
+  builtin.noc:         kernbench.components.builtin.forwarding:TransitComponent
+  builtin.ucie:        kernbench.components.builtin.forwarding:TransitComponent

  # IO / Host interface
-  pcie_ep_v1:     kernbench.components.builtin.pcie_ep:PcieEpComponent
-  io_cpu_v1:      kernbench.components.builtin.io_cpu:IoCpuComponent
+  builtin.pcie_ep:     kernbench.components.builtin.pcie_ep:PcieEpComponent
+  builtin.io_cpu:      kernbench.components.builtin.io_cpu:IoCpuComponent

  # Cube-level
-  m_cpu_v1:       kernbench.components.builtin.m_cpu:MCpuComponent
-  hbm_ctrl_v1:    kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
-  sram_v1:        kernbench.components.builtin.sram:SramComponent
+  builtin.m_cpu:       kernbench.components.builtin.m_cpu:MCpuComponent
+  builtin.hbm_ctrl:    kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
+  builtin.sram:        kernbench.components.builtin.sram:SramComponent

  # PE-level
-  pe_cpu_v1:       kernbench.components.builtin.pe_cpu:PeCpuComponent
-  pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
-  pe_dma_v1:       kernbench.components.builtin.pe_dma:PeDmaComponent
-  pe_gemm_v1:      kernbench.components.builtin.pe_gemm:PeGemmComponent
-  pe_math_v1:      kernbench.components.builtin.pe_math:PeMathComponent
-  pe_mmu_v1:       kernbench.components.builtin.pe_mmu:PeMmuComponent
-  pe_tcm_v1:       kernbench.components.builtin.pe_tcm:PeTcmComponent
+  builtin.pe_cpu:         kernbench.components.builtin.pe_cpu:PeCpuComponent
+  builtin.pe_scheduler:   kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
+  builtin.pe_dma:         kernbench.components.builtin.pe_dma:PeDmaComponent
+  builtin.pe_gemm:        kernbench.components.builtin.pe_gemm:PeGemmComponent
+  builtin.pe_math:        kernbench.components.builtin.pe_math:PeMathComponent
+  builtin.pe_fetch_store: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
+  builtin.pe_mmu:         kernbench.components.builtin.pe_mmu:PeMmuComponent
+  builtin.pe_tcm:         kernbench.components.builtin.pe_tcm:PeTcmComponent
+  builtin.pe_ipcq:        kernbench.components.builtin.pe_ipcq:PeIpcqComponent

  # Custom — add your implementations here
-  pe_scheduler_v2: kernbench.components.custom.pe_accel.scheduler:SchedulerV2Component
@@ -34,12 +34,11 @@ shortcuts that obscure control paths.
  (topology + policy + request).

 ### D3. Bypass is explicit and graph-represented
- Any bypass (e.g., local cube HBM access via XBAR instead of NOC) must be:
-  - explicitly represented as a graph path, and
-  - subject to latency accumulation like any other path.
- Example: PE_DMA has dual egress — one to XBAR (HBM path) and one to NOC (non-HBM path).
-  Both are explicit graph edges; neither is a “bypass” — they are distinct data paths
-  serving different memory domains.
+- All paths must be explicitly represented in the graph and subject to latency accumulation.
+- Example: PE_DMA connects to the NOC router mesh (ADR-0019). All destinations
+  (HBM, shared SRAM, inter-cube UCIe) are reached via explicit mesh hops.
+  Local HBM access has minimal hops (switching overhead only); remote access
+  traverses additional routers.
 - Implicit or “magic” bypass paths are disallowed.

 ### D4. No zero-latency end-to-end paths
@@ -35,12 +35,11 @@ We model the system hierarchy explicitly:

 - A CUBE contains:
  - HBM + memory controller (HBM_CTRL)
-  - XBAR (top/bottom): HBM pseudo-channel crossbar, PE's dedicated path to HBM
-  - Bridge (left/right): connects XBAR.top ↔ XBAR.bottom for cross-half HBM access
-  - NOC: 2D mesh router grid spanning the entire cube with XY routing and
-    per-segment contention modeling; carries all intra-cube traffic including
-    PE DMA to xbar (HBM), inter-cube (UCIe), command (M_CPU↔PE_CPU), and
-    shared SRAM access. See ADR-0017 for full NOC architecture.
+  - NOC router mesh: 2D grid of explicit routers (from cube_mesh.yaml) with XY routing;
+    carries all intra-cube traffic including HBM data, inter-cube (UCIe),
+    command (M_CPU↔PE_CPU), and shared SRAM access.
+    HBM_CTRL is attached to PE routers (local HBM = 0 hop).
+    See ADR-0017 and ADR-0019 for full architecture.
  - Shared SRAM: cube-level shared memory accessible by all PEs via NOC
  - management/control CPU (M_CPU) coordinating PE command distribution and completion aggregation
  - multiple PEs
@@ -14,9 +14,9 @@ Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth,
 ### D1. Local HBM definition

 - Each PE is assigned a logically defined “local HBM” region.
- Local HBM corresponds to the pseudo-channel subset directly attached to that PE’s DMA path
-  via the XBAR (top or bottom, depending on PE corner placement).
- The path is: PE_DMA → XBAR.top/bottom → HBM_CTRL.
+- Local HBM corresponds to the pseudo-channel subset directly attached to that PE’s
+  router in the NOC mesh (ADR-0019).
+- The path is: PE_DMA → local router → HBM_CTRL (switching overhead only, 0 mesh hops).
 - The mapping (HBM pseudo-channels → PE local regions) is derived from topology configuration.

 ### D2. Local HBM bandwidth guarantee contract
@@ -27,19 +27,18 @@ Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth,
  The efficiency factor (configured via `hbm_ctrl.attrs.efficiency`, default 0.8)
  models real-world DRAM inefficiencies (refresh cycles, bank conflicts, page
  misses). For example: 256 GB/s spec x 0.8 = 204.8 GB/s effective.
- The topology builder applies the efficiency factor to xbar-to-hbm edge
+- The topology builder applies the efficiency factor to router-to-hbm edge
  bandwidth at graph construction time, so all downstream routing and latency
  computation uses the effective value.
 - This guarantee is modeled by:
  - a dedicated logical path and/or service model that enforces HBM BW at the PE-local-HBM interaction point,
  - while still incurring non-zero latency along explicitly modeled components.

-### D3. Cross-half HBM semantics
+### D3. Remote PE HBM semantics (intra-cube)

- A PE connected to XBAR.bottom that accesses HBM pseudo-channels on the XBAR.top half
-  (or vice versa) traverses a bridge:
-  - PE_DMA → XBAR.bottom → bridge → XBAR.top → HBM_CTRL
- Bridge bandwidth may limit cross-half HBM access relative to local-half access.
+- A PE that accesses another PE's local HBM traverses the router mesh:
+  - PE_DMA → local router → (mesh hops) → target PE's router → HBM_CTRL
+- Router mesh bandwidth and hop count may limit remote HBM access relative to local access.

 ### D4. Non-local HBM semantics (inter-cube / inter-SIP)

@@ -61,7 +60,7 @@ Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth,
 Tests should cover:

 - local-HBM case: BW matches HBM BW regardless of fabric BW parameter
- cross-half HBM case: latency includes bridge traversal
+- remote PE HBM case: latency includes mesh hop traversal
 - non-local cases (inter-cube/inter-SIP): BW/latency respond to fabric/link parameters
 - shared SRAM case: access via NOC with correct BW

@@ -82,9 +82,8 @@ Explain cube-internal structure and data/control flow.

 **Visible elements**

- XBAR (top/bottom): HBM pseudo-channel crossbar
- Bridge (left/right): cross-half HBM connectors between XBAR.top and XBAR.bottom
- NOC: distributed on-die fabric for non-HBM traffic
+- Router mesh: 2D grid of NOC routers (from cube_mesh.yaml), all traffic routes through mesh
+- HBM_CTRL attached to PE routers (local HBM = 0 hop)
 - HBM subsystem (HBM_CTRL)
 - Shared SRAM: cube-level shared memory
 - Management CPU (M_CPU)
@@ -97,14 +96,13 @@ Explain cube-internal structure and data/control flow.

 **Visible links**

- PE → XBAR (HBM data path, top or bottom by corner placement)
- PE → NOC (non-HBM data path)
- XBAR ↔ bridge ↔ XBAR (cross-half HBM access)
- XBAR → HBM_CTRL
- NOC ↔ UCIe endpoints
- NOC ↔ shared SRAM
- M_CPU ↔ NOC (command path)
- NOC → PE_CPU (command delivery, collapsed into PE block)
+- PE → router (HBM + non-HBM data path via mesh)
+- Router ↔ HBM_CTRL (local HBM access)
+- Router ↔ Router (mesh hops for remote access)
+- Router ↔ UCIe endpoints
+- Router ↔ shared SRAM
+- M_CPU ↔ router (command path)
+- Router → PE_CPU (command delivery, collapsed into PE block)

 ---

@@ -61,9 +61,9 @@ For each view (SIP / CUBE / PE):
  - preserve connectivity semantics relevant to that view,
  - compute distance buckets and assign layout layers deterministically.
 - CUBE-level projection MUST include:
-  - XBAR (top/bottom), bridge (left/right), NOC, HBM_CTRL, shared SRAM, M_CPU, UCIe ports,
+  - Router mesh (from cube_mesh.yaml), HBM_CTRL, shared SRAM, M_CPU, UCIe ports,
    and PEs as opaque blocks.
-  - Distinct edge kinds for HBM path (PE→XBAR) vs non-HBM path (PE→NOC).
+  - All paths (HBM, non-HBM, command) route through the same router mesh (ADR-0019).
 - Default anchors are implicit (ADR-0005) and MUST NOT require instance indices.

 ### D6. Output formats and determinism
@@ -44,14 +44,15 @@ Each PE contains the following logical components.
 **PE_DMA**

 - Handles memory transfers between PE_TCM and external memory domains.
- PE_DMA has **dual egress** at the CUBE level:
-  - **→ XBAR**: dedicated path to HBM (local and cross-half via bridge)
-  - **→ NOC**: path to non-HBM destinations (shared SRAM, inter-cube UCIe, etc.)
+- PE_DMA connects to the NOC router mesh at the CUBE level (ADR-0019):
+  - All destinations (HBM, shared SRAM, inter-cube UCIe) are reached via the router mesh
+  - Local HBM access: PE_DMA → local router → hbm_ctrl (switching overhead only)
+  - Remote/shared: PE_DMA → local router → (mesh hops) → destination
 - Supported directions include:
-  - HBM → PE_TCM (via XBAR)
-  - PE_TCM → HBM (via XBAR)
-  - PE_TCM → shared SRAM (via NOC)
-  - PE_TCM → other memory domains (via NOC, if supported by topology)
+  - HBM → PE_TCM (via router mesh)
+  - PE_TCM → HBM (via router mesh)
+  - PE_TCM → shared SRAM (via router mesh)
+  - PE_TCM → other memory domains (via router mesh, if supported by topology)

 **PE_GEMM**

@@ -251,7 +252,7 @@ Compute operations use a TCM-centric dataflow model.
 **Input path (HBM)**

 ```text
-HBM → XBAR → PE_DMA (DMA_READ) → PE_TCM
+HBM → router mesh → PE_DMA (DMA_READ) → PE_TCM
 ```

 **Input path (shared SRAM)**
@@ -268,14 +269,14 @@ Compute engines read input tensors from PE_TCM.
 PE_TCM → GEMM / MATH
 ```

-Weights for GEMM may optionally stream directly from HBM (via XBAR).
+Weights for GEMM may optionally stream directly from HBM (via router mesh).

 **Output path (HBM)**

 Compute results are written to PE_TCM, then DMA writes to HBM.

 ```text
-PE_TCM → PE_DMA (DMA_WRITE) → XBAR → HBM
+PE_TCM → PE_DMA (DMA_WRITE) → router mesh → HBM
 ```

 **Output path (shared SRAM)**
@@ -347,9 +348,9 @@ PE instances are derived from `cube.pe_layout`.

 External connectivity such as:

- PE_DMA → XBAR (HBM data path)
- PE_DMA → NOC (non-HBM data path: shared SRAM, inter-cube UCIe)
- NOC → PE_CPU (command path from M_CPU)
+- PE_DMA → router mesh → HBM (data path, ADR-0019)
+- PE_DMA → router mesh → shared SRAM, inter-cube UCIe (non-HBM data path)
+- router mesh → PE_CPU (command path from M_CPU)

 is modeled at the CUBE level (see ADR-0003 D3).

@@ -104,13 +104,13 @@ Kernel Launch routes through M_CPU for PE fan-out.
 ```text
 pcie_ep → io_noc → io_ucie
  → [transit cubes: ucie_in → router mesh → ucie_out]  (zero or more)
-  → target cube: ucie_in → noc → xbar → hbm_ctrl
+  → target cube: ucie_in → router mesh → hbm_ctrl
 ```

 **Memory R/W completion path:**

 ```text
-hbm_ctrl → xbar → noc → [transit cubes: ucie → noc → ucie]
+hbm_ctrl → router mesh → [transit cubes: ucie → router mesh → ucie]
  → io_ucie → io_noc → pcie_ep
 ```

@@ -49,7 +49,7 @@ Memory operations (MemoryWrite, MemoryRead) are routed directly from pcie_ep
 through io_noc to the target cube, bypassing io_cpu entirely:

 ```text
-pcie_ep → io_noc → conn → io_ucie → [cube UCIe] → noc → xbar → hbm_ctrl
+pcie_ep → io_noc → conn → io_ucie → [cube UCIe] → router mesh → hbm_ctrl
 ```

 This avoids the 10ns io_cpu overhead for pure data transfers. The simulation
@@ -16,9 +16,10 @@ architecture.

 ### D1. NOC node and router grid

-Each cube contains a single NOC topology node (`sip{S}.cube{C}.noc`)
-implemented as `noc_2d_mesh_v1`. Internally, the NOC models a 2D router
-grid generated by `mesh_gen.py`.
+Each cube contains a 2D router mesh generated by `mesh_gen.py`.
+Each router is a separate topology node (`sip{S}.cube{C}.r{row}c{col}`)
+implemented as `builtin.forwarding`. (Supersedes the original single-node
+`noc_2d_mesh_v1` design — see ADR-0019.)

 Grid properties:

@@ -82,8 +83,8 @@ PE4.cpu <--+         |       |         +--< PE6.cpu
                         |
                    UCIe-S (conn x4)

-xbar_top attached to: r0c0, r0c1, r1c4, r1c5 (top-half PE routers)
-xbar_bot attached to: r4c0, r4c1, r5c4, r5c5 (bottom-half PE routers)
+HBM attach: hbm_ctrl is also attached to every router that hosts a PE (ADR-0019 D1)
+(xbar_top/xbar_bot were removed by ADR-0019)
 ```

 ### D5. NOC edge bandwidths and distances
@@ -92,8 +93,7 @@ xbar_bot attached to: r4c0, r4c1, r5c4, r5c5 (bottom-half PE routers)
 | --- | --- | --- | --- |
 | PE_DMA -> NOC | 256.0 | Physical (PE pos) | Matches HBM slice BW |
 | NOC -> PE_CPU | - | 0.0 mm | Command path only |
-| NOC <-> xbar_top | 256.0 | 0.0 mm | Per xbar half |
-| NOC <-> xbar_bot | 256.0 | 0.0 mm | Per xbar half |
+| Router <-> HBM_CTRL | 256.0 | 0.0 mm | Per PE router (ADR-0019) |
 | NOC <-> M_CPU | - | 0.0 mm | Command path |
 | NOC <-> SRAM | 128.0 x4 | 0.0 mm | 512 GB/s aggregate |
 | NOC <-> UCIe conn | 128.0 | 0.0 mm | Per connection, 4 per port |
@@ -117,7 +117,7 @@ Inter-cube traffic path:
 ```text
 Source: PE_DMA -> NOC -> conn{i} -> ucie-{PORT}
                    [UCIe link: 512 GB/s, 1.0mm seam distance]
-Target: ucie-{PORT} -> conn{i} -> NOC -> xbar -> HBM
+Target: ucie-{PORT} -> conn{i} -> r{x}c{y} -> (mesh hops) -> hbm_ctrl
 ```

 UCIe overhead (8.0 ns) is applied at each ucie-{PORT} node, so a
@@ -128,31 +128,31 @@ full crossing incurs 16 ns (TX port + RX port).
 **PE DMA to local HBM:**

 ```text
-PE_DMA -> NOC -> xbar_top -> HBM_CTRL.slice{0-3}
+PE_DMA -> r{x}c{y} -> hbm_ctrl  (local: 0 mesh hops, switching overhead only)
 ```

-**PE DMA to cross-half HBM:**
+**PE DMA to remote PE's HBM:**

 ```text
-PE_DMA -> NOC -> xbar_top -> bridge -> xbar_bot -> HBM_CTRL.slice{4-7}
+PE_DMA -> r{x}c{y} -> (mesh hops) -> r{x'}c{y'} -> hbm_ctrl
 ```

 **PE DMA to remote cube HBM:**

 ```text
-PE_DMA -> NOC -> conn -> ucie-E -> [seam] -> ucie-W -> conn -> NOC -> xbar -> HBM
+PE_DMA -> r{x}c{y} -> conn -> ucie-E -> [seam] -> ucie-W -> conn -> r{x'}c{y'} -> hbm_ctrl
 ```

 **Kernel Launch command to PE:**

 ```text
-[from io_noc] -> ucie -> conn -> NOC -> M_CPU -> NOC -> PE_CPU
+[from io_noc] -> ucie -> conn -> r{x}c{y} -> (mesh hops) -> M_CPU -> (mesh hops) -> PE_CPU
 ```

 **Shared SRAM access:**

 ```text
-PE_DMA -> NOC -> SRAM
+PE_DMA -> r{x}c{y} -> (mesh hops) -> SRAM
 ```

 ### D8. Mesh generation
@@ -169,7 +169,7 @@ The generator produces a `mesh_data` dictionary containing:
 - PE-to-router attachments (pe_dma, pe_cpu per PE)
 - UCIe-to-router attachments (N/S/E/W, distributed across edge routers)
 - M_CPU and SRAM router attachments
- xbar_top/bot router assignments (top-half vs bottom-half PE routers)
+- HBM attachment per PE router (ADR-0019)

 ## Consequences

@@ -182,8 +182,8 @@ The generator produces a `mesh_data` dictionary containing:
 ## Links

 - ADR-0003 D3 (cube-level NOC definition — extended by this ADR)
- ADR-0004 D1 (PE DMA to local HBM path via xbar)
- ADR-0004 D3 (cross-half HBM via bridge)
- ADR-0014 D1 (PE_DMA dual egress: xbar for HBM, NOC for non-HBM)
+- ADR-0004 D1 (PE DMA to local HBM path via router mesh)
+- ADR-0014 D1 (PE_DMA egress via router mesh)
+- ADR-0019 (NOC-Local HBM — xbar/bridge removed, explicit router mesh)
 - ADR-0015 D4 (fabric paths for Memory R/W and Kernel Launch)
 - ADR-0016 D1 (IOChiplet io_noc — analogous pattern at IO chiplet level)
@@ -0,0 +1,441 @@
+# ADR-0018: LA-Based Memory Address Abstraction and HBM Channel Mapping Mode Introduction
+
+## Status
+
+Proposed
+
+## Context
+
+Kernbench simulates memory access between PE_DMA and Local-HBM within a CUBE.
+Currently, a VA-based access path is used; however, the following two channel mapping models
+are difficult to represent consistently.
+
+### Background: Local-HBM Pseudo Channel Structure
+
+The HBM in a CUBE consists of 32 or 64 pseudo channels.
+In the PE-Local-HBM model, each PE is responsible for an equal number of pseudo channels.
+
+Example: 64 pseudo channels, 8 PEs per cube -> each PE accesses 8 pseudo channels as local HBM
+
+Both the number of pseudo channels and the number of PEs are topology parameters.
+`N = hbm_pseudo_channels / pes_per_cube` (= channels_per_pe) determines
+the number of local channels per PE.
+
+The routing path BW between DMA and each pseudo channel matches the BW of each pseudo channel
+(e.g., 32 GB/s), so if a PE sends simultaneous requests to N channels, it can utilize the
+maximum memory BW.
+
+### Limitations of the Current VA Model
+
+When a PE's local HBM is split into N channels (e.g., 8), a separate request must be generated for each channel and sent to DMA.
+However, in the current architecture, the kernel generates requests with VA (`tl.load`)
+and passes them directly to DMA, making it difficult for PE_CPU to generate per-channel DMA requests.
+
+Therefore, instead of VA, we propose using **Logical Address (LA)**,
+where the **BAAW (Logical-to-Physical Mapping Unit)** inside PE_DMA
+converts LA to PA or a list of PAs based on segment-based mapping.
+
+### Two Channel Mapping Modes
+
+- **1:1 mode**: Creates and executes per-channel requests. Precise per-channel modeling.
+- **n:1 mode (default)**: Assumes interleaving across local HBM channels. Aggregated BW modeling.
+
+By supporting both modes, the overhead of the n:1 mode can be measured and evaluated.
+
+### Core Requirements
+
+- The effective bandwidth semantics of PE_DMA -> HBM_CTRL must be identical in both modes
+- The difference must only be in the request representation and resource modeling approach
+- The kernel programming model must not be changed
+- Physical channel information must not be exposed to the kernel
+
+### Existing Physical Address
+
+The current system's 51-bit Physical Address is defined in `policy/address/phyaddr.py`:
+
+```
+[50:47] rack_id (4 bit)
+[46:43] sip_id  (4 bit)
+[42:38] cube_id (5 bit, sip_seg)
+[37]    hbm_selector (1=HBM window)
+[36:0]  hbm_offset   (37 bit, 128GB per cube)
+```
+
+PA is used to represent the final routable canonical physical destination,
+and this role is preserved.
+However, the timing and policy of logical access -> physical request conversion are not clearly separated.
+
+---
+
+## Decision
+
+### D1. Introduction of LA (Logical Address) — Replacing VA
+
+The existing VA (Virtual Address) infrastructure is replaced with LA (Logical Address).
+
+#### Characteristics of LA
+
+- Like VA, tensors can be mapped to a contiguous memory space
+- Represents logical buffer + offset
+- Does not directly contain physical channel information
+- An intermediate abstraction maintained until physical resolution
+- The sole address scheme used by kernel code (`tl.load`, `tl.store`, `tl.composite`)
+
+#### LA Space Definition
+
+| Item | Value |
+|------|-------|
+| LA start address | `0x1_0000_0000` (4 GB, preserving the existing VA start point) |
+| LA space size | 64 GB per PE |
+| Alignment unit | Segment-based (see D3 below) |
+
+LA is a PE-local address space.
+Even if different PEs use the same LA value, they resolve to different PAs
+because each PE has a different BAAW segment table.
+
+#### VA Infrastructure Removal Scope
+
+With the introduction of LA, the following existing code will be replaced/removed:
+
+| Removal Target | Replacement |
+|----------------|-------------|
+| `policy/address/va_allocator.py` (VirtualAllocator) | LA allocator (same free-list approach, name/role changed) |
+| `policy/address/pe_mmu.py` (PeMMU) | BAAW segment table (inside PE_DMA) |
+| `components/builtin/pe_mmu.py` (PeMmuComponent) | Removed — BAAW is internal PE_DMA logic, not a separate component |
+| `runtime_api/kernel.py`: MmuMapMsg, MmuUnmapMsg | Replaced with BaawSegmentInstallMsg |
+| `runtime_api/context.py`: VA alloc + MMU mapping install | LA alloc + BAAW segment install |
+| `runtime_api/tensor.py`: `va_base` field | `la_base` field |
+| `topology.yaml`: pe_mmu component entry | Removed |
+
+---
+
+### D2. Mapping Mode Configuration
+
+The mapping mode is configured at the cube level in topology.yaml:
+
+```yaml
+cube:
+  memory_map:
+    hbm_mapping_mode: n_to_one    # one_to_one | n_to_one
+    hbm_pseudo_channels: 64       # total pseudo channel count
+    hbm_channels_per_pe: 8        # local channel count per PE
+    hbm_channel_bw_gbs: 32.0      # per-channel bandwidth
+```
+
+This configuration is referenced during graph compilation (topology builder) and BAAW initialization.
+
+---
+
+### D3. Segments and BAAW
+
+#### Segment Definition
+
+A segment is a logical allocation unit that partitions the LA space so that each segment
+maps to a specific HBM channel or channel group.
+
+Segments are created by the runtime allocator during tensor deployment,
+and BAAW uses them to convert LA into physical requests.
+
+#### BAAW Segment Table Entry
+
+```python
+@dataclass
+class BaawSegment:
+    la_base: int          # segment start LA
+    la_size: int          # segment size (bytes)
+    mode: str             # "one_to_one" | "n_to_one"
+    # 1:1 mode fields
+    channel_count: int    # number of channels assigned to this segment (e.g., 8)
+    pa_bases: list[int]   # per-channel PA start address list (len = channel_count)
+    channel_ids: list[int]  # per-channel logical IDs (e.g., [0,1,2,...,7])
+    channel_size: int     # per-channel size (la_size // channel_count)
+    # n:1 mode fields
+    agg_pa_base: int      # aggregated PA start address
+    agg_node_id: str      # aggregated router node_id (for routing)
+```
+
+#### Segment Lifecycle
+
+1. **Allocation time** (tensor deploy):
+   - RuntimeContext allocates LA space from the LA allocator
+   - PEMemAllocator allocates per-channel PA (1:1) or aggregated PA (n:1)
+   - Sends `BaawSegmentInstallMsg` to PE_DMA to register in the segment table
+
+2. **Usage time** (kernel execution):
+   - Kernel issues `tl.load(la_ptr)` -> DmaReadCmd(src_addr=LA)
+   - PE_DMA looks up the segment corresponding to the LA in BAAW
+   - Converts to PA(s) according to the mode
+
+3. **Deallocation time** (tensor free):
+   - Removed from the segment table
+   - LA space returned, PA deallocated
+
+---
+
+### D4. BAAW (Logical-to-Physical Mapping Unit)
+
+#### Location
+
+BAAW is placed as a front-end stage inside PE_DMA.
+It is not a separate SimPy component; it is synchronous address resolution logic
+executed at the beginning of PE_DMA's `handle_command()`.
+
+#### Input
+
+- LA (Logical Address) — DmaReadCmd.src_addr or DmaWriteCmd.dst_addr
+- access size (bytes)
+
+#### Output
+
+- 1:1 mode: `list[PhysicalRequest]` — each request is (PA, nbytes, channel_node_id)
+- n:1 mode: 1 `PhysicalRequest` — (agg_PA, nbytes, agg_node_id)
+
+```python
+@dataclass
+class PhysicalRequest:
+    pa: int           # 51-bit Physical Address
+    nbytes: int       # transfer size for this request
+    dst_node: str     # target node_id (channel router or aggregated router)
+```
+
+#### BAAW Resolve Logic
+
+```python
+def resolve(self, la: int, nbytes: int) -> list[PhysicalRequest]:
+    seg = self._find_segment(la)  # la_base <= la < la_base + la_size
+    offset = la - seg.la_base
+
+    if seg.mode == "n_to_one":
+        pa = seg.agg_pa_base + offset
+        return [PhysicalRequest(pa=pa, nbytes=nbytes, dst_node=seg.agg_node_id)]
+
+    elif seg.mode == "one_to_one":
+        requests = []
+        per_ch_size = seg.channel_size
+        for i, (pa_base, ch_id) in enumerate(zip(seg.pa_bases, seg.channel_ids)):
+            ch_offset = offset % per_ch_size  # interleaved or striped
+            ch_nbytes = nbytes // seg.channel_count
+            pa = pa_base + ch_offset
+            dst_node = f"{self._pe_prefix}.ch_r{ch_id}"
+            requests.append(PhysicalRequest(pa=pa, nbytes=ch_nbytes, dst_node=dst_node))
+        return requests
+```
+
+#### Scope of Responsibility
+
+BAAW is responsible for:
+- Converting logical accesses into physical request units
+- Performing fan-out (1:1) or pass-through (n:1) according to the mapping mode
+- Generating Physical Addresses and determining target nodes
+
+BAAW is NOT responsible for:
+- Performing actual data movement
+- Executing NOC routing
+- Simulating bandwidth consumption (this is the role of downstream components)
+
+#### Output Contract
+
+The output of BAAW must be request units that can be directly used by the simulator's
+routing and resource model without any additional address decoding.
+
+---
+
+### D5. PE_DMA handle_command() Changes
+
+#### Current Flow (VA-based)
+
+```
+DmaReadCmd.src_addr (VA)
+  -> MMU.translate(VA) -> PA
+  -> PhysAddr.decode(PA) -> PhysAddr object
+  -> resolver.resolve(PhysAddr) -> dst_node_id (e.g., "sip0.cube0.hbm_ctrl")
+  -> router.find_path(pe_prefix, dst_node_id) -> path
+  -> 1 sub-Transaction created -> fabric inject
+```
+
+#### New Flow (LA-based)
+
+```
+DmaReadCmd.src_addr (LA)
+  -> BAAW.resolve(LA, nbytes) -> list[PhysicalRequest]
+  -> For each PhysicalRequest:
+      -> router.find_path(pe_prefix, req.dst_node) -> path
+      -> compute_drain_ns(path, req.nbytes) -> drain
+      -> sub-Transaction created -> fabric inject
+  -> Wait for all sub-Transactions to complete
+  -> pe_txn.done.succeed()
+```
+
+Key changes:
+- MMU reference removed -> replaced with BAAW resolve
+- PhysAddr.decode() + resolver.resolve() -> BAAW directly returns dst_node
+- 1 request -> N requests injected in parallel (1:1 mode)
+
+---
+
+### D6. 1:1 Mode Details
+
+- One logical access -> N (= `channels_per_pe`) physical requests
+- N is a parameter determined by `hbm_pseudo_channels / pes_per_cube`
+- Each request:
+  - Fully resolved 51-bit PA
+  - Targets a specific channel router (`{pe_prefix}.ch_r{channel_id}`)
+- BW contention modeling via per-channel links
+- PE_DMA injects N sub-transactions simultaneously
+
+#### 1:1 Mode Example
+
+Configuration: `hbm_pseudo_channels=64`, `pes_per_cube=8`
+-> `channels_per_pe=8`, PE0 owns ch0-7
+
+```text
+Tensor A (4 KB) -> LA 0x1_0000_0000, size=4096 bytes
+BAAW segment: {
+    la_base: 0x1_0000_0000, la_size: 4096,
+    mode: "one_to_one", channel_count: 8,  # = channels_per_pe
+    pa_bases: [PA_ch0, PA_ch1, ..., PA_ch7],
+    channel_ids: [0, 1, 2, 3, 4, 5, 6, 7],
+    channel_size: 512,  # = la_size / channel_count
+}
+
+BAAW resolve result (N=8 requests):
+  -> PhysicalRequest(pa=PA_ch0, nbytes=512, dst_node="sip0.cube0.pe0.ch_r0")
+  -> PhysicalRequest(pa=PA_ch1, nbytes=512, dst_node="sip0.cube0.pe0.ch_r1")
+  -> ...
+  -> PhysicalRequest(pa=PA_ch7, nbytes=512, dst_node="sip0.cube0.pe0.ch_r7")
+
+PE_DMA: N sub-transactions injected in parallel
+  Each accesses HBM via channel router -> hbm_ctrl link (channel_bw_gbs)
+  Total effective BW = N x channel_bw_gbs
+```
+
+Examples with different N values:
+- `hbm_pseudo_channels=32`, `pes_per_cube=8` -> `channels_per_pe=4`, 4 requests
+- `hbm_pseudo_channels=64`, `pes_per_cube=4` -> `channels_per_pe=16`, 16 requests
+
+---
+
+### D7. n:1 Mode Details
+
+- One logical access -> one aggregated request
+- Target: aggregated router -> hbm_ctrl (see ADR-0019)
+- Aggregated link BW = `channels_per_pe` x `channel_bw_gbs` (e.g., 8 x 32 = 256 GB/s)
+- Modeled as a single queue / resource
+- No per-channel PA decomposition
+
+#### n:1 Mode Example
+
+```
+Tensor A (4 KB) -> LA 0x1_0000_0000, size=4096 bytes
+BAAW segment: {
+    la_base: 0x1_0000_0000, la_size: 4096,
+    mode: "n_to_one",
+    agg_pa_base: PA_agg,
+    agg_node_id: "sip0.cube0.pe0.agg_router",
+}
+
+BAAW resolve result:
+  -> PhysicalRequest(pa=PA_agg, nbytes=4096, dst_node="sip0.cube0.pe0.agg_router")
+
+PE_DMA: 1 sub-transaction injected
+  Accesses HBM via aggregated router -> hbm_ctrl link (256 GB/s)
+```
+
+---
+
+### D8. Kernel Model Preservation
+
+- The kernel still issues only single memory ops (`tl.load`, `tl.store`, `tl.composite`)
+- LA is the address scheme passed to the kernel
+- Channel decomposition/aggregation is performed by BAAW inside PE_DMA
+- Physical channel information is not exposed to kernel code
+
+---
+
+## Consequences
+
+### Positive
+
+- 1:1 vs n:1 semantics are clearly separated at a single point: BAAW
+- Kernel abstraction is preserved — no kernel code changes required
+- Topology-based policy control is possible (mode switching via yaml)
+- Improved simulation model consistency and debuggability
+- Segment-based mapping is simpler and has lower overhead compared to page tables
+
+### Negative
+
+- Full refactoring of VA/MMU-based code is required
+- Increased complexity in the request generation path (managing N requests in 1:1 mode)
+- Reduced per-channel visibility in n:1 mode
+- Existing VA-related tests must be rewritten
+
+---
+
+## Alternatives
+
+### A1. Keep VA + Fan-out at MMU
+
+- Extend MMU to return per-channel PAs
+- Problem: MMU's role expands beyond address translation to include request decomposition
+- Problem: Aggregation representation is difficult in n:1 mode
+
+### A2. Kernel Generates Channel-Aware Requests
+
+- Kernel directly calls per-channel load/store
+- Problem: Abstraction leakage, reduced portability
+- Problem: All benchmark code must be modified
+
+### A3. Always Use PA (Without LA)
+
+- Runtime directly passes per-channel PA to the kernel
+- Problem: Conflicts with the aggregation model
+- Problem: Conversion timing is unclear, channel information exposed to kernel
+
+---
+
+## Implementation Notes
+
+### Implementation Order
+
+1. Introduce LA type (`policy/address/la_allocator.py`)
+2. Implement BAAW segment table (`policy/address/baaw.py`)
+3. Add `BaawSegmentInstallMsg` message type (`runtime_api/kernel.py`)
+4. Integrate BAAW into PE_DMA (`components/builtin/pe_dma.py` handle_command changes)
+5. Modify RuntimeContext: LA alloc + segment install (`runtime_api/context.py`)
+6. Change Tensor.va_base -> la_base (`runtime_api/tensor.py`)
+7. Remove VA/MMU code
+8. Remove pe_mmu from topology.yaml, add mapping mode configuration
+9. Test migration
+
+### Affected Existing Tests
+
+| Test File | Impact |
+|-----------|--------|
+| `tests/test_mmu_component.py` | Remove -> replace with BAAW segment install test |
+| `tests/test_mmu_fabric.py` | Remove -> replace with BAAW + fabric integration test |
+| `tests/test_pe_mmu.py` | Remove |
+| `tests/test_va_allocator.py` | Replace with LA allocator test |
+| `tests/test_va_integration.py` | Replace with LA + BAAW integration test |
+| `tests/test_va_offset.py` | Replace with LA offset test |
+
+---
+
+## Test Requirements
+
+- For the same logical access:
+  - 1:1 -> verify N requests are generated
+  - n:1 -> verify 1 aggregated request is generated
+- Verify effective bandwidth consistency across both modes
+- 1:1 -> verify per-channel contention modeling
+- n:1 -> verify aggregated bandwidth is reflected
+- Verify operation without kernel code changes
+- Verify correct BAAW segment install/uninstall operation
+- Verify no conflicts when multiple tensors are assigned to different segments
+
+---
+
+## Links
+
+- ADR-0011 (Memory Addressing Simplification — PA-first, VA/MMU introduction) -> superseded by this ADR
+- ADR-0019 (Per-Channel and Aggregated HBM Connection Models within CUBE NOC) -> topology-side integration
+- ADR-0014 (PE Internal Execution Model) -> PE_DMA change impact
@@ -247,7 +247,7 @@ simulator의 routing 및 resource 모델에서 직접 사용 가능한 request
 DmaReadCmd.src_addr (VA)
  → MMU.translate(VA) → PA
  → PhysAddr.decode(PA) → PhysAddr object
-  → resolver.resolve(PhysAddr) → dst_node_id (e.g., "sip0.cube0.hbm_ctrl.slice3")
+  → resolver.resolve(PhysAddr) → dst_node_id (e.g., "sip0.cube0.hbm_ctrl")
  → router.find_path(pe_prefix, dst_node_id) → path
  → 1개 sub-Transaction 생성 → fabric inject
 ```
@@ -0,0 +1,431 @@
+# ADR-0019: Per-Channel and Aggregated HBM Connection Models within CUBE NOC
+
+## Status
+
+Proposed
+
+## Context
+
+ADR-0018 introduced LA-based address abstraction and BAAW,
+defining how a logical memory access is translated into the following two forms of requests:
+
+- 1:1 mode: one logical access → N per-channel requests
+- n:1 mode: one logical access → one aggregated request
+
+Here N = `hbm_pseudo_channels / pes_per_cube` (= `channels_per_pe`),
+determined by topology parameters.
+
+### Problems with the Existing Structure
+
+In the current implementation (`topology/builder.py`):
+
+- PE_DMA → NOC → xbar_top/xbar_bot → HBM_CTRL.slice{0-7} path is used
+- HBM is modeled as 8 slice (= per-PE) nodes
+- Local/remote access use different paths:
+  - local: NOC → xbar → HBM slice
+  - cross-half: NOC → xbar_top → bridge → xbar_bot → HBM slice
+  - remote cube: NOC → UCIe → remote NOC → remote xbar → remote HBM slice
+
+Limitations of this structure:
+
+- Cannot model at the pseudo-channel granularity (slice = per-PE granularity, not per-channel)
+- xbar/bridge bifurcate local/remote paths
+- Cannot express 1:1 / n:1 modes consistently
+
+---
+
+## Decision
+
+### D1. HBM Attaches to PE Routers
+
+Consolidate the current `hbm_ctrl.slice{0-7}` (8 nodes) into a **single `hbm_ctrl` node**,
+and attach the HBM access point to the same router where the PE is attached.
+
+- n:1 mode: PE's local HBM access goes directly from its own router (switching overhead only, 0 hops)
+- Remote PE's HBM access: reaches the target PE's router via mesh hops
+- The read/write resource model within the HBM controller is preserved
+
+Node naming changes:
+
+| Current | After Change |
+| ---- | ------- |
+| `sip0.cube0.hbm_ctrl.slice0` ~ `slice7` | `sip0.cube0.hbm_ctrl` (single) |
+
+In `mesh_gen.py`, add `pe{idx}.hbm` to the PE attachment so that
+the builder generates an edge between that router and hbm_ctrl.
+
+---
+
+### D2. Complete Removal of xbar, bridge, and Single NOC Node
+
+Remove all of the following nodes and related edges:
+
+- `{cube}.xbar_top`, `{cube}.xbar_bot`
+- `{cube}.bridge.left`, `{cube}.bridge.right`
+- `{cube}.noc` (single TwoDMeshNocComponent node)
+- Edges of type `noc_to_xbar`, `xbar_to_noc`, `xbar_to_hbm`, `hbm_to_xbar`
+- Edges of type `xbar_to_bridge`, `bridge_to_xbar`
+- Edges of type `pe_to_noc`, `noc_to_pe`, `noc_to_pe_cpu`, etc. referencing the single noc node
+
+Their role is replaced by an **explicit router mesh based on cube_mesh.yaml**.
+Each router (r0c0, r0c1, ...) from the 6x6 router grid generated by `mesh_gen.py`
+is created as a separate SimPy node in the topology graph,
+and adjacent routers are connected via XY mesh edges.
+
+---
+
+### D3. Explicit Router Mesh (Common Basis for n:1 / 1:1)
+
+#### Router Nodes Based on cube_mesh.yaml
+
+Each non-null router from cube_mesh.yaml generated by `mesh_gen.py`
+is created as a **separate SimPy node** in the topology graph.
+
+- Node ID: `{cube}.r{row}c{col}` (e.g., `sip0.cube0.r0c0`)
+- kind: `noc_router`, impl: `forwarding_v1`
+- pos_mm: taken from cube_mesh.yaml
+
+Based on the attach information in cube_mesh.yaml, components are connected to each router:
+- `pe{p}.dma` → PE_DMA ↔ router edge
+- `pe{p}.cpu` → PE_CPU ↔ router edge
+- `pe{p}.hbm` → HBM_CTRL ↔ router edge (added in n:1)
+- `m_cpu` → M_CPU ↔ router edge
+- `sram` → SRAM ↔ router edge
+- `ucie_{dir}.c{i}` → UCIe conn ↔ router edge
+
+Router-to-router XY mesh edges: bidirectional edges between adjacent routers.
+Null routers (HBM exclusion zones) are skipped.
+
+#### 1:1 Mode Extension (To Be Implemented Later)
+
+In 1:1 mode, each router is split into N per-channel mini-routers.
+This requires introducing per-channel routing and a ChannelSplitter (LA → per-channel PA).
+N GEMM engines per PE are also added at this point.
+
+---
+
+### D4. Cross-PE HBM Access (n:1 Mode)
+
+In n:1 mode, when a PE accesses another PE's local HBM,
+it hops through the XY mesh in cube_mesh.yaml to reach the target PE's router.
+
+Example: PE0 (r0c0) accessing PE2's (r1c4) HBM:
+
+```text
+PE0.pe_dma → r0c0 → r0c1 → r0c2 → r0c3 → r0c4 → r1c4 → hbm_ctrl
+```
+
+The Dijkstra router finds the shortest path in the mesh.
+
+Cross-PE channel access in 1:1 mode will be defined during the 1:1 extension in D3.
+
+---
+
+### D5. n:1 Mode: Uses cube_mesh.yaml Router Mesh
+
+In n:1 mode, no separate "aggregated router" is created.
+The existing router grid from cube_mesh.yaml serves that role.
+
+#### Connection Structure
+
+PE_DMA, PE_CPU, and HBM are all connected to the router where each PE is attached:
+
+```text
+sip0.cube0.pe0.pe_dma ←→ sip0.cube0.r0c0  (bw: N × channel_bw_gbs)
+sip0.cube0.hbm_ctrl   ←→ sip0.cube0.r0c0  (bw: N × channel_bw_gbs)
+```
+
+Routers are connected via XY mesh edges. PE's local HBM access goes
+directly from its own router (switching overhead only).
+
+#### n:1 Mode Full Data Paths
+
+**Local HBM (0 hops):**
+```text
+PE0.pe_dma → r0c0 → hbm_ctrl  (switching overhead only)
+```
+
+**Remote HBM (mesh hops):**
+```text
+PE0.pe_dma → r0c0 → r0c1 → ... → r1c4 → hbm_ctrl
+```
+
+**M_CPU DMA:**
+```text
+M_CPU → r2c0 → (mesh hops) → r{x}c{y} → hbm_ctrl
+```
+
+---
+
+### D6. All Traffic Is Unified onto the Same Router Mesh
+
+- All memory accesses (DMA data) and commands (PE_CPU) use the same router mesh
+- Local access does not use a separate fast path (xbar)
+- Cross-cube (remote) access path:
+
+```text
+PE_DMA → r{x}c{y} → (mesh hops) → ucie_conn → ucie-{PORT}
+  → [UCIe link] → remote ucie → remote conn → remote r{x}c{y} → hbm_ctrl
+```
+
+UCIe connections maintain the existing structure,
+but both endpoints become mesh routers instead of xbars.
+
+The number of UCIe lines is determined by BW ratio: `ucie_lines_per_side = ceil(ucie_bw / noc_line_bw)`.
+
+---
+
+### D7. AddressResolver Changes
+
+Current `AddressResolver.resolve()`:
+
+```python
+# Current: HBM offset → pe_slice → "sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
+pe_slice = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
+return f"sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
+```
+
+After change:
+
+```python
+# Changed: HBM → single endpoint
+return f"sip{s}.cube{c}.hbm_ctrl"
+```
+
+The pe_slice calculation is removed.
+In n:1 mode, PE_DMA directly accesses the hbm_ctrl attached to its own router.
+
+resolver.resolve() is retained for external access (M_CPU DMA, etc.) and backward compatibility.
+
+---
+
+### D8. topology.yaml Configuration Changes
+
+#### Added Settings
+
+```yaml
+cube:
+  memory_map:
+    hbm_mapping_mode: n_to_one          # one_to_one | n_to_one
+    hbm_pseudo_channels: 64             # total pseudo channel count
+    hbm_channels_per_pe: 8              # local channels per PE (= pseudo_channels / pes_per_cube)
+    hbm_channel_bw_gbs: 32.0            # per-channel bandwidth (GB/s)
+    hbm_total_gb_per_cube: 48           # retained
+```
+
+#### Removed Settings
+
+```yaml
+# To be removed
+links:
+  xbar_to_hbm_bw_gbs: 256.0            # → replaced by channel_bw_gbs × channels_per_pe
+  xbar_to_hbm_mm: 2.5                  # → replaced by ch_router_to_hbm_mm
+  xbar_to_bridge_bw_gbs: 128.0         # → removed (no bridge)
+  xbar_to_bridge_mm: 3.0               # → removed
+  noc_to_xbar_bw_gbs: ...              # → removed
+  noc_to_xbar_mm: ...                  # → removed
+```
+
+#### Added Link Settings
+
+```yaml
+links:
+  router_link_bw_gbs: 256.0            # XY mesh link BW between routers
+  router_overhead_ns: 2.0              # router switching overhead
+  pe_to_router_bw_gbs: 256.0           # PE_DMA ↔ router
+  hbm_to_router_bw_gbs: 256.0          # HBM ↔ router (= N × channel_bw)
+```
+
+---
+
+### D9. Bandwidth Numerical Consistency
+
+| Configuration | Value |
+| ---- | --- |
+| pseudo channels per cube | 64 (parameter) |
+| PEs per cube | 8 (parameter) |
+| channels per PE (N) | `pseudo_channels / pes_per_cube` = 8 |
+| per-channel BW | 32 GB/s (parameter) |
+| per-PE local BW | N × 32 = 256 GB/s |
+| cube total HBM BW | 64 × 32 = 2048 GB/s |
+
+The effective BW per PE is identical in both modes:
+
+- 1:1 mode: N channel links × channel_bw_gbs = N × 32 = 256 GB/s
+- n:1 mode: 1 aggregated link = N × channel_bw_gbs = 256 GB/s
+
+---
+
+## Consequences
+
+### Positive
+
+- The router mesh based on cube_mesh.yaml accurately reflects physical placement
+- In n:1 mode, the existing VA scheme is preserved, keeping transition costs low
+- Local / remote / command traffic is unified onto the same mesh, resulting in simplicity
+- Aligns well with graph compiler-based topology generation
+- Channel count and PE count are both parameterized, enabling testing of various configurations
+- The 1:1 mode extension follows naturally by splitting each router into per-channel routers
+
+### Negative
+
+- The number of SimPy nodes increases due to explicit router nodes (6x6 grid; up to 32 non-null routers/cube)
+- Requires complete rewrite of existing xbar/bridge/single NOC-based tests
+- The internal contention model of TwoDMeshNocComponent needs to be replaced with a per-router model
+
+---
+
+## Alternatives
+
+### A1. Retain Existing xbar + HBM Slices
+
+- Local/remote paths remain bifurcated
+- Cannot model at pseudo-channel granularity
+- Cannot switch between 1:1/n:1 modes
+
+### A2. Always Generate Per-Channel Links and Aggregate Only in n:1
+
+- The topology graph is always as large as in 1:1 mode, even when running n:1
+- Expressing n:1 semantics via link aggregation is complex
+- No reduction in router node count
+
+### A3. Gradual Transition (Retain xbar + Add NOC Path)
+
+- Higher compatibility, but dual-path coexistence increases complexity
+- Since xbar removal is ultimately necessary, the intermediate step provides little value
+
+---
+
+## Implementation Notes
+
+### topology/builder.py Change Details
+
+#### Code to Remove (within current `_instantiate_cube()`)
+
+- xbar_top, xbar_bot node creation (~line 495-508)
+- bridge.left, bridge.right node creation
+- noc ↔ xbar edge creation (~line 540-555)
+- xbar ↔ hbm_ctrl.slice edge creation (~line 510-538)
+- xbar ↔ bridge edge creation (~line 557-572)
+
+#### Code to Add
+
+1:1 mode:
+
+```python
+N = hbm_channels_per_pe  # from topology config
+total_ch = hbm_pseudo_channels
+
+# Create channel router nodes
+for ch_id in range(total_ch):
+    pe_id = ch_id // N
+    nodes[f"{cp}.ch_r{ch_id}"] = Node(
+        id=f"{cp}.ch_r{ch_id}", kind="noc_router", impl="noc_v1",
+        attrs={}, pos_mm=(...),  # horizontal row = ch_id % N
+    )
+
+# PE_DMA ↔ local channel router edges
+for pe_id in range(pes_per_cube):
+    for local_ch in range(N):
+        ch_id = pe_id * N + local_ch
+        edges.append(Edge(
+            src=f"{cp}.pe{pe_id}.pe_dma", dst=f"{cp}.ch_r{ch_id}",
+            bw_gbs=channel_bw, kind="pe_to_ch_router", ...))
+        edges.append(Edge(
+            src=f"{cp}.ch_r{ch_id}", dst=f"{cp}.pe{pe_id}.pe_dma",
+            bw_gbs=channel_bw, kind="ch_router_to_pe", ...))
+
+# Channel router ↔ hbm_ctrl edges
+for ch_id in range(total_ch):
+    edges.append(Edge(
+        src=f"{cp}.ch_r{ch_id}", dst=f"{cp}.hbm_ctrl",
+        bw_gbs=channel_bw, kind="ch_router_to_hbm", ...))
+    edges.append(Edge(
+        src=f"{cp}.hbm_ctrl", dst=f"{cp}.ch_r{ch_id}",
+        bw_gbs=channel_bw, kind="hbm_to_ch_router", ...))
+
+# Horizontal line edges (same logical index)
+for row in range(N):
+    for p in range(pes_per_cube - 1):
+        ch_a = p * N + row
+        ch_b = (p + 1) * N + row
+        edges.append(Edge(
+            src=f"{cp}.ch_r{ch_a}", dst=f"{cp}.ch_r{ch_b}",
+            bw_gbs=ch_horizontal_bw, kind="ch_horizontal", ...))
+        edges.append(Edge(
+            src=f"{cp}.ch_r{ch_b}", dst=f"{cp}.ch_r{ch_a}",
+            bw_gbs=ch_horizontal_bw, kind="ch_horizontal", ...))
+```
+
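+n:1 mode (NOTE(review): this sketch creates per-PE `agg_router` nodes, whereas D5 states that no separate aggregated router is created — TODO reconcile):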
+
+```python
+# Create aggregated router nodes
+for pe_id in range(pes_per_cube):
+    nodes[f"{cp}.pe{pe_id}.agg_router"] = Node(
+        id=f"{cp}.pe{pe_id}.agg_router", kind="noc_router", impl="noc_v1",
+        attrs={}, pos_mm=(...),
+    )
+
+agg_bw = N * channel_bw  # aggregated BW
+
+# PE_DMA ↔ aggregated router
+for pe_id in range(pes_per_cube):
+    edges.append(Edge(
+        src=f"{cp}.pe{pe_id}.pe_dma", dst=f"{cp}.pe{pe_id}.agg_router",
+        bw_gbs=agg_bw, kind="pe_to_agg_router", ...))
+    edges.append(Edge(
+        src=f"{cp}.pe{pe_id}.agg_router", dst=f"{cp}.pe{pe_id}.pe_dma",
+        bw_gbs=agg_bw, kind="agg_router_to_pe", ...))
+
+# Aggregated router ↔ hbm_ctrl
+for pe_id in range(pes_per_cube):
+    edges.append(Edge(
+        src=f"{cp}.pe{pe_id}.agg_router", dst=f"{cp}.hbm_ctrl",
+        bw_gbs=agg_bw, kind="agg_to_hbm", ...))
+    edges.append(Edge(
+        src=f"{cp}.hbm_ctrl", dst=f"{cp}.pe{pe_id}.agg_router",
+        bw_gbs=agg_bw, kind="hbm_to_agg", ...))
+
+# Horizontal links between aggregated routers
+for p in range(pes_per_cube - 1):
+    edges.append(Edge(
+        src=f"{cp}.pe{p}.agg_router", dst=f"{cp}.pe{p+1}.agg_router",
+        bw_gbs=agg_horizontal_bw, kind="agg_horizontal", ...))
+    edges.append(Edge(
+        src=f"{cp}.pe{p+1}.agg_router", dst=f"{cp}.pe{p}.agg_router",
+        bw_gbs=agg_horizontal_bw, kind="agg_horizontal", ...))
+```
+
+### Affected Existing Tests
+
+| Test File | Impact |
+| ---------- | ---- |
+| `tests/test_topology_compile.py` | Remove xbar/bridge node references, add channel router verification |
+| `tests/test_topology_load.py` | Reflect topology.yaml configuration changes |
+| `tests/test_pe_components.py` | PE_DMA routing path changes |
+| `tests/test_sip_parallel.py` | Cross-PE access path changes |
+| Cases that directly test xbar/bridge | Remove |
+
+---
+
+## Test Requirements
+
+- Verify that requests are delivered via per-channel links in 1:1 mode
+- Verify that requests are delivered via the aggregated link in n:1 mode
+- Verify that topology is correctly generated in both modes:
+  - 1:1: `total_ch` channel routers + per-PE links + horizontal links
+  - n:1: `pes_per_cube` aggregated routers + per-PE links
+- Verify that effective BW is consistent across both modes for the same workload
+- Verify that horizontal line routing works for cross-PE access
+- Verify that routing through UCIe works for cross-cube access
+- Verify that topology generation is correct under parameter variations (channels_per_pe = 4, 8, 16, etc.)
+
+---
+
+## Links
+
+- ADR-0018 (LA + BAAW) → addressing-side integration
+- ADR-0017 (Cube NOC 2D Mesh) → this ADR replaces the xbar/bridge portion
+- ADR-0004 (Memory Semantics) → BW model redefinition
+- ADR-0014 (PE Internal Execution Model) → impact from PE_DMA path changes
@@ -36,16 +36,14 @@ topology 파라미터로 결정된다.

 ## Decision

-### D1. HBM controller는 CUBE당 단일 endpoint로 정의한다
+### D1. HBM은 PE 라우터에 attach된다

-현재의 `hbm_ctrl.slice{0-7}` (8개 노드)를 **`hbm_ctrl` 단일 노드**로 통합한다.
+현재의 `hbm_ctrl.slice{0-7}` (8개 노드)를 **`hbm_ctrl` 단일 노드**로 통합하고,
+PE가 attach된 라우터에 HBM access point도 함께 attach한다.

- pseudo channel은 HBM controller 노드 자체가 아니라,
-  controller에 연결되는 **link의 단위**로 표현한다
- HBM controller 내부의 read/write resource 모델은 유지하되,
-  mode에 따라 contention 단위가 달라진다:
-  - 1:1 mode: per-channel link가 BW contention point (controller는 terminal)
-  - n:1 mode: aggregated link가 BW contention point (controller는 terminal)
+- n:1 mode: PE의 local HBM 접근은 자기 라우터에서 바로 (switching overhead만, 0 hop)
+- remote PE의 HBM 접근: mesh hop을 거쳐 대상 PE의 라우터에 도달
+- HBM controller 내부의 read/write resource 모델은 유지

 노드 네이밍 변경:

@@ -53,198 +51,127 @@ topology 파라미터로 결정된다.
 | ---- | ------- |
 | `sip0.cube0.hbm_ctrl.slice0` ~ `slice7` | `sip0.cube0.hbm_ctrl` (단일) |

+`mesh_gen.py`에서 PE attachment에 `pe{idx}.hbm`을 추가하여,
+builder가 해당 라우터와 hbm_ctrl 간 edge를 생성한다.
+
 ---

-### D2. xbar, bridge 완전 제거
+### D2. xbar, bridge, 단일 NOC 노드 완전 제거

 기존 다음 노드 및 관련 edge를 모두 제거한다:

 - `{cube}.xbar_top`, `{cube}.xbar_bot`
 - `{cube}.bridge.left`, `{cube}.bridge.right`
+- `{cube}.noc` (단일 TwoDMeshNocComponent 노드)
 - `noc_to_xbar`, `xbar_to_noc`, `xbar_to_hbm`, `hbm_to_xbar` 종류의 edge
 - `xbar_to_bridge`, `bridge_to_xbar` 종류의 edge
+- `pe_to_noc`, `noc_to_pe`, `noc_to_pe_cpu` 등 단일 noc 노드 참조 edge

-이들의 역할(PE→HBM 라우팅, cross-half 연결)은
-channel router 및 horizontal line 연결이 대체한다 (D3, D4 참조).
+이들의 역할은 **cube_mesh.yaml 기반의 명시적 라우터 mesh**가 대체한다.
+기존 `mesh_gen.py`가 생성하는 6×6 라우터 grid의 각 라우터(r0c0, r0c1, ...)를
+별도의 SimPy 노드로 topology graph에 생성하고,
+인접 라우터 간 XY mesh edge로 연결한다.

 ---

-### D3. 1:1 mode: per-channel router 기반 연결
+### D3. 명시적 라우터 mesh (n:1 / 1:1 공통 기반)

-#### channel router 정의
+#### cube_mesh.yaml 기반 라우터 노드

-1:1 mode에서 graph compiler는 pseudo-channel 수만큼의 **channel router** 노드를
-생성한다. channel router는 NOC의 일부이다.
+`mesh_gen.py`가 생성한 cube_mesh.yaml의 각 non-null 라우터를
+topology graph의 **별도 SimPy 노드**로 생성한다.

-```text
-파라미터 예: hbm_pseudo_channels=64, pes_per_cube=8
-→ channels_per_pe = 8, 총 64개 channel router 생성
-```
+- 노드 ID: `{cube}.r{row}c{col}` (e.g., `sip0.cube0.r0c0`)
+- kind: `noc_router`, impl: `forwarding_v1`
+- pos_mm: cube_mesh.yaml에서 가져옴

-노드 네이밍: `{cube}.ch_r{global_channel_id}`
+기존 cube_mesh.yaml의 attach 정보에 따라 각 라우터에 component를 연결:
+- `pe{p}.dma` → PE_DMA ↔ 라우터 edge
+- `pe{p}.cpu` → PE_CPU ↔ 라우터 edge
+- `pe{p}.hbm` → HBM_CTRL ↔ 라우터 edge (n:1에서 추가)
+- `m_cpu` → M_CPU ↔ 라우터 edge
+- `sram` → SRAM ↔ 라우터 edge
+- `ucie_{dir}.c{i}` → UCIe conn ↔ 라우터 edge

-| PE | 소유 channel routers |
-| -- | -------------------- |
-| PE0 | ch_r0, ch_r1, ..., ch_r7 |
-| PE1 | ch_r8, ch_r9, ..., ch_r15 |
-| ... | ... |
-| PE7 | ch_r56, ch_r57, ..., ch_r63 |
+라우터 간 XY mesh edge: 인접 라우터 간 bidirectional edge.
+null 라우터(HBM exclusion zone)는 skip.

-일반화: PE `p`는 channel `p * channels_per_pe` ~ `(p+1) * channels_per_pe - 1`을 소유.
+#### 1:1 mode 확장 (나중에 구현)

-#### PE_DMA ↔ channel router 연결
-
-각 PE_DMA는 자신의 local channel router N개와 양방향 link로 연결된다:
-
-```text
-sip0.cube0.pe0.pe_dma ←→ sip0.cube0.ch_r0  (bw: channel_bw_gbs)
-sip0.cube0.pe0.pe_dma ←→ sip0.cube0.ch_r1  (bw: channel_bw_gbs)
-...
-sip0.cube0.pe0.pe_dma ←→ sip0.cube0.ch_r7  (bw: channel_bw_gbs)
-```
-
- edge kind: `pe_to_ch_router` / `ch_router_to_pe`
- BW: `hbm_channel_bw_gbs` (e.g., 32 GB/s)
- distance: PE에서 channel router까지의 물리적 거리 (layout 기반)
-
-#### channel router ↔ HBM controller 연결
-
-각 channel router는 cube의 hbm_ctrl과 양방향 link로 연결된다:
-
-```text
-sip0.cube0.ch_r0 ←→ sip0.cube0.hbm_ctrl  (bw: channel_bw_gbs)
-sip0.cube0.ch_r1 ←→ sip0.cube0.hbm_ctrl  (bw: channel_bw_gbs)
-...
-sip0.cube0.ch_r63 ←→ sip0.cube0.hbm_ctrl (bw: channel_bw_gbs)
-```
-
- edge kind: `ch_router_to_hbm` / `hbm_to_ch_router`
- BW: `hbm_channel_bw_gbs` (e.g., 32 GB/s)
-
-#### 1:1 mode 전체 데이터 경로
-
-```text
-PE0.pe_dma
-  ├→ ch_r0 → hbm_ctrl  (32 GB/s)
-  ├→ ch_r1 → hbm_ctrl  (32 GB/s)
-  ├→ ...
-  └→ ch_r7 → hbm_ctrl  (32 GB/s)
-                         총 PE0 local BW = N × channel_bw_gbs
-```
+1:1 mode에서는 각 라우터가 N개 channel mini-router로 분화된다.
+per-channel routing과 ChannelSplitter (LA → per-channel PA) 도입이 필요.
+PE당 N개 GEMM engine도 이 시점에 추가.

 ---

-### D4. 1:1 mode: horizontal line 연결 (cross-PE channel 접근)
+### D4. cross-PE HBM 접근 (n:1 mode)

-#### 배치 규칙
+n:1 mode에서 PE가 다른 PE의 local HBM에 접근하는 경우,
+cube_mesh.yaml의 XY mesh를 통해 대상 PE의 라우터까지 hop한다.

-같은 **logical index**를 가지는 channel router들을 동일한 horizontal row에 배치한다.
-
-logical index 정의: `logical_idx = global_channel_id % channels_per_pe`
+예: PE0(r0c0)이 PE2(r1c4)의 HBM에 접근:

 ```text
-파라미터 예: channels_per_pe=8, pes_per_cube=8
-
-Row 0: ch_r0  (PE0) ↔ ch_r8  (PE1) ↔ ch_r16 (PE2) ↔ ... ↔ ch_r56 (PE7)
-Row 1: ch_r1  (PE0) ↔ ch_r9  (PE1) ↔ ch_r17 (PE2) ↔ ... ↔ ch_r57 (PE7)
-Row 2: ch_r2  (PE0) ↔ ch_r10 (PE1) ↔ ch_r18 (PE2) ↔ ... ↔ ch_r58 (PE7)
-...
-Row 7: ch_r7  (PE0) ↔ ch_r15 (PE1) ↔ ch_r23 (PE2) ↔ ... ↔ ch_r63 (PE7)
+PE0.pe_dma → r0c0 → r0c1 → r0c2 → r0c3 → r0c4 → r1c4 → hbm_ctrl
 ```

-일반화: Row `r`에는 `{ch_r(p * N + r) | p ∈ 0..pes_per_cube-1}`이 위치.
-여기서 `N = channels_per_pe`.
+Dijkstra router가 mesh에서 최단 경로를 탐색한다.

-#### horizontal line edge
-
-같은 row에서 인접한 channel router끼리 양방향 edge로 연결:
-
-```text
-ch_r0 ↔ ch_r8 ↔ ch_r16 ↔ ... ↔ ch_r56
-```
-
- edge kind: `ch_horizontal`
- BW: `hbm_channel_bw_gbs` (or configurable inter-PE channel BW)
- distance: PE 간 물리적 거리
-
-#### cross-PE HBM 접근 경로 (1:1 mode)
-
-PE0이 PE1의 local channel (ch_r8)에 접근하는 경우:
-
-```text
-PE0.pe_dma → ch_r0 → ch_r8 (horizontal hop) → hbm_ctrl
-```
-
-Dijkstra router가 horizontal line을 통해 최단 경로를 탐색한다.
-
-#### 설계 의도
-
-이 배치 규칙은:
-
- routing 규칙 단순화: horizontal = cross-PE, vertical = PE-local
- 거리 계산 단순화: row 내 hop 수 = |src_pe - dst_pe|
- 구조적 반복성 확보: 모든 row가 동일한 구조
+1:1 mode에서의 cross-PE channel 접근은 D3의 1:1 확장 시 정의한다.

 ---

-### D5. n:1 mode: aggregated router 기반 연결
+### D5. n:1 mode: cube_mesh.yaml 라우터 mesh 사용

-#### aggregated router 정의
-
-n:1 mode에서 graph compiler는 PE당 1개의 **aggregated router** 노드를 생성한다.
-aggregated router는 NOC의 일부이다.
-
-노드 네이밍: `{cube}.pe{p}.agg_router`
+n:1 mode에서는 별도의 "aggregated router"를 생성하지 않는다.
+기존 cube_mesh.yaml의 라우터 grid가 그 역할을 한다.

 #### 연결 구조

-```text
-sip0.cube0.pe0.pe_dma ←→ sip0.cube0.pe0.agg_router  (bw: N × channel_bw_gbs)
-sip0.cube0.pe0.agg_router ←→ sip0.cube0.hbm_ctrl    (bw: N × channel_bw_gbs)
-```
-
- edge kind: `pe_to_agg_router` / `agg_router_to_pe`, `agg_to_hbm` / `hbm_to_agg`
- BW: `channels_per_pe × hbm_channel_bw_gbs` (e.g., 8 × 32 = 256 GB/s)
-
-#### cross-PE 접근 (n:1 mode)
-
-PE0이 PE1의 local HBM에 접근하는 경우:
+각 PE가 attach된 라우터에 PE_DMA, PE_CPU, HBM이 함께 연결된다:

 ```text
-PE0.pe_dma → PE0.agg_router → PE1.agg_router → hbm_ctrl
+sip0.cube0.pe0.pe_dma ←→ sip0.cube0.r0c0  (bw: N × channel_bw_gbs)
+sip0.cube0.hbm_ctrl   ←→ sip0.cube0.r0c0  (bw: N × channel_bw_gbs)
 ```

-aggregated router 간 연결:
-
-```text
-pe0.agg_router ↔ pe1.agg_router ↔ pe2.agg_router ↔ ... ↔ pe7.agg_router
-```
-
- edge kind: `agg_horizontal`
- BW: configurable (inter-PE aggregated BW)
+라우터 간 XY mesh edge로 연결. PE의 local HBM 접근은
+자기 라우터에서 바로 (switching overhead만).

 #### n:1 mode 전체 데이터 경로

+**local HBM (0 hop):**
 ```text
-PE0.pe_dma → PE0.agg_router → hbm_ctrl
-             (BW = N × channel_bw_gbs = 256 GB/s)
+PE0.pe_dma → r0c0 → hbm_ctrl  (switching overhead only)
+```
+
+**remote HBM (mesh hops):**
+```text
+PE0.pe_dma → r0c0 → r0c1 → ... → r1c4 → hbm_ctrl
+```
+
+**M_CPU DMA:**
+```text
+M_CPU → r2c0 → (mesh hops) → r{x}c{y} → hbm_ctrl
 ```

 ---

-### D6. local / remote access를 NOC로 통일한다
+### D6. 모든 트래픽을 동일 router mesh로 통일한다

- 모든 memory access는 NOC(channel router 또는 aggregated router)를 통해 전달된다
+- 모든 memory access (DMA data)와 command (PE_CPU)가 동일 router mesh를 사용한다
 - local access도 별도의 fast path(xbar)를 사용하지 않는다
 - cross-cube (remote) access 경로:

 ```text
-1:1 mode: PE_DMA → ch_r{local} → ch_r{...} → UCIe → remote_ch_r → remote_hbm_ctrl
-n:1 mode: PE_DMA → agg_router → UCIe → remote_agg_router → remote_hbm_ctrl
+PE_DMA → r{x}c{y} → (mesh hops) → ucie_conn → ucie-{PORT}
+  → [UCIe link] → remote ucie → remote conn → remote r{x}c{y} → hbm_ctrl
 ```

 UCIe 연결은 기존 구조를 유지하되,
-양쪽 endpoint가 xbar 대신 channel router 또는 aggregated router가 된다.
+양쪽 endpoint가 xbar 대신 mesh 라우터가 된다.
+
+UCIe line 수는 BW 비율로 결정: `ucie_lines_per_side = ceil(ucie_bw / noc_line_bw)`.

 ---

@@ -266,9 +193,7 @@ return f"sip{s}.cube{c}.hbm_ctrl"
 ```

 pe_slice 계산이 제거된다.
-BAAW가 이미 dst_node를 결정하므로, PE_DMA의 1:1 mode에서는
-resolver를 거치지 않고 BAAW가 직접 channel router node_id를 반환한다.
-n:1 mode에서도 BAAW가 aggregated router node_id를 반환한다.
+n:1 mode에서 PE_DMA는 자기 라우터에 attach된 hbm_ctrl에 직접 접근한다.

 resolver.resolve()는 외부 접근(M_CPU DMA 등) 및 backward compatibility용으로 유지한다.

@@ -305,16 +230,10 @@ links:

 ```yaml
 links:
-  pe_to_ch_router_bw_gbs: 32.0         # PE_DMA ↔ channel router
-  pe_to_ch_router_mm: 1.0              # 물리적 거리
-  ch_router_to_hbm_bw_gbs: 32.0        # channel router ↔ hbm_ctrl
-  ch_router_to_hbm_mm: 2.0             # 물리적 거리
-  ch_horizontal_bw_gbs: 32.0           # channel router 간 horizontal link
-  ch_horizontal_mm: 1.5                # PE 간 horizontal 거리
-  # n:1 mode용
-  pe_to_agg_router_bw_gbs: 256.0       # PE_DMA ↔ aggregated router
-  agg_to_hbm_bw_gbs: 256.0             # aggregated router ↔ hbm_ctrl
-  agg_horizontal_bw_gbs: 256.0         # aggregated router 간 link
+  router_link_bw_gbs: 256.0            # 라우터 간 XY mesh link BW
+  router_overhead_ns: 2.0              # 라우터 switching overhead
+  pe_to_router_bw_gbs: 256.0           # PE_DMA ↔ 라우터
+  hbm_to_router_bw_gbs: 256.0          # HBM ↔ 라우터 (= N × channel_bw)
 ```

 ---
@@ -341,19 +260,18 @@ links:

 ### Positive

- 1:1 mode에서 pseudo-channel 단위 BW contention 모델링이 자연스럽다
- n:1 mode에서 aggregated bandwidth 모델이 단순하다
- local / remote access 경로가 NOC로 통일된다
+- cube_mesh.yaml 기반 라우터 mesh로 물리적 배치를 정확히 반영한다
+- n:1 mode에서 기존 VA 체계를 유지하여 전환 비용이 낮다
+- local / remote / command 트래픽이 동일 mesh로 통일되어 단순하다
 - graph compiler 기반 topology 생성과 잘 맞는다
 - channel 수, PE 수가 모두 파라미터이므로 다양한 구성을 테스트할 수 있다
+- 1:1 mode 확장이 라우터 분화로 자연스럽게 가능하다

 ### Negative

- 1:1 mode에서 router 및 link 수가 크게 증가한다
-  (64 channel routers + 64 edges to HBM + 56 horizontal edges per cube)
- local access도 NOC 경로를 사용하므로 모델이 더 일반화된다
- 기존 xbar 기반 테스트 전면 재작성 필요
- SimPy 노드 수 증가에 따른 시뮬레이션 성능 영향 가능
+- 명시적 라우터 노드로 인해 SimPy 노드 수가 증가한다 (6×6 grid = 36 슬롯, null 라우터를 제외하면 최대 32개 라우터/cube)
+- 기존 xbar/bridge/단일 NOC 기반 테스트 전면 재작성 필요
+- TwoDMeshNocComponent의 내부 contention 모델을 라우터별 모델로 교체 필요

 ---

@@ -0,0 +1,553 @@
+# ADR-0020: 2-Pass Data Execution Model (Timing / Data Separation)
+
+## Status
+
+Proposed
+
+## Context
+
+The current simulation models **timing only**.
+`tl.load()`, `tl.composite(op="gemm")`, etc. generate SimPy latencies,
+but do not actually read tensor data or perform computations.
+
+### Required Capabilities
+
+1. Must be able to store and read actual data in HBM/TCM/SRAM
+2. PE_GEMM, PE_MATH must be able to perform actual matrix operations and verify results
+3. Must minimize simulation performance degradation
+
+### Limitations of the Existing Kernel Execution Structure
+
+The current kernel execution is separated into 2 stages:
+
+```
+Phase 0: Kernel function execution in TLContext → PeCommand list generation (outside SimPy, no data)
+Phase 1: PE_CPU replays PeCommand list via SimPy (timing only)
+```
+
+Phase 0 requires the kernel to **complete execution entirely** before SimPy begins.
+`tl.load()` returns a TensorHandle (placeholder), so actual data cannot be accessed.
+Therefore, branching based on data values (dynamic control flow) is impossible.
+
+This ADR resolves this limitation **for memory operations only** (see D1, D3).
+
+### Constraints
+
+- SimPy is a single-thread event loop — running numpy matmul inside it blocks everything
+- Components must be replaceable (ADR-0015) — framework requirements must not leak into implementations
+- Benchmark kernels are imperative code (tl.load → tl.composite → tl.wait) — the same code must be reused
+- Kernel functions must remain plain Python functions (no generator/async transformation)
+
+### Design Exploration Results
+
+| Option | Approach | Verdict |
+|--------|----------|---------|
+| Direct execution in SimPy | Call numpy GEMM inside SimPy | Rejected: single-thread block |
+| SimPy + ThreadPool | future.submit → timeout → result() | Rejected: blocks on result() for back-to-back requests |
+| Symbolic + lazy | Track metadata only, execute later | Rejected: difficult to handle control-flow dependent reads |
+| **2-pass (adopted)** | Phase 1: timing, Phase 2: data | Full separation, no performance impact |
+
+---
+
+## Decision
+
+### D1. 2-Pass Execution Model — Phase 0 Elimination
+
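+Rather than adding a data pass on top of the existing two stages (which would give Phase 0 → Phase 1 → Phase 2), Phase 0 is eliminated and execution is **consolidated into 2 stages** (Phase 1 → Phase 2).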
+
+Before:
+```
+Phase 0: Kernel → PeCommand list (no data, no branching)
+Phase 1: Replay PeCommand list via SimPy (timing only)
+```
+
+After:
+```
+Phase 1 (timing): Kernel + SimPy integrated execution — greenlet-based
+  - Memory read/write: SimPy timing + MemoryStore actual data
+  - Compute (GEMM/Math): SimPy timing + op_log recording (actual computation in Phase 2)
+  - Dynamic control flow possible (tl.load returns actual data)
+
+Phase 2 (data): Actual computation execution based on op_log — outside SimPy, parallelizable
+```
+
+This ADR **extends Phase 1 to be data-aware for memory operations only**.
+Phase 1 handles latency/BW bottleneck analysis + memory data tracking,
+Phase 2 handles GEMM/Math computation correctness verification.
+Phase 2 is optional — if only timing is needed, run Phase 1 alone.
+
+### D2. Op Log Recording — ComponentBase Hook
+
+Op log recording is performed as a **hook in the component base class**.
+Individual component implementations are not modified.
+
+```python
+class ComponentBase:
+    def _on_process_start(self, env, msg):
+        if self._op_logger and getattr(msg, 'data_op', False):
+            self._op_logger.record_start(env.now, self.node.id, msg)
+
+    def _on_process_end(self, env, msg):
+        if self._op_logger and getattr(msg, 'data_op', False):
+            self._op_logger.record_end(env.now, self.node.id, msg)
+```
+
+Hooks are called before and after `run()` within `_forward_txn()`.
+`_op_logger` is optional — zero overhead when absent.
+
+**Hook timing definitions**:
+
+| Timing | Meaning |
+|--------|---------|
+| `t_start` | The point at which the component **begins servicing** the msg (immediately before `run()` entry) |
+| `t_end` | The point at which the component's **internal service completes** (immediately after `run()` returns) |
+
+Link traversal latency is not included in t_start/t_end.
+Link latency is observed as the difference between the sending component's t_end and the receiving component's t_start.
+
+### D3. Greenlet-Based Kernel Execution — Phase 0 Elimination
+
+The existing Phase 0 (kernel → PeCommand list) is eliminated,
+and **greenlet** is used to cooperatively interleave kernel and SimPy execution.
+
+#### Operating Principle
+
+greenlet is a C extension that provides cooperative context switching.
+When the kernel (child greenlet) calls `tl.load()` etc., it switches to the SimPy loop (parent greenlet)
+to perform timing simulation, and after completion, returns to the kernel with actual data.
+
+```
+SimPy loop (parent greenlet)           Kernel (child greenlet)
+─────────────────────────              ──────────────────────
+g.switch() ─────────────────────────→ Kernel starts
+                                       a = tl.load(ptr, ...)
+                                         internal: parent.switch(DmaReadCmd)
+cmd = DmaReadCmd ←──────────────────  (kernel paused)
+  yield DmaReadMsg(...)
+  yield env.timeout(dma_latency)
+  data = memory_store.read(...)
+g.switch(data) ─────────────────────→ (kernel resumed)
+                                       a = data  ← actual numpy array
+                                       if a[0][0] > 0.5:  ← branching possible
+                                         ...
+```
+
+The kernel is maintained as a **plain Python function**.
+greenlet switches exist **only within the internal implementation** of `tl.load()`, `tl.store()`, etc.
+
+#### KernelRunner — Framework Layer
+
+The greenlet loop resides not in the PE_CPU component but in the framework layer,
+**KernelRunner**.
+
+```python
+# KernelRunner (framework — greenlet ↔ SimPy bridge)
+class KernelRunner:
+    def run(self, env, kernel_fn, args, store):
+        g = greenlet(self._run_kernel)
+        cmd = g.switch(kernel_fn, args)
+
+        while cmd is not None:
+            if isinstance(cmd, DmaReadCmd):
+                yield from self._dispatch_dma(env, cmd)
+                data = store.read(cmd.src_space, cmd.src_addr, cmd.shape, cmd.dtype)
+                cmd = g.switch(data)            # resume with actual data
+            elif isinstance(cmd, GemmCmd):
+                yield from self._dispatch_gemm(env, cmd)
+                cmd = g.switch()                # resume (no data)
+            elif isinstance(cmd, DmaWriteCmd):
+                store.write(cmd.dst_space, cmd.dst_addr, cmd.data)  # visibility = issue time
+                yield from self._dispatch_dma(env, cmd)  # timing only
+                cmd = g.switch()
+
+# PE_CPU (component — kept simple, unaware of greenlet)
+def _execute_kernel(self, env):
+    runner = KernelRunner(self.ctx)
+    yield from runner.run(env, kernel_fn, args, store)
+```
+
+**Op logging single source of truth**: KernelRunner does not record directly to op_log.
+All op logging is handled **solely by the ComponentBase hook (_on_process_start/end)**.
+When KernelRunner delivers messages to components via `_dispatch_gemm()` etc.,
+the component base class hooks automatically record them.
+
+**Layer separation**:
+- **Kernel code**: plain function, unaware of greenlet
+- **TLContext**: calls `parent.switch(cmd)` inside `tl.load()`
+- **KernelRunner**: greenlet ↔ SimPy bridge, handles MemoryStore read/write. **Does not log**.
+- **ComponentBase hook**: the sole path for op_log recording
+- **PE_CPU**: only calls KernelRunner, replaceable as a component
+
+#### Handling Differences Between Memory Read/Write and Compute
+
+| Operation | In Phase 1 | In Phase 2 |
+|-----------|-----------|-----------|
+| `tl.load()` | SimPy timing + MemoryStore read → **actual data returned** | — |
+| `tl.store()` | SimPy timing + MemoryStore write → **actual write** | — |
+| `tl.composite(gemm)` | SimPy timing + **op_log recording only** | numpy actual computation |
+| `tl.dot()` / math ops | SimPy timing + **op_log recording only** | numpy actual computation |
+
+Memory read/write is processed immediately in Phase 1 (numpy slice, fast).
+GEMM/Math operations are batch-executed in Phase 2 (performance separation).
+
+#### Store Visibility Rule
+
+`tl.store()` is **immediately reflected in MemoryStore at issue time** (visibility = issue).
+SimPy DMA timing is simulated separately afterward.
+
+This is an intentional separation of timing and visibility:
+- **visibility**: the point at which it is reflected in MemoryStore = when `store.write()` is called
+- **timing**: the point at which DMA latency completes in SimPy
+
+This separation allows a load immediately after a store to see the latest data in dynamic control flow.
+
+#### Result Handle Semantics
+
+`tl.composite()` (sync/async) returns a **handle** referencing the result tensor.
+
+The key contract in Phase 1:
+
+1. **All compute handles are always considered pending in Phase 1.**
+2. `tl.wait(handle)` **expresses timing synchronization only**
+   and does not make the handle ready.
+3. Accessing the handle's actual result data (`handle.data`, element access,
+   numpy conversion, etc.) is **only possible in Phase 2**.
+4. Therefore, **compute-result-based control flow is not supported in Phase 1.**
+5. In contrast, `tl.load()` returns actual data in Phase 1, so
+   **memory-read-based control flow is supported**.
+
+| Handle state | Phase | Allowed operations |
+|------------|-------|----------|
+| pending | Phase 1 | `tl.wait(handle)` — timing synchronization only |
+| pending | Phase 1 | Pass handle as target of `tl.store()` (logical destination binding only, payload in Phase 2) |
+| pending | Phase 1 | **Data access not allowed** — value-based branching not possible |
+| ready | Phase 2 | Actual numpy data access, verification |
+
+This restriction is intentional. If computations were executed in Phase 1,
+the SimPy single-thread would block, defeating the purpose of 2-pass separation.
+
+#### Phase 1 Materialization — Future Extension
+
+If Phase 1 eager execution becomes necessary for small operations
+(scalar, small reduction) in the future, selective materialization can be supported
+by adding a `materialized_in_phase1: bool` flag to the op record.
+This is not implemented in the current scope.
+
+### D4. data_op Flag — Message Self-Declaration
+
+The logging target is determined by the `data_op` attribute on the message instance,
+not by message type. The framework does not hardcode message types.
+
+```python
+class MsgBase:
+    data_op: bool = False       # default: no logging
+
+class DmaReadCmd(MsgBase):
+    data_op = True              # memory transfer → logging
+
+class GemmCmd(MsgBase):
+    data_op = True              # compute → logging
+
+class MathCmd(MsgBase):
+    data_op = True              # compute → logging
+```
+
+When adding a new message type (e.g., IpcqMsg), simply setting `data_op = True`
+enables automatic logging without modifying framework code.
+
+### D5. Op Log Structure
+
+#### Op Classification Scheme
+
+A two-level classification is used:
+
+| Level | Field | Role |
+|-------|-------|------|
+| `op_kind` | `memory` \| `gemm` \| `math` | executor dispatch criterion |
+| `op_name` | `dma_read` \| `dma_write` \| `gemm_f16` \| `exp` \| `add` \| `sum` etc. | specific operation identification |
+
+#### OpRecord Definition
+
+```python
+@dataclass
+class OpRecord:
+    t_start: float              # SimPy time (ns) — service start
+    t_end: float                # SimPy time (ns) — service completion
+    component_id: str           # e.g. "sip0.cube0.pe0.pe_gemm"
+    op_kind: str                # "memory" | "gemm" | "math"
+    op_name: str                # specific operation name
+    params: dict                # per-operation parameters (see below)
+    dependency_ids: list[int]   # currently based on in-memory record index, may be replaced with stable op_id in the future
+```
+
+#### dependency_ids Generation Rules
+
+`dependency_ids` is **optional**, and by default the executor performs
+address-based dependency inference (see D6).
+
+Explicit setting is only needed when precise execution ordering is required:
+- **Default (address-based inference)**: the executor analyzes read/write sets to
+  automatically infer RAW/WAW/WAR dependencies. This is sufficient for most cases.
+- **Explicit setting**: set when logical dependencies cannot be expressed via addresses
+  at the TLContext or command generation stage.
+  Example: completion handle-based synchronization — handle dependencies depend on
+  logical completion order rather than memory addresses, so they cannot be captured
+  by address inference.
+
+#### op_log Ordering
+
+The op_log maintains **stable ordering** based on `t_start`.
+Records with the same `t_start` preserve insertion order.
+
+#### params Details
+
+**memory (dma_read / dma_write)**:
+```python
+{
+    "src_addr": int,            # source address (byte)
+    "dst_addr": int,            # destination address (byte)
+    "nbytes": int,              # transfer size
+    "src_space": str,           # "hbm" | "tcm" | "sram"
+    "dst_space": str,           # "hbm" | "tcm" | "sram"
+}
+```
+
+**gemm**:
+```python
+{
+    "src_a_addr": int,          # operand A address
+    "src_b_addr": int,          # operand B address
+    "dst_addr": int,            # output address
+    "shape_a": tuple,           # e.g. (128, 256)
+    "shape_b": tuple,           # e.g. (256, 128)
+    "shape_out": tuple,         # e.g. (128, 128)
+    "dtype_in": str,            # e.g. "f16"
+    "dtype_acc": str,           # accumulation dtype, e.g. "f32"
+    "dtype_out": str,           # output dtype, e.g. "f16"
+    "transpose_a": bool,
+    "transpose_b": bool,
+    "layout_a": str,            # "row_major" | "col_major"
+    "layout_b": str,
+    "layout_out": str,
+    "addr_space": str,          # "tcm" (GEMM operands are always in TCM)
+}
+```
+
+**math**:
+```python
+{
+    "op": str,                  # "exp" | "add" | "sum" | "where" | ...
+    "input_addrs": list[int],   # list of operand addresses
+    "input_shapes": list[tuple],
+    "dst_addr": int,
+    "shape_out": tuple,
+    "dtype": str,
+    "axis": int | None,         # reduction axis
+    "addr_space": str,          # "tcm"
+}
+```
+
+### D6. Phase 2 Executor
+
+Phase 2 executes the op_log outside of SimPy.
+
+```python
+class DataExecutor:
+    def __init__(self, op_log: list[OpRecord], initial_store: MemoryStore):
+        self.op_log = op_log        # stably ordered by t_start (see D5)
+        self.store = initial_store  # Takes the Phase 1 MemoryStore snapshot as input
+
+    def run(self):
+        for t, ops in groupby(self.op_log, key=lambda o: o.t_start):
+            batch = list(ops)
+            independent, sequential = self._classify(batch)
+            self._execute_parallel(independent)
+            self._execute_sequential(sequential)
+```
+
+**Parallel execution determination**:
+
+Ops with the same `t_start` are considered **parallel candidates**.
+The executor determines actual parallel execution based on the following criteria:
+- Whether read/write address ranges overlap (WAW, RAW, WAR conflict checks)
+- Whether predecessor ops specified in `dependency_ids` have completed
+
+Only ops with no overlapping address ranges and no explicit dependencies are executed in parallel.
+
+**Batch optimization**: Only independent ops with the same op_name **and identical
+shape, dtype, layout, and transpose flags** are eligible for batching.
+Example: identical shape GEMMs from multiple PEs → bundled into a single `np.matmul(a_batch, b_batch)` call.
+Improves BLAS efficiency on CPU, reduces launch overhead on GPU.
+
+**Phase 2 execution order guarantee**:
+
+Phase 2 does not consider data arrival timing,
+and guarantees execution order solely through
+dependencies (address-based inference + explicit dependency_ids).
+
+### D7. Memory Store
+
+`MemoryStore` logically follows byte-addressable semantics,
+and the current implementation uses **tensor-granular storage** (addr → numpy ndarray mapping).
+
+```python
+class MemoryStore:
+    def write(self, space: str, addr: int, data: np.ndarray) -> None: ...
+    def read(self, space: str, addr: int, shape: tuple, dtype: str) -> np.ndarray: ...
+```
+
+**Internal storage format: numpy ndarray**
+
+MemoryStore stores tensors as **numpy ndarrays**.
+
+| Candidate | store/load speed | Phase 2 compute | Verdict |
+|-----------|-----------------|-----------------|---------|
+| **numpy ndarray** | Immediate (reference passing, no copy) | `np.matmul` directly usable | **Adopted** |
+| bytearray | Requires memcpy | Requires `np.frombuffer` conversion | Rejected |
+| torch tensor | Immediate | torch operations available | Use only for GPU optimization |
+
+- write: **stores numpy array by reference** (no copy) → Phase 1 overhead = 1 dict lookup
+- read: **returns numpy array by reference** (no copy)
+- Re-writing to the same addr **overwrites at tensor granularity** (partial overwrite not supported)
+- dtype uses numpy dtypes (`np.float16`, `np.float32`, etc.); bf16 has no native numpy dtype, so it requires an extension dtype (e.g. `ml_dtypes.bfloat16`)
+- For byte-level access, convert via `.view(np.uint8)`
+- For GPU batch optimization in Phase 2, numpy → torch tensor conversion is the executor's responsibility
+
+**read/write contract**:
+
+- read/write operates on a **contiguous tensor** basis.
+  If non-contiguous stride views are needed, express them as separate copy ops.
+- In the normal benchmark path, producer/consumer dtype match is expected.
+  Reinterpret cast is a permissive behavior for low-level memory validation
+  or special test cases.
+- addr is byte-aligned, with minimum alignment = dtype size.
+- dtype mismatch (reading with a different dtype than written) is handled as a reinterpret cast.
+  Shape mismatch is verified based on nbytes, and raises an error on mismatch.
+- Correctness criteria follow address-range-based read/write semantics.
+- A tensor object cache may be used as an implementation optimization,
+  but the canonical state is byte-addressable storage.
+- At deploy time, the host injects initial tensor data.
+
+### D8. Benchmark Kernel Code
+
+The benchmark's **user code API is not changed**.
+The call interfaces for `tl.load()`, `tl.composite()`, `tl.store()`, etc. are maintained.
+
+However, internal command/message schemas may be extended to include metadata
+required for Phase 2 execution (e.g., additional fields such as dtype_acc, transpose).
+
+### D9. No Component Changes
+
+Individual component implementations (PE_GEMM, PE_DMA, HBM_CTRL, etc.) are not modified.
+Op log recording is the responsibility of the ComponentBase hook.
+When custom components are replaced, only the timing model changes,
+and Phase 2 data execution is unaffected.
+
+### D10. Phase 2 is Optional
+
+```python
+engine = GraphEngine(graph)
+engine.run(benchmark)                       # Phase 1: timing only
+result = engine.get_timing_result()
+
+if verify_data:
+    executor = DataExecutor(engine.op_log, engine.memory_store)  # Phase 2: data (op_log + Phase 1 MemoryStore snapshot)
+    executor.run()
+    executor.verify(expected_output)
+```
+
+If only timing analysis is needed, Phase 2 is skipped.
+If the op_logger is deactivated, Phase 1 performance is identical to the original.
+
+### D11. Verification Contract
+
+Basic verification **compares the final output tensor** against a reference backend (numpy).
+
+Per-dtype tolerance policy:
+
+| dtype | Comparison method | Tolerance |
+|-------|----------|-----------|
+| f32 | `np.allclose` | rtol=1e-5, atol=1e-5 |
+| f16 | `np.allclose` | rtol=1e-3, atol=1e-3 |
+| bf16 | `np.allclose` | rtol=1e-2, atol=1e-2 |
+| int types | `np.array_equal` | exact |
+
+- Default mode: compare final output only (end-to-end correctness)
+- Debug mode: can compare intermediate tensors on a per-op basis
+  (MemoryStore snapshot at each op boundary)
+
+---
+
+## Non-goals
+
+- **Compute-result-based control flow**: not supported.
+  All compute handles are in pending state during Phase 1,
+  `wait()` expresses timing synchronization only and does not imply data readiness.
+  Accessing `handle.data`, element access, or truth-value evaluation in Phase 1
+  is **treated as an error**.
+  Memory-data-based branching (results of `tl.load()`) is supported via greenlet.
+  Phase 1 materialization is a future extension (see D3).
+- **Cycle-accurate overlap reconstruction**: Phase 2 does not precisely reproduce
+  the execution time overlap from Phase 1. Phase 2 only verifies data correctness.
+- **GPU kernel compilation**: GEMM/Math in Phase 2 are numpy/torch calls
+  and do not reproduce the actual hardware PE microarchitecture.
+
+## Open Questions
+
+- **Aliasing / slice view**: How to represent slice/views referencing the same
+  backing storage in MemoryStore (stride-based view vs copy semantics)
+- **IPCQ/descriptor read generalization**: Whether to fully generalize PE-to-PE
+  communication as memory ops or introduce a separate op_kind
+- **Op log streaming**: Managing op_log memory usage in large-scale simulations
+  (in-memory list vs disk-backed streaming)
+- **Fused operation**: Whether to record tl.composite's tiled pipeline
+  (READ→COMPUTE→WRITE) as a single fused op record or separate individual ops
+- **Math op schema generalization**: The current math params have a simple structure,
+  but generalization may be needed for broadcasting rules, per-input dtype, keepdims,
+  scalar/immediate operands, where/mask expressions, etc.
+- **Op record identifier**: Currently dependency_ids are based on in-memory list indices;
+  replacement with stable op_id is needed when introducing streaming/disk-backed mode
+- **Phase 1 materialization policy**: See Future Extension in D3.
+  If allowed, the Phase 2 handling approach (skip / verify / recompute) for those ops
+  needs to be defined
+
+---
+
+## Consequences
+
+### Positive
+
+- Minimal impact on SimPy simulation performance (only op_log append added)
+- Free to use multi-threading/GPU in Phase 2
+- Component replaceability preserved (ADR-0015 design philosophy maintained)
+- No changes needed to benchmark user code API
+- When adding new message types, only set the data_op flag
+- Phase 0 eliminated via greenlet — memory-data-based dynamic control flow supported
+- `tl.load()` returns actual data, making kernel debugging easier
+
+### Negative
+
+- op_log memory usage (for large-scale simulations)
+- Phase 2 execution time is proportional to tensor size (large GEMM)
+- Dynamic branching based on pending handles (incomplete computations) not possible
+  (computations execute in Phase 2, result values are undetermined in Phase 1).
+  Memory-data-based branching is supported via greenlet.
+- greenlet C extension dependency added (pip install greenlet)
+
+---
+
+## Affected Files
+
+| File | Change |
+|------|--------|
+| `src/kernbench/components/base.py` | Add `_on_process_start/end` hooks |
+| `src/kernbench/common/pe_commands.py` | Add `data_op = True`, extend metadata fields |
+| `src/kernbench/sim_engine/op_log.py` | New: OpRecord, OpLogger |
+| `src/kernbench/sim_engine/data_executor.py` | New: DataExecutor, MemoryStore |
+| `src/kernbench/sim_engine/engine.py` | op_logger injection (optional) |
+| `src/kernbench/triton_emu/tl_context.py` | greenlet switch calls inside `tl.load()` etc. |
+| `src/kernbench/triton_emu/kernel_runner.py` | New: KernelRunner (greenlet ↔ SimPy bridge) |
+| `src/kernbench/components/builtin/pe_cpu.py` | Remove Phase 0, change to KernelRunner invocation |
+| `pyproject.toml` | Add greenlet dependency |
+
+Component implementation files (pe_gemm.py, pe_dma.py, hbm_ctrl.py, etc.): **no changes**
+Benchmark kernels (benches/*.py): **no user API changes**
@@ -0,0 +1,550 @@
+# ADR-0020: 2-Pass 데이터 실행 모델 (타이밍 / 데이터 분리)
+
+## Status
+
+Proposed
+
+## Context
+
+현재 시뮬레이션은 **타이밍만** 모델링한다.
+`tl.load()`, `tl.composite(op="gemm")` 등은 SimPy latency를 생성하지만,
+실제 텐서 데이터를 읽거나 연산하지 않는다.
+
+### 필요한 기능
+
+1. HBM/TCM/SRAM에 실제 데이터를 저장하고 읽을 수 있어야 한다
+2. PE_GEMM, PE_MATH가 실제 행렬 연산을 수행하고 결과를 검증할 수 있어야 한다
+3. 시뮬레이션 성능 저하를 최소화해야 한다
+
+### 기존 커널 실행 구조의 한계
+
+현재 커널 실행은 2단계로 분리되어 있다:
+
+```
+Phase 0: TLContext에서 커널 함수 실행 → PeCommand 리스트 생성 (SimPy 밖, 데이터 없음)
+Phase 1: PE_CPU가 PeCommand 리스트를 SimPy로 replay (타이밍만)
+```
+
+Phase 0에서 커널이 **전부 실행 완료**된 후에야 SimPy가 시작된다.
+`tl.load()`는 TensorHandle(placeholder)을 반환하므로 실제 데이터에 접근할 수 없다.
+따라서 데이터 값에 따른 분기(dynamic control flow)가 불가능하다.
+
+본 ADR은 이 한계를 **메모리 연산에 한해** 해소한다 (D1, D3 참조).
+
+### 제약 조건
+
+- SimPy는 single-thread 이벤트 루프 — numpy matmul을 안에서 하면 전체가 block
+- 컴포넌트는 교체 가능해야 한다 (ADR-0015) — 프레임워크 요구사항이 구현에 침투하면 안 됨
+- 벤치마크 커널은 명령형 코드(tl.load → tl.composite → tl.wait) — 같은 코드를 재사용해야 함
+- 커널 함수는 plain Python function으로 유지해야 한다 (generator/async 변환 불가)
+
+### 설계 탐색 결과
+
+| Option | 방식 | 판정 |
+|--------|------|------|
+| SimPy 내 직접 실행 | GEMM을 SimPy 안에서 numpy 호출 | 탈락: single-thread block |
+| SimPy + ThreadPool | future.submit → timeout → result() | 탈락: back-to-back 요청 시 result()에서 block |
+| Symbolic + lazy | 메타데이터만 추적, 나중에 실행 | 탈락: control-flow dependent 읽기 처리 곤란 |
+| **2-pass (채택)** | Phase 1: 타이밍, Phase 2: 데이터 | 완전 분리, 성능 영향 없음 |
+
+---
+
+## Decision
+
+### D1. 2-Pass 실행 모델 — Phase 0 제거
+
+기존 구조에 데이터 실행(Phase 2)을 단순히 추가하면 3단계(Phase 0 → Phase 1 → Phase 2)가 되지만, Phase 0를 제거하여 **2단계로 통합**한다.
+
+기존:
+```
+Phase 0: 커널 → PeCommand 리스트 (데이터 없음, 분기 불가)
+Phase 1: PeCommand 리스트를 SimPy replay (타이밍만)
+```
+
+변경:
+```
+Phase 1 (타이밍): 커널 + SimPy 통합 실행 — greenlet 기반
+  - 메모리 읽기/쓰기: SimPy 타이밍 + MemoryStore 실제 데이터
+  - 연산 (GEMM/Math): SimPy 타이밍 + op_log 기록 (실제 연산은 Phase 2)
+  - dynamic control flow 가능 (tl.load가 실제 데이터 반환)
+
+Phase 2 (데이터): op_log 기반 실제 연산 실행 — SimPy 외부, 병렬 가능
+```
+
+본 ADR은 **메모리 연산에 한해 Phase 1을 data-aware로 확장**한다.
+Phase 1은 latency/BW 병목 분석 + 메모리 데이터 추적,
+Phase 2는 GEMM/Math 연산 정합성 검증.
+Phase 2는 optional — 타이밍만 필요하면 Phase 1만 실행.
+
+### D2. Op Log 기록 — ComponentBase hook
+
+op_log 기록은 **컴포넌트 베이스 클래스의 hook**으로 수행한다.
+개별 컴포넌트 구현을 수정하지 않는다.
+
+```python
+class ComponentBase:
+    def _on_process_start(self, env, msg):
+        if self._op_logger and getattr(msg, 'data_op', False):
+            self._op_logger.record_start(env.now, self.node.id, msg)
+
+    def _on_process_end(self, env, msg):
+        if self._op_logger and getattr(msg, 'data_op', False):
+            self._op_logger.record_end(env.now, self.node.id, msg)
+```
+
+`_forward_txn()` 에서 `run()` 전후로 hook을 호출한다.
+`_op_logger`는 optional — 없으면 오버헤드 제로.
+
+**hook 시점 정의**:
+
+| 시점 | 의미 |
+|------|------|
+| `t_start` | 컴포넌트가 해당 msg의 **service를 시작**한 시점 (`run()` 진입 직전) |
+| `t_end` | 컴포넌트의 **내부 service가 완료**된 시점 (`run()` 반환 직후) |
+
+link traversal latency는 t_start/t_end에 포함되지 않는다.
+link latency는 발신 컴포넌트의 t_end와 수신 컴포넌트의 t_start 차이로 관측된다.
+
+### D3. Greenlet 기반 커널 실행 — Phase 0 제거
+
+기존 Phase 0 (커널 → PeCommand 리스트)를 제거하고,
+**greenlet**을 사용하여 커널과 SimPy를 협력적으로 interleave 실행한다.
+
+#### 동작 원리
+
+greenlet은 협력적 context switch를 제공하는 C 확장이다.
+커널(child greenlet)이 `tl.load()` 등을 호출하면 SimPy 루프(parent greenlet)로
+switch하여 타이밍 시뮬레이션을 수행하고, 완료 후 실제 데이터와 함께 커널로 돌아온다.
+
+```
+SimPy 루프 (parent greenlet)          커널 (child greenlet)
+─────────────────────────              ──────────────────────
+g.switch() ─────────────────────────→ 커널 시작
+                                       a = tl.load(ptr, ...)
+                                         내부: parent.switch(DmaReadCmd)
+cmd = DmaReadCmd ←──────────────────  (커널 일시정지)
+  yield DmaReadMsg(...)
+  yield env.timeout(dma_latency)
+  data = memory_store.read(...)
+g.switch(data) ─────────────────────→ (커널 재개)
+                                       a = data  ← 실제 numpy array
+                                       if a[0][0] > 0.5:  ← 분기 가능
+                                         ...
+```
+
+커널은 **plain Python function**으로 유지된다.
+greenlet switch는 `tl.load()`, `tl.store()` 등의 **내부 구현에만** 존재한다.
+
+#### KernelRunner — 프레임워크 레이어
+
+greenlet 루프는 PE_CPU 컴포넌트가 아니라 프레임워크 레이어인
+**KernelRunner**에 위치한다.
+
+```python
+# KernelRunner (프레임워크 — greenlet ↔ SimPy 연결)
+class KernelRunner:
+    def run(self, env, kernel_fn, args, store):
+        g = greenlet(self._run_kernel)
+        cmd = g.switch(kernel_fn, args)
+
+        while cmd is not None:
+            if isinstance(cmd, DmaReadCmd):
+                yield from self._dispatch_dma(env, cmd)
+                data = store.read(cmd.src_addr, cmd.shape, cmd.dtype)
+                cmd = g.switch(data)            # 실제 데이터와 함께 재개
+            elif isinstance(cmd, GemmCmd):
+                yield from self._dispatch_gemm(env, cmd)
+                cmd = g.switch()                # 재개 (데이터 없음)
+            elif isinstance(cmd, DmaWriteCmd):
+                store.write(cmd.dst_addr, cmd.data)  # visibility = issue 시점
+                yield from self._dispatch_dma(env, cmd)  # timing만 반영
+                cmd = g.switch()
+
+# PE_CPU (컴포넌트 — 간단하게 유지, greenlet을 모름)
+def _execute_kernel(self, env):
+    runner = KernelRunner(self.ctx)
+    yield from runner.run(env, kernel_fn, args, store)
+```
+
+**Op logging single source of truth**: KernelRunner는 op_log에 직접 기록하지 않는다.
+모든 op logging은 **ComponentBase hook (_on_process_start/end)만** 담당한다.
+KernelRunner가 `_dispatch_gemm()` 등으로 컴포넌트에 메시지를 전달하면,
+컴포넌트 베이스 클래스의 hook이 자동으로 기록한다.
+
+**레이어 분리**:
+- **커널 코드**: plain function, greenlet 존재를 모름
+- **TLContext**: `tl.load()` 내부에서 `parent.switch(cmd)` 호출
+- **KernelRunner**: greenlet ↔ SimPy 연결, MemoryStore 읽기/쓰기 처리. **logging 안 함**.
+- **ComponentBase hook**: op_log 기록의 유일한 경로
+- **PE_CPU**: KernelRunner를 호출만 함, 컴포넌트로서 교체 가능
+
+#### 메모리 읽기/쓰기 vs 연산의 처리 차이
+
+| 연산 | Phase 1에서 | Phase 2에서 |
+|------|------------|------------|
+| `tl.load()` | SimPy 타이밍 + MemoryStore read → **실제 데이터 반환** | — |
+| `tl.store()` | SimPy 타이밍 + MemoryStore write → **실제 기록** | — |
+| `tl.composite(gemm)` | SimPy 타이밍 + **op_log 기록만** | numpy 실제 연산 |
+| `tl.dot()` / math ops | SimPy 타이밍 + **op_log 기록만** | numpy 실제 연산 |
+
+메모리 읽기/쓰기는 Phase 1에서 즉시 처리 (numpy slice, 빠름).
+GEMM/Math 연산은 Phase 2에서 batch 실행 (성능 분리).
+
+#### Store Visibility Rule
+
+`tl.store()`는 **issue 시점에 MemoryStore에 즉시 반영**된다 (visibility = issue).
+SimPy DMA 타이밍은 이후 별도로 시뮬레이션된다.
+
+이는 timing과 visibility를 의도적으로 분리한 것이다:
+- **visibility**: MemoryStore에 반영되는 시점 = `store.write()` 호출 시
+- **timing**: SimPy에서 DMA latency가 완료되는 시점
+
+이 분리로 dynamic control flow에서 store 직후 load가 최신 데이터를 볼 수 있다.
+
+#### Result Handle Semantics
+
+`tl.composite()`(sync/async)는 결과 tensor를 참조하는 **handle**을 반환한다.
+
+Phase 1에서의 핵심 계약:
+
+1. **모든 compute handle은 Phase 1에서 항상 pending 상태로 간주한다.**
+2. `tl.wait(handle)`은 **timing synchronization만 표현**하며,
+   handle을 ready로 만들지 않는다.
+3. handle의 실제 결과 데이터 접근(`handle.data`, element access,
+   numpy conversion 등)은 **Phase 2에서만 가능**하다.
+4. 따라서 Phase 1에서 **compute-result 기반 control flow는 지원하지 않는다.**
+5. 반면 `tl.load()`는 Phase 1에서 실제 데이터를 반환하므로,
+   **memory-read 기반 control flow는 지원 가능**하다.
+
+| handle 상태 | Phase | 허용 동작 |
+|------------|-------|----------|
+| pending | Phase 1 | `tl.wait(handle)` — timing 동기화만 |
+| pending | Phase 1 | handle을 `tl.store()`의 대상으로 전달 (logical destination 연결만, payload는 Phase 2) |
+| pending | Phase 1 | **데이터 접근 불가** — 값 기반 분기 불가 |
+| ready | Phase 2 | 실제 numpy 데이터 접근, 검증 |
+
+이 제약은 의도적이다. Phase 1에서 연산을 실행하면 SimPy single-thread가
+block되어 2-pass 분리의 존재 이유가 사라진다.
+
+#### Phase 1 Materialization — Future Extension
+
+향후 소형 연산(scalar, 작은 reduction)에 대해 Phase 1 eager execution이
+필요한 경우, `materialized_in_phase1: bool` 플래그를 op record에 추가하여
+선택적 materialization을 지원할 수 있다. 현재 범위에서는 구현하지 않는다.
+
+### D4. data_op 플래그 — 메시지 자기 선언
+
+로깅 대상은 메시지 타입이 아니라 메시지 인스턴스의 `data_op` 속성으로 결정한다.
+프레임워크가 메시지 타입을 하드코딩하지 않는다.
+
+```python
+class MsgBase:
+    data_op: bool = False       # 기본: 로깅 안 함
+
+class DmaReadCmd(MsgBase):
+    data_op = True              # 메모리 이동 → 로깅
+
+class GemmCmd(MsgBase):
+    data_op = True              # 연산 → 로깅
+
+class MathCmd(MsgBase):
+    data_op = True              # 연산 → 로깅
+```
+
+새 메시지 타입(예: IpcqMsg) 추가 시 `data_op = True`만 설정하면
+프레임워크 코드 수정 없이 자동 로깅된다.
+
+### D5. Op Log 구조
+
+#### op 분류 체계
+
+2단계로 분류한다:
+
+| 레벨 | 필드 | 역할 |
+|------|------|------|
+| `op_kind` | `memory` \| `gemm` \| `math` | executor dispatch 기준 |
+| `op_name` | `dma_read` \| `dma_write` \| `gemm_f16` \| `exp` \| `add` \| `sum` 등 | 구체 연산 식별 |
+
+#### OpRecord 정의
+
+```python
+@dataclass
+class OpRecord:
+    t_start: float              # SimPy 시각 (ns) — service 시작
+    t_end: float                # SimPy 시각 (ns) — service 완료
+    component_id: str           # e.g. "sip0.cube0.pe0.pe_gemm"
+    op_kind: str                # "memory" | "gemm" | "math"
+    op_name: str                # 구체 연산명
+    params: dict                # 연산별 파라미터 (아래 참조)
+    dependency_ids: list[int]   # 현재는 in-memory record index 기반, 향후 stable op_id로 대체 가능
+```
+
+#### dependency_ids 생성 규칙
+
+`dependency_ids`는 **optional**이며, 기본적으로 executor는
+주소 기반 dependency 추론을 수행한다 (D6 참조).
+
+정확한 실행 순서가 필요한 경우에만 명시적으로 설정한다:
+- **기본 (address-based inference)**: executor가 read/write set을 분석하여
+  RAW/WAW/WAR 의존성을 자동 추론. 대부분의 경우 이것으로 충분.
+- **명시적 설정**: TLContext 또는 command 생성 단계에서 logical dependency가
+  주소로 표현되지 않는 경우에 설정.
+  예: completion handle 기반 동기화 — handle dependency는 메모리 주소가 아니라
+  논리적 완료 순서에 의존하므로 address inference로 잡히지 않는다.
+
+#### op_log ordering
+
+op_log는 `t_start` 기준으로 **stable ordering**을 유지한다.
+동일 `t_start`의 record들은 insertion order를 보존한다.
+
+#### params 상세
+
+**memory (dma_read / dma_write)**:
+```python
+{
+    "src_addr": int,            # source 주소 (byte)
+    "dst_addr": int,            # destination 주소 (byte)
+    "nbytes": int,              # 전송 크기
+    "src_space": str,           # "hbm" | "tcm" | "sram"
+    "dst_space": str,           # "hbm" | "tcm" | "sram"
+}
+```
+
+**gemm**:
+```python
+{
+    "src_a_addr": int,          # operand A 주소
+    "src_b_addr": int,          # operand B 주소
+    "dst_addr": int,            # output 주소
+    "shape_a": tuple,           # e.g. (128, 256)
+    "shape_b": tuple,           # e.g. (256, 128)
+    "shape_out": tuple,         # e.g. (128, 128)
+    "dtype_in": str,            # e.g. "f16"
+    "dtype_acc": str,           # accumulation dtype, e.g. "f32"
+    "dtype_out": str,           # output dtype, e.g. "f16"
+    "transpose_a": bool,
+    "transpose_b": bool,
+    "layout_a": str,            # "row_major" | "col_major"
+    "layout_b": str,
+    "layout_out": str,
+    "addr_space": str,          # "tcm" (GEMM operand는 항상 TCM)
+}
+```
+
+**math**:
+```python
+{
+    "op": str,                  # "exp" | "add" | "sum" | "where" | ...
+    "input_addrs": list[int],   # operand 주소 목록
+    "input_shapes": list[tuple],
+    "dst_addr": int,
+    "shape_out": tuple,
+    "dtype": str,
+    "axis": int | None,         # reduction axis
+    "addr_space": str,          # "tcm"
+}
+```
+
+### D6. Phase 2 Executor
+
+Phase 2는 SimPy 밖에서 op_log를 실행한다.
+
+```python
+class DataExecutor:
+    def __init__(self, op_log: list[OpRecord], initial_store: MemoryStore):
+        self.store = initial_store  # Phase 1의 MemoryStore snapshot을 입력으로 받는다
+
+    def run(self):
+        for t, ops in groupby(op_log, key=lambda o: o.t_start):
+            batch = list(ops)
+            independent, sequential = self._classify(batch)
+            self._execute_parallel(independent)
+            self._execute_sequential(sequential)
+```
+
+**병렬 실행 판정**:
+
+같은 `t_start`의 op들은 **병렬 후보**로 간주한다.
+실제 병렬 실행 여부는 executor가 다음 기준으로 판정한다:
+- read/write 주소 범위 겹침 여부 (WAW, RAW, WAR 충돌 검사)
+- `dependency_ids`에 명시된 선행 op 완료 여부
+
+주소 범위가 겹치지 않고 명시적 의존성이 없는 op들만 병렬 실행한다.
+
+**배치 최적화**: 동일 op_name이며 **shape, dtype, layout, transpose flag가
+모두 동일한** 독립 op들만 batching 대상이 된다.
+예: 여러 PE의 동일 shape GEMM → `np.matmul(a_batch, b_batch)` 한 번으로 묶음.
+CPU에서도 BLAS 효율 향상, GPU에서는 launch overhead 절감.
+
+**Phase 2 실행 순서 보장**:
+
+Phase 2는 데이터 도착 시점을 고려하지 않으며,
+dependency (주소 기반 추론 + 명시적 dependency_ids)를 통해서만
+실행 순서를 보장한다.
+
+### D7. Memory Store
+
+`MemoryStore`는 논리적으로 byte-addressable semantics를 따르며,
+현재 구현은 **tensor-granular storage** (addr → numpy ndarray 매핑)를 사용한다.
+
+```python
+class MemoryStore:
+    def write(self, space: str, addr: int, data: np.ndarray) -> None: ...
+    def read(self, space: str, addr: int, shape: tuple, dtype: str) -> np.ndarray: ...
+```
+
+**내부 저장 포맷: numpy ndarray**
+
+MemoryStore는 텐서를 **numpy ndarray**로 저장한다.
+
+| 후보 | store/load 속도 | Phase 2 연산 | 판정 |
+|------|----------------|-------------|------|
+| **numpy ndarray** | 즉시 (참조 전달, 복사 없음) | `np.matmul` 바로 사용 | **채택** |
+| bytearray | memcpy 필요 | `np.frombuffer` 변환 필요 | 탈락 |
+| torch tensor | 즉시 | torch 연산 가능 | GPU 최적화 시만 사용 |
+
+- write: numpy array를 **참조 저장** (복사 없음) → Phase 1 오버헤드 = dict lookup 1회
+- read: numpy array를 **참조 반환** (복사 없음)
+- 동일 addr에 재 write 시 기존 array를 **tensor 단위로 덮어쓴다** (partial overwrite 미지원)
+- dtype은 numpy dtype 사용 (`np.float16`, `np.float32` 등; bf16은 numpy native dtype이 없으므로 `ml_dtypes.bfloat16` 같은 확장 dtype 필요)
+- byte-level access가 필요한 경우 `.view(np.uint8)` 로 변환
+- Phase 2에서 GPU batch 최적화 시 numpy → torch tensor 변환은 executor가 담당
+
+**read/write contract**:
+
+- read/write는 **contiguous tensor** 기준이다.
+  non-contiguous stride view가 필요한 경우 별도 copy op으로 표현한다.
+- 일반 benchmark path에서는 producer/consumer dtype 일치를 기대한다.
+  reinterpret cast는 low-level memory validation 또는 특수 테스트 케이스를 위한
+  permissive behavior이다.
+- addr은 byte-aligned이며, 최소 alignment = dtype 크기.
+- dtype mismatch (write와 다른 dtype으로 read)는 reinterpret cast로 처리한다.
+  shape 불일치 시 nbytes 기준으로 검증하고, 불일치하면 error.
+- 정합성 기준은 주소 범위 기반 read/write semantics를 따른다.
+- 구현 최적화로 tensor object cache를 둘 수 있지만,
+  canonical state는 byte-addressable storage이다.
+- deploy 시점에 호스트가 초기 텐서 데이터를 주입한다.
+
+### D8. 벤치마크 커널 코드
+
+벤치마크의 **사용자 코드 API는 변경하지 않는다**.
+`tl.load()`, `tl.composite()`, `tl.store()` 등의 호출 인터페이스는 유지.
+
+단, 내부 command/message schema는 Phase 2 실행에 필요한 metadata를
+포함하도록 확장될 수 있다 (예: dtype_acc, transpose 등 추가 필드).
+
+### D9. 컴포넌트 변경 없음
+
+개별 컴포넌트 구현(PE_GEMM, PE_DMA, HBM_CTRL 등)은 수정하지 않는다.
+op_log 기록은 ComponentBase hook의 책임이다.
+커스텀 컴포넌트 교체 시 타이밍 모델만 교체되며,
+Phase 2 데이터 실행은 영향받지 않는다.
+
+### D10. Phase 2는 Optional
+
+```python
+engine = GraphEngine(graph)
+engine.run(benchmark)                       # Phase 1: 타이밍만
+result = engine.get_timing_result()
+
+if verify_data:
+    executor = DataExecutor(engine.op_log)  # Phase 2: 데이터
+    executor.run()
+    executor.verify(expected_output)
+```
+
+타이밍 분석만 필요하면 Phase 2를 건너뛴다.
+op_logger를 비활성화하면 Phase 1 성능도 기존과 동일.
+
+### D11. Verification Contract
+
+기본 검증은 **최종 output tensor**를 reference backend(numpy)와 비교한다.
+
+dtype별 tolerance 정책:
+
+| dtype | 비교 방식 | tolerance |
+|-------|----------|-----------|
+| f32 | `np.allclose` | rtol=1e-5, atol=1e-5 |
+| f16 | `np.allclose` | rtol=1e-3, atol=1e-3 |
+| bf16 | `np.allclose` | rtol=1e-2, atol=1e-2 |
+| int 계열 | `np.array_equal` | exact |
+
+- 기본 모드: 최종 output만 비교 (end-to-end correctness)
+- 디버그 모드: intermediate tensor도 op 단위로 비교 가능
+  (MemoryStore snapshot at each op boundary)
+
+---
+
+## Non-goals
+
+- **Compute-result-based control flow**: 지원하지 않는다.
+  모든 compute handle은 Phase 1에서 pending 상태이며,
+  `wait()`는 timing synchronization만 표현하고 data readiness를 의미하지 않는다.
+  Phase 1에서 `handle.data` 접근, element access, truth-value evaluation은
+  **error로 처리**한다.
+  메모리 데이터 기반 분기(`tl.load()` 결과)는 greenlet으로 지원된다.
+  Phase 1 materialization은 future extension (D3 참조).
+- **Cycle-accurate overlap reconstruction**: Phase 2에서 Phase 1의 실행 시간
+  overlap을 정확히 재현하지 않는다. Phase 2는 데이터 정합성만 검증한다.
+- **GPU kernel compilation**: Phase 2의 GEMM/Math는 numpy/torch 호출이며,
+  실제 하드웨어 PE의 마이크로아키텍처를 재현하지 않는다.
+
+## Open Questions
+
+- **Aliasing / slice view**: 동일 backing storage를 참조하는 slice/view를
+  MemoryStore에서 어떻게 표현할지 (stride-based view vs copy semantics)
+- **IPCQ/descriptor read 일반화**: PE-to-PE 통신을 memory op으로 완전히
+  일반화할지, 별도 op_kind를 둘지
+- **Op log streaming**: 대규모 시뮬레이션에서 op_log 메모리 사용량 관리
+  (in-memory list vs disk-backed streaming)
+- **Fused operation**: tl.composite의 tiled pipeline (READ→COMPUTE→WRITE)을
+  하나의 fused op record로 기록할지, 개별 op으로 분리할지
+- **Math op schema 일반화**: 현재 math params는 단순 구조이나,
+  broadcasting rule, input별 dtype, keepdims, scalar/immediate operand,
+  where/mask 표현 등 일반화가 필요할 수 있음
+- **Op record 식별자**: 현재 dependency_ids는 in-memory list index 기반이며,
+  streaming/disk-backed mode 도입 시 stable op_id로 대체 필요
+- **Phase 1 materialization policy**: D3의 Future Extension 참조.
+  허용 시 해당 op의 Phase 2 처리 방식 (skip / verify / recompute) 정의 필요
+
+---
+
+## Consequences
+
+### 긍정적
+
+- SimPy 시뮬레이션 성능 영향 최소 (op_log append만 추가)
+- Phase 2에서 멀티스레드/GPU 자유롭게 사용 가능
+- 컴포넌트 교체 자유도 유지 (ADR-0015 설계 철학 보존)
+- 벤치마크 사용자 코드 API 변경 불필요
+- 새 메시지 타입 추가 시 data_op 플래그만 설정
+- greenlet으로 Phase 0 제거 — 메모리 데이터 기반 dynamic control flow 지원
+- `tl.load()`가 실제 데이터를 반환하므로 커널 디버깅 용이
+
+### 부정적
+
+- op_log 메모리 사용량 (대규모 시뮬레이션 시)
+- Phase 2 실행 시간은 텐서 크기에 비례 (대형 GEMM)
+- pending handle (연산 미완료) 기반 동적 분기 불가
+  (연산은 Phase 2에서 실행, Phase 1에서 결과 값 미확정).
+  메모리 데이터 기반 분기는 greenlet으로 지원된다.
+- greenlet C 확장 의존성 추가 (pip install greenlet)
+
+---
+
+## 영향받는 파일
+
+| 파일 | 변경 |
+|------|------|
+| `src/kernbench/components/base.py` | `_on_process_start/end` hook 추가 |
+| `src/kernbench/common/pe_commands.py` | `data_op = True` 추가, metadata 필드 확장 |
+| `src/kernbench/sim_engine/op_log.py` | 신규: OpRecord, OpLogger |
+| `src/kernbench/sim_engine/data_executor.py` | 신규: DataExecutor, MemoryStore |
+| `src/kernbench/sim_engine/engine.py` | op_logger 주입 (optional) |
+| `src/kernbench/triton_emu/tl_context.py` | `tl.load()` 등 내부에서 greenlet switch 호출 |
+| `src/kernbench/triton_emu/kernel_runner.py` | 신규: KernelRunner (greenlet ↔ SimPy 연결) |
+| `src/kernbench/components/builtin/pe_cpu.py` | Phase 0 제거, KernelRunner 호출로 변경 |
+| `pyproject.toml` | greenlet 의존성 추가 |
+
+컴포넌트 구현 파일 (pe_gemm.py, pe_dma.py, hbm_ctrl.py 등): **변경 없음**
+벤치마크 커널 (benches/*.py): **사용자 API 변경 없음**
@@ -0,0 +1,537 @@
+# ADR-0021: PE Pipeline Refactoring — Component Separation + Scheduler-Based Routing
+
+## Status
+
+Proposed
+
+## Context
+
+### Problems with the Current Structure
+
+pe_accel (SchedulerV2Component) hides 5 hardware blocks (DmaIn, DmaWb, Gemm, Math, Tcm)
+**inside a single component**.
+
+```
+SchedulerV2Component (single topology node)
+├── DmaInBlock     ← directly connected via internal SimPy Store
+├── DmaWbBlock     ← not visible in topology
+├── GemmBlock      ← not replaceable
+├── MathBlock      ← not replaceable
+└── TcmBlock       ← not replaceable
+```
+
+Problems:
+- Blocks directly reference the next block via `desc.next_block` — hardcoded routing
+- Individual blocks cannot be replaced (violates ADR-0015 component replacement principle)
+- PE internal structure is not visible in the topology
+- GemmBlock and MathBlock each duplicate TCM load/store logic
+
+### Actual Hardware Structure
+
+```
+HBM ←(DMA)→ TCM ←(Fetch/Store Unit)→ Register File ←→ GEMM/MATH Engine
+```
+
+- DMA: HBM ↔ TCM transfer (via fabric, tens to hundreds of ns)
+- Fetch/Store Unit: TCM ↔ Register File transfer (BW-based, a few ns)
+- GEMM/MATH Engine: computation between Register Files (cycle-accurate)
+- Completion signal: PE-internal 1-cycle wire signal (done pin assert)
+
+---
+
+## Decision
+
+### D1. Separate Each Block into an Independent Component
+
+The internal blocks of pe_accel are separated into **independent PeEngineBase components**.
+Scheduler + DMA (DmaIn and DmaWb merged) + Gemm + Math + Tcm + new Fetch/Store Unit = 6 components.
+
+| Component | Role | HW Correspondence |
+|-----------|------|-------------------|
+| PE_SCHEDULER | Plan generation, tile state management, stage routing | Scheduler/Sequencer |
+| PE_DMA | HBM ↔ TCM (via fabric) | DMA Engine |
+| PE_FETCH_STORE | TCM ↔ Register File | Load/Store Unit |
+| PE_GEMM | MAC compute (register only) | MAC Array |
+| PE_MATH | Element-wise/reduction (register only) | SIMD/Vector Unit |
+| PE_TCM | BW-serialized scratchpad | SRAM Bank |
+
+Each component exists as a topology node and is connected via ports/wires.
+Replacing the `impl` allows changing the timing model of an individual block.
+
+### D2. Token Self-Routing — Scheduler Handles Only Dispatch + Completion
+
+**Tile tokens do not return to the scheduler between stages.**
+The token carries a plan so that components chain directly to the next stage.
+
+```
+Scheduler → DMA → Fetch → GEMM → Math → Store → DMA_WB → (done) → Scheduler
+              ↑ chaining: does not go through scheduler          completion only
+```
+
+This matches the actual HW structure where each block's done signal is directly
+connected to the next block via wire. The scheduler is responsible **only for
+initial dispatch + completion aggregation**.
+
+#### Stage Definition
+
+```python
+class StageType(Enum):
+    DMA_READ = 0
+    FETCH = 1
+    GEMM = 2
+    MATH = 3
+    STORE = 4
+    DMA_WRITE = 5
+```
+
+#### Plan Structure
+
+When the scheduler receives a CompositeCmd, it generates a **per-tile execution plan**.
+The plan defines the **stage sequence** for each tile:
+
+```python
+@dataclass
+class Stage:
+    stage_type: StageType
+    component: str       # topology node ID (e.g. "sip0.cube0.pe0.pe_dma")
+    params: dict         # per-stage parameters (dynamic)
+
+@dataclass(frozen=True)
+class TilePlan:
+    tile_id: int
+    stages: tuple[Stage, ...]  # list of stages to execute in order (immutable)
+```
+
+The stage sequence varies depending on the plan:
+
+```python
+# Normal GEMM: HBM → TCM → Register → Compute → Register → TCM → HBM
+stages = (DMA_READ, FETCH, GEMM, STORE, DMA_WRITE)
+
+# GEMM directly from TCM data (skip DMA read):
+stages = (FETCH, GEMM, STORE, DMA_WRITE)
+
+# MATH element-wise:
+stages = (DMA_READ, FETCH, MATH, STORE, DMA_WRITE)
+
+# GEMM + accumulation (intermediate K-tile, skip writeback):
+stages = (DMA_READ, FETCH, GEMM, STORE)  # store to TCM only
+```
+
+**Components do not hardcode the next component.**
+They read the next stage from the token's plan and forward it directly via out_port.
+This is the same pattern as a network packet carrying a routing header.
+
+#### Pipeline Context
+
+```python
+@dataclass
+class PipelineContext:
+    id: str
+    total_tiles: int
+    completed_tiles: int = 0
+    done_event: simpy.Event = None  # succeeds when all tiles are complete
+
+    def complete_tile(self) -> None:
+        self.completed_tiles += 1
+        if self.completed_tiles == self.total_tiles:
+            self.done_event.succeed()
+```
+
+**Completion follows an exactly-once contract**: the last stage of each tile must call
+`complete_tile()` exactly once. Duplicate calls are a bug, and `done_event` must
+succeed only once (SimPy Event constraint).
+
+#### Scheduler Role (Reduced)
+
+When the scheduler receives a CompositeCmd, it creates a plan and PipelineContext,
+enqueues them into the scheduler's internal `_pending_feeds` FIFO, and returns immediately.
+
+Actual tile injection is handled by a **single feeder process** (`_feed_loop`).
+This feeder consumes `_pending_feeds` in FIFO order and
+**does not allow tile feed interleaving across composite commands.**
+That is, the feed for the next command begins only after all tiles of the current
+command have been injected into the first stage queue.
+
+There is **exactly one `_feed_loop`** per scheduler, and
+tile feed for composite commands is performed exclusively through this single process.
+Command issue order refers to **the order in which PE_SCHEDULER receives PeInternalTxn**.
+
+This structure maintains command issue order while ensuring that when the first stage
+queue is full, only the feeder process blocks — the scheduler worker's inbox processing
+itself does not stall.
+
+```python
+class PeSchedulerV2(PeEngineBase):
+    _pipelines: dict[str, PipelineContext]
+    _pending_feeds: simpy.Store   # FIFO of (plan, ctx)
+
+    def start(self, env):
+        super().start(env)
+        self._pending_feeds = simpy.Store(env)
+        env.process(self._feed_loop(env))
+
+    def _dispatch_composite(self, env, pe_txn, cmd):
+        plan = generate_plan(cmd)
+        ctx = PipelineContext(
+            id=next_id(),
+            total_tiles=len(plan.tiles),
+            done_event=pe_txn.done,
+        )
+        self._pipelines[ctx.id] = ctx
+
+        # only enqueue to feeder queue and return immediately
+        yield self._pending_feeds.put((plan, ctx))
+
+    def _feed_loop(self, env):
+        """Single feeder process: feeds composite commands in FIFO order.
+
+        Tile feed interleaving across composite commands is not allowed.
+        The feed for the next command begins only after all tiles of the
+        current command have been injected into the first stage queue.
+
+        When the first stage queue is full, only this feeder blocks;
+        the scheduler worker's inbox processing does not stall.
+        """
+        while True:
+            plan, ctx = yield self._pending_feeds.get()
+            for tile in plan.tiles:
+                token = TileToken(
+                    tile_id=tile.tile_id,
+                    pipeline_ctx=ctx,
+                    plan=tile,
+                    stage_idx=0,
+                    params=tile.stages[0].params,
+                )
+                yield self.out_ports[tile.stages[0].component].put(token)
+                # queue capacity = HW queue depth → feeder blocks only when full
+```
+
+In this ADR, the scheduler can accept multiple composite commands,
+but tile submission order follows per-command FIFO.
+Within a command, tile-level pipeline overlap is allowed,
+but tile feed interleaving across commands is not.
+
+### D3. Data Transfer vs. Completion Signal — HW Modeling Criteria
+
+| Communication Type | Method | HW Correspondence |
+|-------------------|--------|-------------------|
+| Tile token (work directive) | message via out_port | enqueue to command queue |
+| Stage completion → next stage | component directly calls out_port.put | done-triggered local enqueue |
+| Pipeline completion → scheduler | PipelineContext.complete_tile() | completion interrupt |
+
+**Tile token**: uses out_port.put(). SimPy Store capacity = HW queue depth.
+
+**Intra-PE chaining latency**: within the scope of this ADR, no explicit latency model
+is applied to intra-PE stage triggers. Chaining between components corresponds to
+PE-internal wires, and since there is no scheduler round-trip, no artificial hop cost
+is incurred.
+
+**Pipeline completion**: the component at the last stage calls `pipeline_ctx.complete_tile()`.
+When all tiles are complete, PipelineContext calls done_event.succeed().
+
+### D4. Asynchronous Pipeline — Natural Overlap
+
+The scheduler processes CompositeCmds **asynchronously**.
+However, tile feed does not spawn an independent process per command; instead,
+the scheduler's internal **single feeder process** performs the feed in FIFO order.
+Therefore, the scheduler can continue to receive the next command,
+but the first-stage tile injection order is guaranteed per command.
+
+Since **SimPy Store capacity = HW queue depth**:
+- When the queue is full, put() naturally blocks (backpressure)
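+- While DMA is processing a later tile, FETCH/GEMM can already work on a tile whose DMA has completed
+- When a second CompositeCmd arrives, it is immediately enqueued to `_pending_feeds`; its tiles enter the first-stage queue only after all tiles of the first command have been fed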
+
+```
+First-stage feed order (feeder → DMA queue):
+  [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN] | [cmd2:t0][cmd2:t1]...
+                                            ↑ cmd2 starts after cmd1 feed completes
+
+Runtime pipeline (downstream overlap):
+  PE_DMA:    [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN][cmd2:t0][cmd2:t1]...
+  PE_FETCH:          [cmd1:t0][cmd1:t1]...
+  PE_GEMM:                   [cmd1:t0][cmd1:t1]...
+                              ↑ pipeline overlap within the same command
+```
+
+Here, the overlap does not come from tile feed interleaving across different commands,
+but occurs naturally as tiles from earlier commands progress to downstream stages
+while the feeder continues injecting subsequent tiles.
+
+For example, tile feed for cmd2 does not start until all tiles of cmd1 have been
+injected into the first stage queue. However, while cmd1.tile0 has already progressed
+to GEMM, cmd1.tile1 and cmd1.tile2 may still remain in DMA/FETCH, so
+**pipeline overlap within the same command occurs naturally**.
+
+#### Component Chaining Pattern
+
+All components follow the same pattern:
+
+```python
+def _pipeline_worker(self, env):
+    while True:
+        token = yield self._inbox.get()
+
+        # process own stage
+        yield from self._process(env, token)
+
+        # chain to next stage (read from plan)
+        next_idx = token.stage_idx + 1
+        if next_idx < len(token.plan.stages):
+            next_stage = token.plan.stages[next_idx]
+            token.stage_idx = next_idx
+            token.params = next_stage.params
+            yield self.out_ports[next_stage.component].put(token)
+        else:
+            # last stage — pipeline completion
+            token.pipeline_ctx.complete_tile()
+```
+
+### D5. PE_FETCH_STORE — Dedicated TCM ↔ Register File Transfer
+
+Previously, GemmBlock and MathBlock each implemented their own TCM read/write.
+This is separated into a **PE_FETCH_STORE component**.
+
+```python
+# PE_FETCH_STORE._process()
+def _process(self, env, token):
+    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
+    yield tcm_done
+    # chaining is handled by the base class (D4 pattern)
+```
+
+Advantages:
+- GEMM/MATH perform **pure compute only** — no TCM access logic
+- Fetch/store BW contention is naturally modeled (serialization via PE_TCM resource)
+- Prefetch strategies can be experimented with by replacing the fetch unit alone
+
+### D6. Simplification of Each Compute Component
+
+GEMM/MATH perform compute only with register data already prepared.
+**Chaining follows the common pattern (D4), so only _process() needs to be implemented:**
+
+```python
+# PE_GEMM._process()
+def _process(self, env, token):
+    yield env.timeout(self._mac_latency(token.params))
+
+# PE_MATH._process()
+def _process(self, env, token):
+    yield env.timeout(self._simd_latency(token.params))
+
+# PE_FETCH_STORE._process()
+def _process(self, env, token):
+    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
+    yield tcm_done
+
+# PE_DMA._process()
+def _process(self, env, token):
+    yield from self._do_fabric_dma(token.params)
+```
+
+By replacing only the timing model, one can freely switch between cycle-accurate
+and analytical models. Since the chaining logic resides in the base class,
+each component only implements its pure stage logic.
+
+### D7. Topology Changes
+
+Add PE_FETCH_STORE to the PE template:
+
+```yaml
+pe_template:
+  components:
+    pe_cpu:         { kind: pe_cpu,         impl: pe_cpu_v1, ... }
+    pe_scheduler:   { kind: pe_scheduler,   impl: pe_scheduler_v2, ... }
+    pe_dma:         { kind: pe_dma,         impl: pe_dma_v1, ... }
+    pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, ... }
+    pe_gemm:        { kind: pe_gemm,        impl: pe_gemm_v1, ... }
+    pe_math:        { kind: pe_math,        impl: pe_math_v1, ... }
+    pe_mmu:         { kind: pe_mmu,         impl: pe_mmu_v1, ... }
+    pe_tcm:         { kind: pe_tcm,         impl: pe_tcm_v1, ... }
+  links:
+    # existing links...
+    fetch_store_to_tcm_bw_gbs: 512.0
+    fetch_store_to_tcm_mm: 0.0
+```
+
+PE internal edge connections:
+```
+PE_SCHEDULER → PE_DMA (initial dispatch)
+PE_SCHEDULER → PE_FETCH_STORE (initial dispatch)
+PE_SCHEDULER → PE_GEMM (initial dispatch)
+PE_SCHEDULER → PE_MATH (initial dispatch)
+PE_DMA → PE_FETCH_STORE (chaining)
+PE_FETCH_STORE → PE_GEMM (chaining)
+PE_FETCH_STORE → PE_MATH (chaining)
+PE_GEMM → PE_FETCH_STORE (store chaining)
+PE_MATH → PE_FETCH_STORE (store chaining)
+PE_FETCH_STORE → PE_DMA (writeback chaining)
+PE_FETCH_STORE → PE_TCM (BW request)
+```
+
+Topology edges cover both **control/dispatch paths and runtime chaining paths**.
+Scheduler → sub-component edges are initial dispatch paths, while
+inter-component edges are runtime chaining paths driven by token self-routing.
+
+### D8. Existing Code Migration — Builtin Integration
+
+The existing builtin v1 components and pe_accel are **replaced with new builtin components**.
+
+#### Migration Strategy
+
+1. Back up existing `components/builtin/` → `components/builtin_legacy/` (preserved without modification)
+2. Back up existing `components/custom/pe_accel/` → likewise
+3. Re-implement new `components/builtin/` with the ADR-0021 architecture
+4. Maintain **only one** topology.yaml (including pe_fetch_store)
+5. components.yaml points to the new builtin
+
+```yaml
+# components.yaml — new builtin
+pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
+pe_gemm_v1:      kernbench.components.builtin.pe_gemm:PeGemmComponent
+pe_math_v1:      kernbench.components.builtin.pe_math:PeMathComponent
+pe_dma_v1:       kernbench.components.builtin.pe_dma:PeDmaComponent
+pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
+pe_tcm_v1:       kernbench.components.builtin.pe_tcm:PeTcmComponent
+```
+
+The impl names (pe_gemm_v1, etc.) are preserved, but **the implementations are replaced
+with the ADR-0021 architecture**. Existing benchmarks and tests referencing topology.yaml
+continue to work without changes.
+
+#### Latency Model Inheritance
+
+The latency modeling of the new builtin components (MAC cycle calculation, SIMD latency,
+TCM BW serialization, DMA fabric latency, etc.) is **based on the current pe_accel
+implementation**. The tile schedule generation logic from tiling.py is also carried over.
+Only the architecture (component separation, self-routing) changes; timing accuracy
+is preserved.
+
+#### Test Strategy and Plan
+
+**1. Existing tests pass** (regression):
+After the migration is complete, all 366 existing tests must pass.
+
+**2. Latency regression**:
+Verify that the new builtin produces identical latency for the same inputs as pe_accel.
+
+**3. Phase 1 → Phase 2 end-to-end**:
+An integration test covering the full path: the SimPy simulation (Phase 1) generates an op_log →
+DataExecutor (Phase 2) replays it with actual numpy computation → the results are verified for correctness.
+- GEMM: tl.composite(gemm) → op_log → Phase 2 matmul → allclose verification
+- MATH: tl.exp / tl.add, etc. → op_log → Phase 2 numpy op → allclose verification
+- Chaining: GEMM output → MATH input → final result end-to-end verification
+
+**4. TileToken self-routing**:
+- Verify that tiles chain according to the plan's stage sequence
+- Verify PipelineContext.complete_tile() exactly-once at the last stage
+- Queue backpressure: verify that only the feeder blocks when DMA queue capacity is exceeded
+
+**5. Asynchronous pipeline overlap**:
+- Verify that inter-tile stage overlap occurs within the same command (tile0 in GEMM while tile1 in DMA)
+- Multiple commands: verify that cmd2 feed starts after cmd1 feed completes (FIFO order)
+
+### D9. TileToken Message Definition
+
+A message used for passing tile work between components.
+The token carries the plan and stage index, enabling self-routing.
+
+```python
+@dataclass
+class TileToken:
+    tile_id: int
+    pipeline_ctx: PipelineContext    # completion tracking
+    plan: TilePlan                   # full stage sequence for this tile (immutable)
+    stage_idx: int                   # current stage index in plan.stages
+    params: dict                     # current stage parameter cache (canonical: plan.stages[stage_idx].params)
+    data_op: bool = True             # op_log recording target (ADR-0020)
+```
+
+A TileToken is **owned by exactly one component at a time** and
+is never referenced by multiple components simultaneously (single-owner).
+
+Token lifecycle:
+1. Scheduler creates it with stage_idx=0 and puts it to the first stage component
+2. The component executes _process(), increments stage_idx, and puts it to the next component
+3. The last stage component calls pipeline_ctx.complete_tile()
+4. When all tiles are complete, PipelineContext calls done_event.succeed()
+
+Relationship with existing PeInternalTxn:
+- PeInternalTxn: command transfer between PE_CPU → PE_SCHEDULER (existing, unchanged)
+- TileToken: per-tile work transfer from PE_SCHEDULER → sub-components (new, self-routing)
+
+---
+
+## Non-goals
+
+- **PE_CPU changes**: the PE_CPU → PE_SCHEDULER interface is not modified
+  (PeInternalTxn-based, ADR-0014 maintained)
+- **Resource contention model across multiple pipelines**: the current scope focuses on
+  accurate modeling of a single pipeline. TCM bank conflicts across multiple pipelines
+  are future work.
+- **builtin_legacy maintenance**: kept for backup purposes only; not a target for
+  bug fixes or feature additions.
+
+## Open Questions
+
+- **Register File capacity model**: whether to model capacity limits when the fetch unit
+  loads into registers. Capacity is expressed in bytes (register_file_bytes), and
+  the number of tiles that can be held simultaneously is determined by tile size.
+  When capacity is exceeded, fetch stalls, creating natural backpressure.
+- **Prefetch strategy**: this ADR does not allow tile feed interleaving across composite
+  commands. Therefore, overlap arises not from pre-injection across commands, but
+  naturally from pipeline progression of tiles within the same command.
+  If additional prefetch is needed, it should be considered at the level of tile ordering
+  within the same command or fetch/store unit policy, not cross-command injection.
+- **PE_DMA coalescing**: per-tile DMA may cause fragmentation.
+  The intended direction is to merge/coalesce transfers inside PE_DMA, without scheduler involvement.
+- **Synchronous execution mode**: this ADR adopts asynchronous pipeline as the
+  default/sole execution model. If a sync mode is needed for debug or validation
+  purposes, it will be considered in a future ADR.
+- **TCM bank conflict across multiple pipelines**: currently based on a single pipeline.
+  Bank conflict modeling when multiple pipelines simultaneously access TCM is future work.
+
+---
+
+## Consequences
+
+### Positive
+
+- Each block is an independent component — individually replaceable (ADR-0015 compliant)
+- PE internal structure is visible in the topology
+- Components do not know the next component — plan-based routing provides flexibility
+- Natural pipeline overlap between DMA and compute (SimPy Store backpressure)
+- Improved HW modeling accuracy (done signal = Event, data transfer = message)
+- Fetch/store separation enables accurate TCM BW contention modeling
+
+### Negative
+
+- Increased number of PE internal components (5 → 6) — more topology nodes/edges
+- Component separation makes intra-PE token forwarding more explicit than before
+- Breaking change from existing builtin/pe_accel — migration required
+
+---
+
+## Affected Files
+
+| File | Change |
+|------|--------|
+| `topology.yaml` | Add pe_fetch_store component, add chaining edges |
+| `components.yaml` | Register new builtin components |
+| `src/kernbench/topology/builder.py` | Add fetch_store + chaining edges to PE internal edges |
+| `src/kernbench/common/pe_commands.py` | Add TileToken definition |
+| `src/kernbench/components/builtin/pe_scheduler.py` | Re-implement (feeder + plan-based dispatch) |
+| `src/kernbench/components/builtin/pe_gemm.py` | Re-implement (TileToken, _process pattern) |
+| `src/kernbench/components/builtin/pe_math.py` | Re-implement (TileToken, _process pattern) |
+| `src/kernbench/components/builtin/pe_dma.py` | Re-implement (TileToken, _process pattern) |
+| `src/kernbench/components/builtin/pe_fetch_store.py` | New |
+| `src/kernbench/components/builtin/pe_tcm.py` | Re-implement (TcmRequest service) |
+| `src/kernbench/components/builtin/types.py` | New: TilePlan, Stage, StageType, PipelineContext, TileToken |
+| `src/kernbench/components/builtin/tiling.py` | Ported from pe_accel: plan generation logic |
+
+Backup:
+
+| Path | Note |
+|------|------|
+| `src/kernbench/components/builtin_legacy/` | Full backup of existing builtin (preserved without modification) |
+| `src/kernbench/components/custom/pe_accel/` | Backup of existing pe_accel (preserved without modification) |
@@ -0,0 +1,528 @@
+# ADR-0021: PE 파이프라인 리팩토링 — 컴포넌트 분리 + Scheduler 기반 라우팅
+
+## Status
+
+Proposed
+
+## Context
+
+### 현재 구조의 문제
+
+pe_accel (SchedulerV2Component)은 5개 하드웨어 블록(DmaIn, DmaWb, Gemm, Math, Tcm)을
+**단일 컴포넌트 내부**에 숨기고 있다.
+
+```
+SchedulerV2Component (단일 topology 노드)
+├── DmaInBlock     ← 내부 SimPy Store로 직접 연결
+├── DmaWbBlock     ← topology에 안 보임
+├── GemmBlock      ← 교체 불가
+├── MathBlock      ← 교체 불가
+└── TcmBlock       ← 교체 불가
+```
+
+문제점:
+- 블록이 다음 블록을 `desc.next_block`으로 직접 참조 — 하드코딩된 라우팅
+- 개별 블록 교체 불가 (ADR-0015 컴포넌트 교체 원칙 위배)
+- topology에서 PE 내부 구조가 보이지 않음
+- GemmBlock과 MathBlock이 TCM load/store 로직을 각각 중복 구현
+
+### 실제 하드웨어 구조
+
+```
+HBM ←(DMA)→ TCM ←(Fetch/Store Unit)→ Register File ←→ GEMM/MATH Engine
+```
+
+- DMA: HBM ↔ TCM 전송 (fabric 경유, 수십~수백 ns)
+- Fetch/Store Unit: TCM ↔ Register File 전송 (BW 기반, 수 ns)
+- GEMM/MATH Engine: Register File 간 연산 (cycle-accurate)
+- 완료 신호: PE 내부 1-cycle wire signal (done pin assert)
+
+---
+
+## Decision
+
+### D1. 각 블록을 독립 컴포넌트로 분리
+
+pe_accel의 내부 블록을 **독립 PeEngineBase 컴포넌트**로 분리한다.
+기존 5개 + Fetch/Store Unit 1개 = 6개 컴포넌트.
+
+| 컴포넌트 | 역할 | HW 대응 |
+|----------|------|---------|
+| PE_SCHEDULER | plan 생성, tile 상태 관리, stage 라우팅 | Scheduler/Sequencer |
+| PE_DMA | HBM ↔ TCM (fabric 경유) | DMA Engine |
+| PE_FETCH_STORE | TCM ↔ Register File | Load/Store Unit |
+| PE_GEMM | MAC compute (register only) | MAC Array |
+| PE_MATH | element-wise/reduction (register only) | SIMD/Vector Unit |
+| PE_TCM | BW-serialized scratchpad | SRAM Bank |
+
+각 컴포넌트는 topology 노드로 존재하며, port/wire로 연결된다.
+`impl`을 교체하면 개별 블록의 타이밍 모델을 변경할 수 있다.
+
+### D2. Token Self-Routing — Scheduler는 dispatch + completion만
+
+**컴포넌트가 매 stage마다 scheduler를 경유하지 않는다.**
+Token이 plan을 가지고 있어 컴포넌트가 직접 다음 stage로 체이닝한다.
+
+```
+Scheduler → DMA → Fetch → GEMM → Math → Store → DMA_WB → (done) → Scheduler
+              ↑ 체이닝: scheduler 안 거침                    completion만
+```
+
+이는 실제 HW에서 각 블록의 done signal이 다음 블록에 직접 wire로 연결되어
+있는 구조와 일치한다. Scheduler는 **초기 dispatch + completion aggregation만** 담당.
+
+#### Stage 정의
+
+```python
+class StageType(Enum):
+    DMA_READ = 0
+    FETCH = 1
+    GEMM = 2
+    MATH = 3
+    STORE = 4
+    DMA_WRITE = 5
+```
+
+#### Plan 구조
+
+Scheduler가 CompositeCmd를 받으면 **tile 단위 실행 plan**을 생성한다.
+Plan은 각 tile의 **stage sequence**를 정의한다:
+
+```python
+@dataclass
+class Stage:
+    stage_type: StageType
+    component: str       # topology 노드 ID (e.g. "sip0.cube0.pe0.pe_dma")
+    params: dict         # stage별 파라미터 (dynamic)
+
+@dataclass(frozen=True)
+class TilePlan:
+    tile_id: int
+    stages: tuple[Stage, ...]  # 순서대로 실행할 stage 목록 (immutable)
+```
+
+Plan에 따라 stage sequence가 달라진다:
+
+```python
+# 일반 GEMM: HBM → TCM → Register → Compute → Register → TCM → HBM
+stages = (DMA_READ, FETCH, GEMM, STORE, DMA_WRITE)
+
+# TCM 데이터로 바로 GEMM (DMA read 생략):
+stages = (FETCH, GEMM, STORE, DMA_WRITE)
+
+# MATH element-wise:
+stages = (DMA_READ, FETCH, MATH, STORE, DMA_WRITE)
+
+# GEMM + accumulation (중간 K-tile, writeback 생략):
+stages = (DMA_READ, FETCH, GEMM, STORE)  # store to TCM only
+```
+
+**컴포넌트는 다음 컴포넌트를 하드코딩하지 않는다.**
+Token의 plan에서 다음 stage를 읽고, out_port로 직접 전달한다.
+네트워크 패킷이 라우팅 헤더를 가지고 있는 것과 같은 패턴이다.
+
+#### Pipeline Context
+
+```python
+@dataclass
+class PipelineContext:
+    id: str
+    total_tiles: int
+    completed_tiles: int = 0
+    done_event: simpy.Event = None  # 모든 tile 완료 시 succeed
+
+    def complete_tile(self) -> None:
+        self.completed_tiles += 1
+        if self.completed_tiles == self.total_tiles:
+            self.done_event.succeed()
+```
+
+**Completion은 exactly-once contract**: 각 tile의 마지막 stage는 정확히 한 번만
+`complete_tile()`을 호출해야 한다. 중복 호출은 버그이며, `done_event`는
+단 한 번만 succeed되어야 한다 (SimPy Event 제약).
+
+#### Scheduler 역할 (축소됨)
+
+Scheduler는 CompositeCmd를 받으면 plan과 PipelineContext를 생성한 뒤,
+이를 scheduler 내부의 `_pending_feeds` FIFO에 enqueue하고 즉시 리턴한다.
+
+실제 tile 투입은 **단일 feeder process** (`_feed_loop`)가 담당한다.
+이 feeder는 `_pending_feeds`를 FIFO 순서로 소비하며,
+**composite command 간 tile feed interleaving은 허용하지 않는다.**
+즉, 한 command의 모든 tile이 첫 stage queue에 투입된 후에만
+다음 command의 feed가 시작된다.
+
+Scheduler당 `_feed_loop`는 **정확히 하나만** 존재하며,
+composite command의 tile feed는 이 단일 process를 통해서만 수행된다.
+Command issue order는 **PE_SCHEDULER가 PeInternalTxn을 수신한 순서**를 의미한다.
+
+이 구조는 command issue order를 유지하면서도, 첫 stage queue full 시
+feeder process만 block되고 scheduler worker의 inbox 처리 자체는 멈추지 않도록 한다.
+
+```python
+class PeSchedulerV2(PeEngineBase):
+    _pipelines: dict[str, PipelineContext]
+    _pending_feeds: simpy.Store   # FIFO of (plan, ctx)
+
+    def start(self, env):
+        super().start(env)
+        self._pending_feeds = simpy.Store(env)
+        env.process(self._feed_loop(env))
+
+    def _dispatch_composite(self, env, pe_txn, cmd):
+        plan = generate_plan(cmd)
+        ctx = PipelineContext(
+            id=next_id(),
+            total_tiles=len(plan.tiles),
+            done_event=pe_txn.done,
+        )
+        self._pipelines[ctx.id] = ctx
+
+        # feeder queue에 등록만 하고 즉시 리턴
+        yield self._pending_feeds.put((plan, ctx))
+
+    def _feed_loop(self, env):
+        """단일 feeder process: composite command를 FIFO 순서로 feed.
+
+        Composite command 간 tile feed interleaving은 허용하지 않는다.
+        한 command의 모든 tile이 첫 stage queue에 투입된 후에만
+        다음 command의 feed가 시작된다.
+
+        첫 stage queue full 시 이 feeder만 block되며,
+        scheduler worker의 inbox 처리는 멈추지 않는다.
+        """
+        while True:
+            plan, ctx = yield self._pending_feeds.get()
+            for tile in plan.tiles:
+                token = TileToken(
+                    tile_id=tile.tile_id,
+                    pipeline_ctx=ctx,
+                    plan=tile,
+                    stage_idx=0,
+                    params=tile.stages[0].params,
+                )
+                yield self.out_ports[tile.stages[0].component].put(token)
+                # queue capacity = HW queue depth → full이면 feeder만 block
+```
+
+본 ADR에서 scheduler는 여러 composite command를 수용할 수 있으나,
+tile submission order는 command 단위 FIFO를 따른다.
+Command 내부에서는 tile-level pipeline overlap을 허용하지만,
+command 간 tile feed interleaving은 허용하지 않는다.
+
+### D3. 데이터 전달 vs 완료 신호 — HW 모델링 기준
+
+| 통신 유형 | 방식 | HW 대응 |
+|----------|------|---------|
+| tile token (작업 지시) | message via out_port | command queue에 enqueue |
+| stage 완료 → 다음 stage | 컴포넌트가 직접 out_port.put | done-triggered local enqueue |
+| pipeline 완료 → scheduler | PipelineContext.complete_tile() | completion interrupt |
+
+**Tile token**: out_port.put() 사용. SimPy Store capacity = HW queue depth.
+
+**Intra-PE chaining latency**: 본 ADR 범위에서는 intra-PE stage trigger에
+explicit latency model을 두지 않는다. 컴포넌트 간 체이닝은 PE 내부 wire에 해당하며,
+scheduler 왕복이 없으므로 artificial hop cost가 발생하지 않는다.
+
+**Pipeline 완료**: 마지막 stage의 컴포넌트가 `pipeline_ctx.complete_tile()` 호출.
+모든 tile 완료 시 PipelineContext가 done_event.succeed().
+
+### D4. 비동기 파이프라인 — 자연스러운 overlap
+
+Scheduler는 CompositeCmd를 **비동기로** 처리한다.
+다만 tile feed는 command마다 독립 process를 만들지 않고,
+scheduler 내부의 **단일 feeder process**가 FIFO 순서로 수행한다.
+따라서 scheduler는 다음 command를 계속 받을 수 있지만,
+첫-stage tile 투입 순서는 command 단위로 보장된다.
+
+**SimPy Store capacity = HW queue depth**이므로:
+- queue가 차면 put()이 자연스럽게 block (backpressure)
+- DMA가 다음 tile을 처리하는 동안 PE_FETCH_STORE는 DMA가 이미 완료한 tile의 fetch를 시작
+- 두 번째 CompositeCmd가 들어오면 DMA queue에 바로 이어서 투입
+
+```
+First-stage feed order (feeder → DMA queue):
+  [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN] | [cmd2:t0][cmd2:t1]...
+                                            ↑ cmd1 feed 완료 후 cmd2 시작
+
+Runtime pipeline (downstream overlap):
+  PE_DMA:    [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN][cmd2:t0][cmd2:t1]...
+  PE_FETCH:          [cmd1:t0][cmd1:t1]...
+  PE_GEMM:                   [cmd1:t0][cmd1:t1]...
+                              ↑ 같은 cmd 내부에서 pipeline overlap
+```
+
+이때 overlap은 서로 다른 command의 tile feed interleaving에서 오는 것이 아니라,
+먼저 투입된 command의 tile들이 downstream stage로 진행되는 동안 feeder가
+다음 tile들을 계속 투입하면서 자연스럽게 발생한다.
+
+예를 들어 cmd1의 모든 tile이 첫 stage queue에 투입되기 전에는
+cmd2의 tile feed는 시작되지 않는다. 그러나 cmd1.tile0이 이미 GEMM으로
+진행한 상태에서 cmd1.tile1, cmd1.tile2가 DMA/FETCH에 남아 있을 수 있으므로,
+**같은 command 내부에서는 pipeline overlap이 자연스럽게 발생**한다.
+
+#### 컴포넌트 체이닝 패턴
+
+모든 컴포넌트가 동일한 패턴을 따른다:
+
+```python
+def _pipeline_worker(self, env):
+    while True:
+        token = yield self._inbox.get()
+
+        # 자기 stage 처리
+        yield from self._process(env, token)
+
+        # 다음 stage로 체이닝 (plan에서 읽음)
+        next_idx = token.stage_idx + 1
+        if next_idx < len(token.plan.stages):
+            next_stage = token.plan.stages[next_idx]
+            token.stage_idx = next_idx
+            token.params = next_stage.params
+            yield self.out_ports[next_stage.component].put(token)
+        else:
+            # 마지막 stage — pipeline completion
+            token.pipeline_ctx.complete_tile()
+```
+
+### D5. PE_FETCH_STORE — TCM ↔ Register File 전담
+
+기존에 GemmBlock과 MathBlock이 각각 TCM read/write를 구현했으나,
+이를 **PE_FETCH_STORE 컴포넌트**로 분리한다.
+
+```python
+# PE_FETCH_STORE._process()
+def _process(self, env, token):
+    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
+    yield tcm_done
+    # 체이닝은 base class가 처리 (D4 패턴)
+```
+
+장점:
+- GEMM/MATH는 **순수 compute만** — TCM 접근 로직 없음
+- fetch/store BW 경합이 자연스럽게 모델링됨 (PE_TCM의 resource로 serialization)
+- prefetch 전략 등 fetch unit 단독 교체로 실험 가능
+
+### D6. 각 Compute 컴포넌트의 단순화
+
+GEMM/MATH는 register 데이터가 이미 준비된 상태에서 compute만 수행.
+**체이닝은 공통 패턴(D4)을 따르므로, _process()만 구현하면 된다:**
+
+```python
+# PE_GEMM._process()
+def _process(self, env, token):
+    yield env.timeout(self._mac_latency(token.params))
+
+# PE_MATH._process()
+def _process(self, env, token):
+    yield env.timeout(self._simd_latency(token.params))
+
+# PE_FETCH_STORE._process()
+def _process(self, env, token):
+    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
+    yield tcm_done
+
+# PE_DMA._process()
+def _process(self, env, token):
+    yield from self._do_fabric_dma(token.params)
+```
+
+타이밍 모델만 교체하면 cycle-accurate든 analytical든 자유롭게 변경 가능.
+체이닝 로직은 base class에 있으므로 각 컴포넌트는 순수 stage 로직만 구현.
+
+### D7. Topology 변경
+
+PE template에 PE_FETCH_STORE 추가:
+
+```yaml
+pe_template:
+  components:
+    pe_cpu:         { kind: pe_cpu,         impl: pe_cpu_v1, ... }
+    pe_scheduler:   { kind: pe_scheduler,   impl: pe_scheduler_v2, ... }
+    pe_dma:         { kind: pe_dma,         impl: pe_dma_v1, ... }
+    pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, ... }
+    pe_gemm:        { kind: pe_gemm,        impl: pe_gemm_v1, ... }
+    pe_math:        { kind: pe_math,        impl: pe_math_v1, ... }
+    pe_mmu:         { kind: pe_mmu,         impl: pe_mmu_v1, ... }
+    pe_tcm:         { kind: pe_tcm,         impl: pe_tcm_v1, ... }
+  links:
+    # 기존 links...
+    fetch_store_to_tcm_bw_gbs: 512.0
+    fetch_store_to_tcm_mm: 0.0
+```
+
+PE 내부 edge 연결:
+```
+PE_SCHEDULER → PE_DMA (초기 dispatch)
+PE_SCHEDULER → PE_FETCH_STORE (초기 dispatch)
+PE_SCHEDULER → PE_GEMM (초기 dispatch)
+PE_SCHEDULER → PE_MATH (초기 dispatch)
+PE_DMA → PE_FETCH_STORE (체이닝)
+PE_FETCH_STORE → PE_GEMM (체이닝)
+PE_FETCH_STORE → PE_MATH (체이닝)
+PE_GEMM → PE_FETCH_STORE (store 체이닝)
+PE_MATH → PE_FETCH_STORE (store 체이닝)
+PE_FETCH_STORE → PE_DMA (writeback 체이닝)
+PE_FETCH_STORE → PE_TCM (BW 요청)
+```
+
+Topology edge는 **control/dispatch visibility + runtime chaining** 양쪽을 포함한다.
+Scheduler → 하위 컴포넌트 edge는 초기 dispatch 경로이며,
+컴포넌트 간 edge는 token self-routing에 의한 runtime chaining 경로이다.
+
+### D8. 기존 코드 마이그레이션 — builtin 통합
+
+기존 builtin v1 컴포넌트와 pe_accel을 **새 builtin으로 교체**한다.
+
+#### 마이그레이션 전략
+
+1. 기존 `components/builtin/` → `components/builtin_legacy/`로 백업 (수정 없이 보관)
+2. 기존 `components/custom/pe_accel/` → 동일하게 백업
+3. 새 `components/builtin/`에 ADR-0021 아키텍처로 재구현
+4. topology.yaml은 **하나만 유지** (pe_fetch_store 포함)
+5. components.yaml은 새 builtin을 가리킴
+
+```yaml
+# components.yaml — 새 builtin
+pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
+pe_gemm_v1:      kernbench.components.builtin.pe_gemm:PeGemmComponent
+pe_math_v1:      kernbench.components.builtin.pe_math:PeMathComponent
+pe_dma_v1:       kernbench.components.builtin.pe_dma:PeDmaComponent
+pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
+pe_tcm_v1:       kernbench.components.builtin.pe_tcm:PeTcmComponent
+```
+
+impl 이름(pe_gemm_v1 등)은 유지하되, **구현이 ADR-0021 아키텍처로 교체**된다.
+기존 벤치마크와 테스트의 topology.yaml 참조는 변경 없이 동작한다.
+
+#### 레이턴시 모델 계승
+
+새 builtin 컴포넌트의 레이턴시 모델링(MAC cycle 계산, SIMD latency,
+TCM BW serialization, DMA fabric latency 등)은 **pe_accel 현재 버전의 구현을 바탕으로** 한다.
+tiling.py의 tile schedule 생성 로직도 그대로 가져온다.
+아키텍처(컴포넌트 분리, self-routing)만 변경하고, 타이밍 정확도는 유지한다.
+
+#### 테스트 전략 및 계획
+
+**1. 기존 테스트 통과** (regression):
+마이그레이션 완료 후 기존 테스트(366개)가 전부 통과해야 한다.
+
+**2. 레이턴시 regression**:
+pe_accel과 동일한 입력에 대해 새 builtin이 동일 레이턴시를 산출하는지 검증.
+
+**3. Phase 1 → Phase 2 end-to-end**:
+SimPy 시뮬레이션(Phase 1)에서 op_log 생성 → DataExecutor(Phase 2)로
+실제 numpy 연산 → 결과 정합성 검증까지 통합 테스트.
+- GEMM: tl.composite(gemm) → op_log → Phase 2 matmul → allclose 검증
+- MATH: tl.exp / tl.add 등 → op_log → Phase 2 numpy op → allclose 검증
+- 체이닝: GEMM 출력 → MATH 입력 → 최종 결과 end-to-end 검증
+
+**4. TileToken self-routing**:
+- tile이 plan의 stage sequence를 따라 체이닝되는지 검증
+- 마지막 stage에서 PipelineContext.complete_tile() exactly-once 검증
+- queue backpressure: DMA queue capacity 초과 시 feeder만 block 검증
+
+**5. 비동기 pipeline overlap**:
+- 동일 command 내 tile 간 stage overlap 발생 검증 (tile0 GEMM 중 tile1 DMA)
+- 다중 command: cmd1 feed 완료 후 cmd2 feed 시작 (FIFO 순서) 검증
+
+### D9. TileToken 메시지 정의
+
+컴포넌트 간 tile 작업 전달에 사용하는 메시지.
+Token이 plan과 stage index를 가지고 있어 self-routing이 가능하다.
+
+```python
+@dataclass
+class TileToken:
+    tile_id: int
+    pipeline_ctx: PipelineContext    # completion 추적
+    plan: TilePlan                   # 이 tile의 전체 stage sequence (immutable)
+    stage_idx: int                   # 현재 stage index in plan.stages
+    params: dict                     # current stage 파라미터 캐시 (canonical: plan.stages[stage_idx].params)
+    data_op: bool = True             # op_log 기록 대상 (ADR-0020)
+```
+
+TileToken은 한 시점에 **하나의 컴포넌트에 의해서만 소유**되며,
+동시에 여러 컴포넌트에 의해 참조되지 않는다 (single-owner).
+
+Token lifecycle:
+1. Scheduler가 stage_idx=0으로 생성, 첫 stage 컴포넌트에 put
+2. 컴포넌트가 _process() 실행 후 stage_idx 증가, 다음 컴포넌트에 put
+3. 마지막 stage 컴포넌트가 pipeline_ctx.complete_tile() 호출
+4. 모든 tile 완료 시 PipelineContext가 done_event.succeed()
+
+기존 PeInternalTxn과의 관계:
+- PeInternalTxn: PE_CPU → PE_SCHEDULER 간 command 전달 (기존 유지)
+- TileToken: PE_SCHEDULER → 하위 컴포넌트 간 tile 단위 작업 전달 (신규, self-routing)
+
+---
+
+## Non-goals
+
+- **PE_CPU 변경**: PE_CPU → PE_SCHEDULER 인터페이스는 변경하지 않음
+  (PeInternalTxn 기반, ADR-0014 유지)
+- **다중 pipeline 간 자원 경합 모델**: 현재 범위에서는 단일 pipeline의
+  정확한 모델링에 집중. 다중 pipeline 간 TCM bank conflict 등은 future work.
+- **builtin_legacy 유지보수**: 백업 목적이며, 버그 수정이나 기능 추가 대상이 아님.
+
+## Open Questions
+
+- **Register File 용량 모델**: fetch unit이 register에 로드할 때 용량 제한을
+  모델링할지. 용량은 바이트 단위(register_file_bytes)로 표현하며,
+  동시에 보유 가능한 tile 수는 tile 크기에 따라 결정된다.
+  용량 초과 시 fetch가 stall되어 자연스러운 backpressure가 발생한다.
+- **Prefetch 전략**: 본 ADR에서는 composite command 간 tile feed interleaving을
+  허용하지 않는다. 따라서 overlap은 command 간 선행 투입이 아니라,
+  같은 command 내부 tile들의 pipeline progression에서 자연스럽게 발생한다.
+  추가적인 prefetch가 필요하면 command 간 투입이 아니라, 같은 command 내부에서의
+  tile ordering 또는 fetch/store unit policy 차원에서 검토한다.
+- **PE_DMA coalescing**: tile 단위 DMA는 fragmentation 발생 가능.
+  DMA 내부에서 merge/coalesce하되 scheduler는 관여하지 않는 방향.
+- **동기 실행 모드**: 본 ADR에서는 비동기 pipeline을 기본/유일 execution model로
+  채택한다. 디버그 또는 validation 목적의 sync mode가 필요하면 future ADR에서 검토.
+- **다중 pipeline 간 TCM bank conflict**: 현재 단일 pipeline 기준.
+  다중 pipeline이 동시에 TCM에 접근할 때의 bank conflict 모델은 future work.
+
+---
+
+## Consequences
+
+### 긍정적
+
+- 각 블록이 독립 컴포넌트 — 개별 교체 가능 (ADR-0015 준수)
+- topology에서 PE 내부 구조 가시화
+- 컴포넌트가 다음 컴포넌트를 모름 — plan 기반 라우팅으로 유연성 확보
+- DMA와 compute의 자연스러운 파이프라인 overlap (SimPy Store backpressure)
+- HW 모델링 정확도 향상 (done signal = Event, data transfer = message)
+- fetch/store 분리로 TCM BW 경합 정확히 모델링
+
+### 부정적
+
+- PE 내부 컴포넌트 수 증가 (5 → 6) — topology 노드/edge 증가
+- 컴포넌트 분리로 인해 intra-PE token forwarding이 이전 대비 더 명시적으로 드러남
+- 기존 builtin/pe_accel과의 breaking change — 마이그레이션 필요
+
+---
+
+## 영향받는 파일
+
+| 파일 | 변경 |
+|------|------|
+| `topology.yaml` | pe_fetch_store 컴포넌트 추가, 체이닝 edge 추가 |
+| `components.yaml` | 새 builtin 컴포넌트 등록 |
+| `src/kernbench/topology/builder.py` | PE 내부 edge에 fetch_store + 체이닝 edge 추가 |
+| `src/kernbench/common/pe_commands.py` | TileToken 정의 추가 |
+| `src/kernbench/components/builtin/pe_scheduler.py` | 재구현 (feeder + plan 기반 dispatch) |
+| `src/kernbench/components/builtin/pe_gemm.py` | 재구현 (TileToken, _process 패턴) |
+| `src/kernbench/components/builtin/pe_math.py` | 재구현 (TileToken, _process 패턴) |
+| `src/kernbench/components/builtin/pe_dma.py` | 재구현 (TileToken, _process 패턴) |
+| `src/kernbench/components/builtin/pe_fetch_store.py` | 신규 |
+| `src/kernbench/components/builtin/pe_tcm.py` | 재구현 (TcmRequest 서비스) |
+| `src/kernbench/components/builtin/types.py` | 신규: TilePlan, Stage, StageType, PipelineContext, TileToken |
+| `src/kernbench/components/builtin/tiling.py` | pe_accel에서 이식: plan 생성 로직 |
+
+백업:
+
+| 경로 | 설명 |
+|------|------|
+| `src/kernbench/components/builtin_legacy/` | 기존 builtin 전체 백업 (수정 없이 보관) |
+| `src/kernbench/components/custom/pe_accel/` | 기존 pe_accel 백업 (수정 없이 보관) |
@@ -0,0 +1,90 @@
+# ADR-0022: 2D Grid program_id Semantics
+
+- **Status**: Accepted
+- **Date**: 2026-04-09
+- **Context**: Triton-style kernel addressing for multi-cube PE topology
+
+## Problem
+
+Triton kernels use `tl.program_id(axis)` to identify their position in a launch grid.
+Our hardware has a 2-level hierarchy: **cubes** contain **PEs**.
+The previous implementation ignored the `axis` parameter and always returned a flat PE index,
+making it impossible for kernels to distinguish their cube-local position from their cube identity.
+
+## Decision
+
+Map `tl.program_id` and `tl.num_programs` to the 2D hardware grid:
+
+| Call | Returns | Description |
+|------|---------|-------------|
+| `tl.program_id(axis=0)` | `local_pe_id` | PE index within cube |
+| `tl.program_id(axis=1)` | `cube_id` | Cube index |
+| `tl.num_programs(axis=0)` | `num_pes_per_cube` | PEs per cube |
+| `tl.num_programs(axis=1)` | `num_cubes` | Number of cubes per SIP (`cubes_per_sip`) |
+
+Global PID is derived as:
+
+```python
+global_pid = tl.program_id(axis=1) * tl.num_programs(axis=0) + tl.program_id(axis=0)
+```
+
+### Axis mapping rationale
+
+- **axis=0 = PE (innermost)**: PEs within a cube share HBM and communicate via local NOC mesh. This is the fast, tightly-coupled dimension — analogous to threads within a block.
+- **axis=1 = Cube (outer)**: Cross-cube communication goes through UCIe with higher latency. This is the coarser scheduling dimension — analogous to blocks in a grid.
+
+## Implementation
+
+### TLContext (`triton_emu/tl_context.py`)
+
+Added `cube_id` and `num_cubes` constructor parameters. `program_id()` and `num_programs()` dispatch on `axis`:
+
+```python
+def program_id(self, axis: int = 0) -> int:
+    if axis == 1:
+        return self._cube_id
+    return self._pe_id
+
+def num_programs(self, axis: int = 0) -> int:
+    if axis == 1:
+        return self._num_cubes
+    return self._num_programs
+```
+
+### PE_CPU (`components/builtin/pe_cpu.py`)
+
+- Extracts `num_cubes` from `ctx.spec["system"]["sips"]["cubes_per_sip"]`
+- Passes `cube_id` (already available as `self._cube_idx`) and `num_cubes` to TLContext
+
+### KernelRunner (`triton_emu/kernel_runner.py`)
+
+- Receives `num_cubes` from PE_CPU
+- Passes `cube_id` and `num_cubes` to TLContext in greenlet mode
+
+## Backward Compatibility
+
+- Existing code using `tl.program_id(0)` or `tl.program_id()` is unchanged — returns the same PE index as before.
+- `cube_id` and `num_cubes` default to `0` and `1`, so callers that don't provide them (e.g. unit tests) continue to work.
+
+## Usage Example
+
+```python
+def sharded_gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl):
+    local_pid = tl.program_id(axis=0)      # PE within cube
+    cube_id   = tl.program_id(axis=1)      # which cube
+    global_pid = cube_id * tl.num_programs(axis=0) + local_pid
+
+    # Column-wise sharding across global PID
+    n_per_pid = N // (tl.num_programs(axis=1) * tl.num_programs(axis=0))
+    col_start = global_pid * n_per_pid
+
+    a = tl.load(a_ptr, shape=(M, K), dtype="f16")
+    b = tl.ref(b_ptr + col_start * K * 2, shape=(K, n_per_pid), dtype="f16")
+    h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr + col_start * M * 2)
+    tl.wait(h)
+```
+
+## Consequences
+
+- Benchmarks can now express cube-aware sharding and addressing without hardcoding topology dimensions.
+- Future axis=2 (SIP-level) can be added following the same pattern if needed.
@@ -0,0 +1,866 @@
+# ADR-0023: PE-level IPCQ — Inter-PE Collective Communication
+
+## Status
+
+Proposed
+
+## Context
+
+### Goal
+
+Add the infrastructure that lets CCL (Collective Communication Library)
+kernels run **inside** a PE. The host just launches a kernel on each
+SIP; the actual synchronization and data movement happen **inside the
+PE kernel via an IPCQ (Inter-Process Communication Queue)**.
+
+This mirrors how NCCL performs NVLink communication inside a GPU
+kernel, or how Cerebras / Tenstorrent expose core-local communication
+queues. Host-level collectives (`dist.all_reduce`) are deferred to
+**future work**; this ADR focuses solely on the kernel-side collective
+infrastructure.
+
+### Current state
+
+- ADR-0021 PE pipeline refactor: each PE is decomposed into components
+  (PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH,
+  PE_TCM, PE_MMU).
+- No direct PE-to-PE channel exists today. All data movement goes
+  through PE_DMA → cube_noc / UCIe / PCIE → HBM.
+- A pre-ADR host CCL skeleton exists (`dist.init_process_group(backend="ahbm")`,
+  `_run_ccl_bench` running per-rank greenlets concurrently). The
+  collective itself is a stub.
+
+### Problems to solve
+
+1. PE-to-PE direct data movement (writing into a peer's memory).
+2. Synchronization — the sender must check that the receiver has space
+   in its buffer (backpressure).
+3. Resource contention between compute traffic and communication
+   traffic (Head-of-Line blocking).
+4. The host must be able to construct logical neighbor topologies
+   (ring / mesh / tree) per algorithm.
+
+---
+
+## Decision
+
+### D1. Add a new `PE_IPCQ` component
+
+A new component `PE_IPCQ` is added inside each PE. It follows the same
+pattern as PE_GEMM / PE_MATH — modeling a sub-block of the PE as a
+distinct component.
+
+```
+PE
+├── PE_CPU
+├── PE_SCHEDULER
+├── PE_DMA
+├── PE_IPCQ          ← new
+├── PE_FETCH_STORE
+├── PE_GEMM
+├── PE_MATH
+├── PE_TCM
+├── PE_MMU
+```
+
+**Role separation** (control plane vs. data plane):
+
+- **PE_IPCQ (control plane)**: ring-buffer address arithmetic, head /
+  tail pointer management, peer pointer caches, backpressure, 4-direction
+  neighbor mapping.
+- **PE_DMA (data plane)**: actually moves data through cube_noc / UCIe
+  / PCIE into the peer's memory.
+
+PE_IPCQ does **not** move data itself — it delegates to PE_DMA.
+
+### D2. Ring buffer model
+
+Each PE owns 4 directions (N/S/E/W) × {tx, rx} = 8 ring buffers.
+
+```python
+@dataclass
+class IpcqQueuePair:
+    direction: Direction          # N/S/E/W
+    peer: IpcqEndpoint            # set by host at init time (D2.5)
+    tx_buffer_base: int           # outgoing data base addr (in our memory)
+    rx_buffer_base: int           # incoming data base addr (in our memory)
+    slot_size: int                # 1 tile per slot
+    n_slots: int                  # ring depth
+    my_head: int                  # next slot we will write/send into
+    my_tail: int                  # next slot we will read/recv from
+    peer_head_cache: int          # peer's last-seen head (updated via D9 piggyback)
+    peer_tail_cache: int          # peer's last-seen tail (updated via D9 fast-path credit)
+```
+
+**Canonical field names**: throughout this ADR the four names above
+(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`) are used
+consistently. Synonyms (`peer_head_local`, `peer_head`, `peer_tail`,
+etc.) are not used.
+
+| Field | Owner | Updated when |
+|-------|-------|--------------|
+| `my_head` | local PE_IPCQ | immediately after `tl.send` (send tracking) |
+| `my_tail` | local PE_IPCQ | immediately after `tl.recv` (recv tracking) |
+| `peer_head_cache` | local PE_IPCQ | on `IpcqMetaArrival` (D9 piggyback) |
+| `peer_tail_cache` | local PE_IPCQ | on `IpcqCreditMetadata` (D9 fast path) |
+
+**Slot unit**: fixed-size, one slot holds one full tile (no descriptor
+indirection). Full data embedded in the slot. See D5.
+
+### D2.5. `IpcqEndpoint` schema
+
+`IpcqQueuePair.peer` carries everything the sender needs to compute the
+peer's rx slot address:
+
+```python
+@dataclass(frozen=True)
+class IpcqEndpoint:
+    sip: int
+    cube: int
+    pe: int
+    buffer_kind: str             # "tcm" | "hbm" | "sram"
+    rx_base_pa: int              # peer rx_buffer base PA (PhysAddr.encode())
+    rx_base_va: int              # peer rx_buffer base VA (optional, MMU mode)
+    n_slots: int                 # peer ring depth (for wrap-around)
+    slot_size: int               # peer slot size (for offset)
+```
+
+Address computation:
+
+```python
+slot_idx = self.my_head % peer.n_slots
+dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
+```
+
+PE_IPCQ passes `dst_pa` to PE_DMA inside an `IpcqDmaToken`. PE_DMA
+(vc_comm) routes the data to `dst_pa` through the fabric.
+
+**Endpoint construction order**: at backend init (D10), the IPCQ
+buffers for **every PE** are allocated first (so each rank knows the
+others' PA), then the per-rank neighbor tables are built and pushed to
+PE_IPCQ via `IpcqInitMsg`.
+
+### D3. Four-direction mapping ≡ logical ProcessGroup
+
+The PE views four directions (N/S/E/W) as logical ports. Real peer
+addresses are configured by the host CCL init, per the chosen
+algorithm. The PE kernel never knows the topology, only directions.
+
+```python
+# 1D ring
+for rank in range(world_size):
+    ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size])
+    ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size])
+
+# 2D mesh with wrap-around (i.e. a 2D torus)
+for r in range(R):
+    for c in range(C):
+        ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c))
+        ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c))
+        ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C))
+        ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C))
+```
+
+The PE code does not need to know where `tl.send(dir="E", ...)` actually
+ends up.
+
+### D4. PE kernel API
+
+```python
+# Send (blocking; may stall on backpressure)
+tl.send(dir: str, src=TensorHandle)
+tl.send(dir: str, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)
+
+# Recv (blocking)
+recv = tl.recv(dir: str, shape=..., dtype=...)
+recv = tl.recv(shape=..., dtype=...)        # round-robin across 4 directions
+
+# Recv (non-blocking)
+fut  = tl.recv_async(dir: str, shape=..., dtype=...)
+recv = tl.wait(fut)
+```
+
+`tl.recv()` (no direction) keeps a `last_polled_dir` cursor and on each
+call rotates through directions, returning the first available slot.
+Empty in all 4 directions → wait.
+
+**Fairness is weak**: the rotating start mitigates simple bias, but if
+one direction always wins the race the others can starve. Algorithms
+that need strict fairness must call `tl.recv(dir=...)` explicitly.
+
+### D5. Single-hop DMA write + full-data slot model
+
+Data moves from sender memory into the receiver's ring slot in **one
+DMA transfer**. Key properties:
+
+- **Single-hop**: the sender already knows the peer rx slot address and
+  fires one fabric DMA into it.
+- **No CPU memcpy**: the CPU never copies data.
+- **No intermediate staging**: neither side keeps a separate staging
+  buffer (sender uses the source addr directly; receiver gets the data
+  in its ring slot directly).
+
+(Strictly speaking the fabric DMA write does happen, so this is not
+literally "no data movement" — it's the same property NCCL labels
+"zero-copy", meaning no CPU memcpy and no staging copy.)
+
+```
+PE A: tl.send(E, src_addr, nbytes)
+  1. IPCQ computes the peer rx slot address:
+       dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size
+  2. Backpressure: my_head - peer_tail_cache < peer.n_slots ?
+                   (full → sleep / poll)
+  3. Submit DMA on PE_DMA(vc_comm): src_addr → peer dst_addr, nbytes
+  4. my_head += 1
+
+PE B: data = tl.recv(W)
+  1. Look at rx_buffer[my_tail % n_slots]
+  2. Wait for the data to arrive (D7 backpressure mode)
+  3. Return the slot address to the kernel (or fetch into register file)
+  4. my_tail += 1
+  5. Issue a credit return over the fast path (D9): after the
+     bottleneck-BW latency elapses, PE A's peer_tail_cache is updated.
+```
+
+The slot holds the full tile. The receiver only reads its own
+rx_buffer; it never reads back into A's memory. The sender knows the
+peer rx slot address and DMAs directly into it (single-hop).
+
+The PE's own PE_TCM read/write does not go through DMA (PE_TCM is local
+to the PE).
+
+### D6. Buffer placement — three-way benchmark
+
+The host CCL init picks the IPCQ ring-buffer location:
+
+```python
+ipcq_init(
+    backend="ahbm",
+    buffer_kind="tcm" | "hbm" | "sram",
+    n_slots=8,
+    slot_size=4096,
+)
+```
+
+| Location | Trait | Trade-off |
+|----------|-------|-----------|
+| **PE_TCM** | Attached to the PE; fast | Small; competes with PE-internal resources |
+| **PE-local HBM** | Large; via DMA | Higher latency |
+| **Cube SRAM** | Mid-size; cube-shared | Cube-internal contention |
+
+All three locations run the same kernel code; only the init differs.
+
+### D7. Backpressure — two-mode benchmark
+
+How the sender waits when the peer's slots are full, and how the
+receiver waits when data has not yet arrived:
+
+| Mode | Behavior | Model |
+|------|----------|-------|
+| **poll** | Periodically re-check the cached peer pointer | Spin loop |
+| **sleep** | Yield a SimPy event; wake on a peer-trigger | Interrupt-like |
+
+```python
+ipcq_init(backpressure="poll" | "sleep", ...)
+```
+
+Both modes are implemented so latency / throughput trade-offs can be
+benchmarked.
+
+### D8. PE_DMA virtual channels
+
+Extend PE_DMA from a single queue into a **two-channel virtual-channel**
+model.
+
+```
+PE_DMA
+├── vc_compute: tile load / store / writeback for GEMM and Math
+└── vc_comm:    IPCQ send data
+```
+
+Each VC has an independent state machine:
+
+- One channel stalling does not block the other.
+- The same physical link (cube_noc, UCIe, …) is shared, but link BW is
+  split between channels.
+
+**Chunk-level interleave**:
+
+- Large GEMM tile DMAs do not lock the link end-to-end.
+- Progress happens in chunks (e.g. 256 B); each chunk shares link BW
+  with the other VC's pending chunks.
+- Chunk size is an init parameter (smaller = fairer, larger = more
+  efficient).
+
+Net effect:
+
+- HoL blocking is eliminated (an IPCQ send can interleave with a long
+  compute DMA).
+- Compute / comm overlap is natural (NVIDIA copy-engine + compute-SM
+  pattern).
+- Matches the NoC-virtual-channel pattern used in real HW.
+
+**First-implementation accuracy limit (intentional)**: this ADR's
+first cut uses **deterministic chunk-level interleave + weighted
+round-robin arbitration** (default 50 / 50, exposed in `ccl.yaml`).
+This is a first-order approximation and is simpler than real HW
+dynamic-contention / credit-based arbiters. Functional correctness is
+unaffected, but heavy-contention scenarios may report slightly
+optimistic latency vs. real HW. A separate ADR can add a NoC arbiter
+component later if more precision is needed.
+
+#### Token routing
+
+- Compute tokens (`TileToken`) — go through the existing
+  PE_FETCH_STORE → PE_DMA chain.
+- Communication tokens (`IpcqDmaToken`, new) — PE_IPCQ → PE_DMA
+  self-routing.
+- PE_DMA picks the channel by token type.
+
+```python
+class PeDmaComponent:
+    def _process(self, env, token):
+        if isinstance(token, IpcqDmaToken):
+            yield from self._vc_comm_process(env, token)
+        else:
+            yield from self._vc_compute_process(env, token)
+```
+
+### D9. Pointer synchronization — DMA payload piggyback
+
+Real HW (NVLink, UCIe, etc.) piggybacks metadata onto DMA payloads so
+pointers update along with the data. This simulation adopts the same
+model: **no separate control channel** — metadata travels with the
+data.
+
+The big benefits:
+
+- **Automatic ordering**: data and metadata move on the same token, so
+  data is visible **before** the head_cache update. No race.
+- **HW fidelity**: matches NVLink / UCIe piggybacked headers.
+- **Component simplification**: no separate `IpcqPtrUpdate` event type.
+
+#### Send flow (head update via piggyback)
+
+```
+PE A: tl.send(E, src_addr, nbytes)
+  1. PE_IPCQ checks backpressure (using peer_tail_cache)
+  2. PE_IPCQ creates an IpcqDmaToken:
+       - data body (src_addr → peer dst_addr)
+       - piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction)
+  3. Hand the token to PE_DMA(vc_comm)
+  4. PE A increments my_head (send tracking)
+
+[fabric DMA: latency elapses]
+
+PE B's PE_DMA receives the token
+  5. Writes data into dst_addr (B's rx slot) via MemoryStore.write
+  6. Forwards token metadata to PE B's PE_IPCQ (PE-internal wire, ~1 cycle)
+
+PE B's PE_IPCQ receives the metadata
+  7. Updates peer_head_cache (= A's head)
+  8. Wakes any pending recv on that direction
+```
+
+**Steps 5 and 6 must execute in the same SimPy step** — DMA completion
+makes data and metadata atomically visible.
+
+#### Recv flow (credit return — fast path with bottleneck-BW latency)
+
+When the receiver frees a slot, the sender must learn about it
+(backpressure release). Unlike data, the credit return does **not**
+travel through general vc_comm fabric — it uses a **separate fast
+path**, an abstraction of the NVLink / UCIe credit-return wire.
+
+**Latency** is computed from the **bottleneck BW on the path**, not a
+magic constant:
+
+```
+credit_size_bytes = 16  (ccl.yaml: ipcq_credit_size_bytes)
+path = router.find_path(self_pe, peer_pe)
+latency = compute_drain_ns(path, credit_size_bytes)
+        = credit_size_bytes / bottleneck_bw_on_path
+```
+
+That gives us:
+
+- **Topology-proportional approximation**: an in-cube credit return is
+  automatically faster than a cross-SIP credit return.
+- **No magic constants**: no arbitrary `ipcq_ctrl_latency_ns`.
+- **No deadlock risk**: unlike piggyback, B can issue credit even when
+  it has no data to send back.
+- **Reuses existing utility**: `ComponentContext.compute_drain_ns`.
+
+#### Component coupling — SimPy Store channel
+
+PE B's PE_IPCQ does not call PE A's PE_IPCQ directly. Instead, at init
+time, **a SimPy Store is wired between the two** (a per-direction
+fast-path channel) and credit metadata is `put` into that store.
+
+```python
+class PeIpcqComponent:
+    def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns):
+        yield env.timeout(latency_ns)
+        yield peer_credit_store.put(IpcqCreditMetadata(consumer_seq=my_tail, ...))
+```
+
+Backend init wires both directions of the fast-path channel as part of
+fan-out (see `IpcqInitMsg` in D12).
+
+#### Credit-return fast path limitations
+
+- `credit_size_bytes` is an estimate (typically 16–64 bytes).
+- The fast path is **excluded from vc_comm BW contention** (separate
+  wire). Real HW credit-return wires are very lightweight, so this is a
+  reasonable first approximation.
+- A follow-up ADR can: model the credit fast path as a separate link
+  (BW limit + contention), or switch to piggyback (`credit_return_mode:
+  piggyback`).
+
+#### PE_DMA's added responsibility
+
+When `vc_comm` receives a token, PE_DMA processes it as the following
+**atomic** sequence. **No SimPy yield is allowed between the two steps**
+(invariant I6):
+
+```python
+def _on_vc_comm_recv(self, env, token):
+    # ── ATOMIC: no yield between these two operations ──
+    # 1. Copy the payload into the receiver's rx slot
+    data = self._memory_store.read(token.src_space, token.src_addr,
+                                   shape=..., dtype=...)
+    self._memory_store.write(token.dst_endpoint.buffer_kind,
+                             token.dst_addr, data)
+    # 2. Forward metadata to the local PE_IPCQ
+    yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token))
+    # ───────────────────────────────────────────────────
+```
+
+The final `put` is yieldable but uses an unbounded internal store, so
+it completes in a single step. That `put` is the closing call of the
+atomic block; no other `yield` may be inserted between the
+`MemoryStore.write` and that `put`.
+
+### D9.5. ADR-0020 (2-pass) integration
+
+`tl.send` / `tl.recv` integrate with ADR-0020's two-pass model. Phase
+1 simulates timing **and** moves data via MemoryStore; Phase 2 enables
+op-log-based correctness verification.
+
+#### Phase 1 (timing + data)
+
+D9 models head and tail updates with two different mechanisms:
+
+- **Send-side (head update)** — DMA payload piggyback. Data write and
+  metadata forward happen in the same SimPy step → automatic atomic
+  visibility.
+- **Recv-side (tail credit return)** — fast-path SimPy Store channel
+  with bottleneck-BW latency, then `peer_tail_cache` update.
+
+Together they preserve ring-buffer pointer consistency.
+
+The op-log records `op_kind="ipcq"` entries for sends (with
+`src/dst/space/addr/nbytes/dir/dtype/shape/sender_seq`) and recvs (with
+`recv_mode/src/dst/space/addr/nbytes/dir/dtype/shape/consumer_seq`).
+Two recv modes:
+
+- **`return_slot`** (default): the slot address is returned to the
+  kernel. Zero-copy.
+- **`copy_to_dst`**: when the kernel passes `dst_addr` + `dst_space`,
+  PE_IPCQ copies the slot data into the user dst.
+
+#### Phase 2 (op_log replay)
+
+When `DataExecutor` encounters an `op_kind="ipcq"` record:
+
+- **send**: idempotent `src → dst` ndarray write.
+- **recv (`return_slot`)**: no-op (the slot already holds the data).
+- **recv (`copy_to_dst`)**: idempotent `slot → dst_addr` copy.
+
+IPCQ ops are pure data movement — Phase 2 has nothing extra to compute.
+The downstream GEMM / Math ops in `DataExecutor` will consume the data
+and naturally validate correctness.
+
+### D10. Host CCL init keeps the PyTorch shape
+
+The host code looks just like real PyTorch DDP. `init_process_group`
+creates the backend object; it does **not** receive IPCQ knobs
+(neighbor topology, buffer_kind, backpressure …).
+
+```python
+# benches/ccl_allreduce.py — same shape as real PyTorch
+def worker(rank, world_size, torch):
+    dist = torch.distributed
+    dist.init_process_group(backend="ahbm")  # reads ccl.yaml + topology
+    tensor = torch.zeros((1, world_size * N_ELEM), dtype="f16", dp=...)
+    tensor.copy_(torch.from_numpy(init))
+    dist.all_reduce(tensor, op="sum")
+```
+
+The IPCQ configuration is decided by the backend at
+`init_process_group` time: it loads `ccl.yaml`, picks the algorithm,
+and pushes IPCQ neighbor tables to every participating PE_IPCQ. The
+host code never has to know about IPCQ.
+
+A bench runs one algorithm, chosen via `ccl.yaml`'s `defaults.algorithm`.
+Switching algorithms is purely a `ccl.yaml` change — no host edits
+required.
+
+#### Init flow (eager)
+
+1. `init_process_group(backend="ahbm")` is called.
+2. Backend loads `ccl.yaml` → resolves `defaults.algorithm`.
+3. Pulls topology + buffer_kind + backpressure + slot config from
+   `algorithms[<algo>]`.
+4. **Immediately** installs neighbor tables on every PE_IPCQ
+   (sideband or fabric `IpcqInitMsg`).
+5. Subsequent `torch.launch(kernel_name, ...)` calls behave normally —
+   PE_IPCQ is already prepared whether the kernel is a CCL kernel or
+   not.
+
+### D11. CCL config file (`ccl.yaml`)
+
+IPCQ config and algorithm metadata live in a separate YAML file,
+following the same pattern as `components.yaml` and `topology.yaml`.
+
+A single benchmark execution runs one algorithm
+(`defaults.algorithm`). Switching algorithms means editing
+`defaults.algorithm` only.
+
+```yaml
+defaults:
+  algorithm: ring_allreduce_tcm
+  buffer_kind: tcm                # tcm | hbm | sram
+  backpressure: sleep             # poll | sleep
+  n_slots: 8
+  slot_size: 4096
+  vc_chunk_size: 256
+  ipcq_credit_size_bytes: 16
+
+algorithms:
+  ring_allreduce_tcm:
+    module: kernbench.ccl.algorithms.ring_allreduce
+    topology: ring_1d             # builtin name or "custom"
+    buffer_kind: tcm
+    n_elem: 8                     # optional, per-algorithm tile width
+
+  tree_allreduce_7:
+    module: kernbench.ccl.algorithms.tree_allreduce
+    topology: tree_binary
+    buffer_kind: tcm
+    world_size: 7                 # algorithm-level override
+    n_elem: 16
+
+  custom_mesh:
+    module: kernbench.ccl.algorithms.custom_mesh
+    topology: custom              # the module supplies its own neighbors()
+```
+
+`world_size` is **not set in `defaults`**. The backend resolves it via:
+`algorithm-level override > defaults override > topology spec`. The
+last fallback (`sips × cubes_per_sip × pes_per_cube`) mirrors real DDP
+where `WORLD_SIZE` comes from env vars rather than config files.
+
+#### Algorithm module structure
+
+Each algorithm module exports two hooks — `kernel` (required) and
+`neighbors` (optional) — plus a `kernel_args` helper that the
+backend uses to populate positional kernel arguments at `all_reduce`
+time:
+
+```python
+# src/kernbench/ccl/algorithms/ring_allreduce.py
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    return (n_elem, world_size)
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    """Required — the PE kernel.
+
+    IPCQ is already installed by the backend before this is called.
+    The kernel only uses the four-direction send / recv API.
+    """
+    ...
+
+
+def neighbors(rank, world_size, neighbor_map):
+    """Optional — override the builtin topology's neighbor map.
+
+    Returns a new dict, the modified-in-place dict, or None to keep the
+    builtin map.
+    """
+    return None
+```
+
+#### `neighbors` override patterns
+
+- **Pattern A — tweak a builtin**: drop a direction for some ranks, etc.
+- **Pattern B — replace entirely**: ignore `neighbor_map` and return a
+  brand-new dict.
+- **Pattern C — keep builtin**: omit `neighbors` or return None.
+
+#### Builtin topologies
+
+| topology | direction set |
+|----------|---------------|
+| `ring_1d` | E, W |
+| `ring_1d_unidir` | E only |
+| `mesh_2d` | N, S, E, W |
+| `tree_binary` | parent, child_left, child_right |
+| `none` | (empty) — algorithm must supply `neighbors()` |
+
+#### Adding a new algorithm
+
+1. Write `kernel` and `kernel_args` in
+   `src/kernbench/ccl/algorithms/<algo>.py`.
+2. Add an entry in `ccl.yaml`'s `algorithms` section.
+3. (Optional) provide `neighbors()` for custom topology.
+4. Set `defaults.algorithm` to the new algorithm.
+
+The host bench (`benches/ccl_allreduce.py`) does not change.
+
+### D12. Message / token schema
+
+The new message types added by this ADR. They live in
+`src/kernbench/common/pe_commands.py` and
+`src/kernbench/runtime_api/kernel.py`.
+
+#### `IpcqInitMsg` (sideband, fan-out at init)
+
+The backend pushes neighbor tables to every PE_IPCQ. Structure mirrors
+`MmuMapMsg` (`target_sips`, `target_cubes`, `target_pe`, `entries`).
+Each `IpcqInitEntry` has `direction`, `peer: IpcqEndpoint`,
+`my_rx_base_pa/va`, `n_slots`, `slot_size`, plus a `peer_credit_store`
+field — a `simpy.Store` instance pre-wired so the local PE_IPCQ (the
+data receiver) can push `IpcqCreditMetadata` directly into the peer
+PE_IPCQ's (the data sender's) input queue.
+
+#### `IpcqSendCmd` (PE_CPU → PE_IPCQ)
+
+Carries `direction`, source addr/space, nbytes, shape, dtype, and a
+handle id. `data_op=True` so it lands in the op_log.
+
+#### `IpcqRecvCmd` (PE_CPU → PE_IPCQ)
+
+Carries `direction` (or None for round-robin), `recv_mode`
+(`return_slot` / `copy_to_dst`), optional `dst_addr/dst_space`, shape,
+dtype, blocking flag.
+
+#### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm channel)
+
+Per D9 piggyback: the token carries the data (`src/dst/space/nbytes`)
+plus the head metadata (`sender_seq`, `src_sip/cube/pe`,
+`src_direction`). PE_DMA picks the channel by token type
+(`IpcqDmaToken → vc_comm`, `TileToken → vc_compute`).
+
+The receiver's PE_DMA, on token arrival, performs the I6 atomic
+sequence: write data into MemoryStore, then forward `IpcqMetaArrival`
+to the local PE_IPCQ.
+
+#### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path)
+
+Carries `consumer_seq` (= my_tail), source PE coords, and source
+direction. Travels through the dedicated SimPy Store channel rather
+than `vc_comm`. Latency = `credit_size_bytes / bottleneck_bw_on_path`.
+
+There is **no `IpcqPtrUpdate` event** — head updates flow via D9
+piggyback, tail updates via the D9 fast-path channel.
+
+### D13. Test strategy
+
+Following the ADR-0021 D8 pattern.
+
+#### T1. Unit tests (component-level)
+
+- **PE_IPCQ** (`tests/test_pe_ipcq.py`): send without backpressure
+  immediately forwards a token; full peer slot triggers backpressure
+  (poll / sleep modes); recv waits, wakes on `IpcqMetaArrival`;
+  round-robin recv weak fairness; bad direction → `IpcqInvalidDirection`.
+- **PE_DMA virtual channels** (`tests/test_pe_dma_vc.py`): `vc_compute`
+  / `vc_comm` independent progress, chunk interleave, BW split.
+- **Builtin topology** (`tests/test_ccl_topologies.py`): ring_1d /
+  mesh_2d / tree_binary correctness, mesh_2d non-square →
+  `ValueError`, custom resolver returns the module's `neighbors`.
+
+#### T2. Integration tests (E2E send/recv)
+
+- **`tests/test_ipcq_e2e.py`**: 2-rank ring, 4-rank ring (bidirectional
+  no-deadlock), 4×4 mesh.
+- **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`): greenlet mode
+  records `ipcq` ops in op_log; DataExecutor produces correct
+  `out.data`.
+
+#### T3. Backend init (`tests/test_ccl_backend_ipcq.py`)
+
+`ccl.yaml` load, builtin topology → `IpcqInitMsg` fan-out, endpoint PA
+consistency, per-`buffer_kind` allocation.
+
+#### T4. Regression
+
+All existing tests pass; ADR-0020 op_log / DataExecutor unaffected for
+non-CCL benches.
+
+#### T5. Performance / overhead
+
+Single send/recv pair latency = (DMA latency) + (IPCQ overhead).
+Should be close to a regular PE_DMA write of the same nbytes (IPCQ
+overhead < 100 ns).
+
+### D14. Invariants and failure modes
+
+#### Invariants
+
+I1. **Slot lifecycle exactly-once**: one send → exactly one recv.
+I2. **Pointer monotonicity**: `my_head` / `my_tail` are monotonically
+   non-decreasing; `sender_seq` is strictly increasing.
+I3. **Endpoint consistency**: if rank A's `direction=E` peer is rank
+   B, then rank B's reverse-direction peer must be rank A. Verified at
+   init.
+I4. **`buffer_kind` consistency**: all PEs in a process group share
+   the same `buffer_kind` (no mixed mode in the first cut).
+I5. **op_log ordering**: send → DMA complete → recv possible. The
+   t_start order in op_log respects this causality.
+I6. **Atomic data + metadata visibility (MUST)**: at the receiver
+   side, data write (`MemoryStore.write`) and metadata forward
+   (`peer_head_cache` update) **must execute in the same SimPy step**.
+   No yield is allowed between the two operations in PE_DMA's vc_comm
+   handler. Code review must reject any inserted `yield` (or `yield
+   from`) — it would create a race where head_cache becomes visible
+   before or after the data.
+I7. **MemoryStore slot existence ↔ pointer**: as a consequence of I6,
+   the step in which `peer_head_cache > my_tail` becomes true is the
+   same step in which the slot data becomes observable.
+
+#### Failure modes (runtime errors)
+
+F1. **Bad direction**: `tl.send(dir="X")` for an uninstalled direction
+   → `IpcqInvalidDirection`, simulation aborts.
+F2. **Type mismatch**: dtype/shape/nbytes disagreement between matched
+   send and recv. Not validated by default; opt-in strict mode catches
+   it (`strict_validation: true` on a PE_IPCQ node attrs).
+F3. **Deadlock detection (timeout-based)**: the simulator empties its
+   schedule while a send/recv is still pending → engine raises
+   `IpcqDeadlock` and embeds a pointer dump.
+F4. **Backend init failure**: missing `defaults.algorithm`, missing
+   `algorithms[name]`, module import failure, topology validation
+   failure (I3, I4) — all raised at `init_process_group` time.
+F5. **Slot full + infinite backpressure**: the peer never recvs.
+   Surfaces as F3 timeout.
+
+#### Diagnostics
+
+- **CCL trace**: `KERNBENCH_CCL_TRACE=1` logs each send/recv as
+  `(rank, t, dir, nbytes)`.
+- **Pointer dump**: `kernbench.ccl.diagnostics.pointer_dump(engine)`
+  prints every PE_IPCQ ring buffer's `my_head`, `my_tail`,
+  `peer_head_cache`, `peer_tail_cache`.
+- **Deadlock dump**: on hang the engine includes the pointer dump in
+  the `IpcqDeadlock` exception message.
+
+### D15. Algorithm-author cheat sheet
+
+Full step-by-step lives in
+[`docs/ccl-author-guide.en.md`](../ccl-author-guide.en.md). The
+shortest version:
+
+| Things you touch | Things you don't |
+|------------------|-------------------|
+| `src/kernbench/ccl/algorithms/<your_algo>.py` (`kernel`, `kernel_args`, optional `neighbors`) | `benches/ccl_allreduce.py` host code |
+| One entry in `ccl.yaml` + optionally `defaults.algorithm` | `src/kernbench/ccl/` framework |
+| (Optional) `tests/test_<your_algo>.py` mock test | PE_IPCQ component, AhbmCCLBackend |
+
+5-step flow: write the kernel → register in `ccl.yaml` → optional
+`neighbors` override → optional mock unit test → SimPy validation via
+`kernbench run --bench ccl_allreduce --verify-data`.
+
+Common mistakes: using a direction that wasn't installed, sends
+without matching recvs (deadlock), dtype/shape disagreement, assuming
+fairness from `tl.recv()` round-robin, confusing
+`tl.num_programs(axis)` with the CCL group size.
+
+---
+
+## Non-goals
+
+- **Host collective**: a model where `dist.all_reduce` itself moves
+  data on the host side is out of scope. This ADR only covers
+  communication that happens inside the PE kernel.
+- **All-reduce algorithms**: ring / tree / etc. live in algorithm
+  modules and can be added without amending this ADR.
+- **Reliability / error handling**: link faults, send/recv failure
+  recovery, etc. are out of scope.
+- **NoC arbiter precision**: dynamic VC contention is left for a future
+  ADR (see D8).
+
+---
+
+## Open questions
+
+- **VC arbitration accuracy** — the first cut uses deterministic
+  chunk interleave + weighted round-robin; heavy contention may report
+  optimistic latency. A NoC arbiter component can be added later.
+- **Credit return BW model** — the fast path is currently outside the
+  fabric BW contention model. Can be modeled as a separate link or
+  switched to piggyback (`credit_return_mode: piggyback`).
+- **Ring buffer slot allocation metadata** — whether the host pushes
+  IPCQ buffer metadata via sideband or via a fabric message similar to
+  `MmuMapMsg` is open.
+- **VC BW split default** — 50/50 vs. weighted (e.g. 80/20). Exposed in
+  `ccl.yaml`; default value TBD.
+- **Direction count** — 4 (N/S/E/W) is fixed in the first cut; 6
+  (with Up/Down for 3D) or N (variable) is future work.
+- **Multi-tile aggregation primitives** — whether
+  `tl.recv_all` or similar is needed for fan-in.
+- **Round-robin recv fairness** — current weak fairness can starve;
+  strict fairness counter is future work.
+- **Deadlock detection precision** — currently timeout-based; a
+  realtime wait-for graph would enable deterministic detection.
+
+---
+
+## Consequences
+
+### Positive
+
+- PE-to-PE direct communication enables CCL kernels to be written.
+- Host stays minimal (just `launch`), synchronization happens inside
+  the PE → strong compute / comm overlap.
+- VCs eliminate HoL blocking → collective latency is not blocked by
+  compute traffic.
+- Buffer placement and backpressure mode are init-time parameters →
+  easy to benchmark.
+- Four-direction logical neighbors → host is free to map
+  ring/mesh/tree algorithms.
+
+### Negative
+
+- One new component (PE_IPCQ) and a redesigned PE_DMA (VCs).
+- IPCQ memory cost = 8 rings × `slot_size` × `n_slots` per PE.
+- VC arbitration is a first-order approximation; heavy contention
+  scenarios may report slightly optimistic latency vs real HW (D8).
+- Chunk-level interleave makes PE_DMA implementation more complex.
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `topology.yaml` | Add `pe_ipcq` to `pe_template`, plus the IPCQ ↔ DMA / CPU / TCM edges. |
+| `components.yaml` | Register `pe_ipcq_v1`. |
+| `src/kernbench/topology/builder.py` | Wire the IPCQ chain into PE-internal edges. |
+| `src/kernbench/components/builtin/pe_ipcq.py` | New. |
+| `src/kernbench/components/builtin/pe_dma.py` | Add VCs, handle `IpcqDmaToken`. |
+| `src/kernbench/common/pe_commands.py` | `IpcqSendCmd`, `IpcqRecvCmd`, `IpcqDmaToken`. |
+| `src/kernbench/triton_emu/tl_context.py` | `tl.send` / `tl.recv` API. |
+| `src/kernbench/runtime_api/distributed.py` | Eager IPCQ install in `AhbmCCLBackend.__init__`. |
+| `src/kernbench/runtime_api/kernel.py` | `IpcqInitMsg` definition. |
+| `src/kernbench/ccl/__init__.py` | New CCL package. |
+| `src/kernbench/ccl/topologies.py` | Builtin topology generators + `resolve_topology()`. |
+| `src/kernbench/ccl/helpers.py` | Algorithm-author helpers (`chunked`, `ring_step`, `tree_step`). |
+| `src/kernbench/ccl/testing.py` | Mock CCL runtime (`run_kernel_in_mock`). |
+| `src/kernbench/ccl/algorithms/*.py` | Algorithm modules (kernel + `kernel_args` + optional `neighbors`). |
+| `ccl.yaml` | Algorithm metadata + IPCQ defaults. |
+| `tests/test_pe_ipcq.py` | PE_IPCQ unit tests. |
+| `tests/test_pe_dma_vc.py` | PE_DMA VC tests. |
+| `tests/test_ipcq_e2e.py` | end-to-end send/recv tests. |
+| `tests/test_ccl_topologies.py` | Builtin topology generator tests. |
+| `tests/test_ccl_allreduce_matrix.py` | Unified bench × algorithm matrix. |
@@ -0,0 +1,990 @@
+# ADR-0024: SIP-level TP Launcher — rank = SIP (host-driven dispatch)
+
+## Status
+
+Proposed (Revision 8 — Hierarchical content split out to ADR-0029)
+
+## Context
+
+### 목표
+
+`torch.distributed` collective 호출의 참여 단위(rank)를 **SIP**(device)
+경계에 맞춘다. 실제 PyTorch DDP/TP 스크립트와 **호스트 레벨에서 구분 없이**
+읽히는 bench 코드를 목표로 한다.
+
+real PyTorch와 비교:
+
+| 차원 | real PyTorch | KernBench (이 ADR 이후) |
+|---|---|---|
+| 프로세스 모델 | N개 프로세스, 각 1 GPU | 1 프로세스, N greenlet, 각 1 SIP |
+| `get_rank()` | `RANK` env var | greenlet-local 레지스트리 |
+| `get_world_size()` | `WORLD_SIZE` env var | topology의 SIP 수 |
+| `torch.cuda.set_device(r)` (real) / `torch.ahbm.set_device(r)` (KernBench) | rank → GPU | rank → SIP |
+| `mp.spawn` | OS 프로세스 fork | greenlet fan-out |
+
+### 설계 원칙 — 공개 API의 추상화, 내부는 기존 path 활용
+
+**공개 API (bench worker) 수준의 추상화**:
+```
+rank = SIP
+DPPolicy = intra-device (cube × PE) 분산만
+dist.all_reduce, torch.ahbm.set_device, mp.spawn 등 PyTorch-style 표면
+```
+
+**Framework 내부 구현**:
+```
+build_install_plans (host): topology + mapper + algorithm → SipInstallPlan
+  ↓
+backend (host): plan의 per-PE spec을 engine.submit으로 IpcqInitMsg 디스패치
+  ↓
+engine: 기존 PE-scoped routing (MmuMapMsg 등과 동일 경로)
+  ↓
+PE_IPCQ: 자체 message loop에서 IpcqInitMsg 처리 (기존 capability)
+```
+
+**핵심**: 새 message 타입이나 IO_CPU 확장 없음. 기존 engine routing과 기존
+`IpcqInitMsg` 타입을 그대로 사용. 기존의 "sideband direct call" 우회만
+제거하여 convention 일원화.
+
+### 현재 상태
+
+- `DistributedContext` facade 존재
+- `init_process_group("ahbm")` → `AhbmCCLBackend`가 `ctx.install_ipcq` 호출
+  → `ccl/install.py`가 **sideband direct call** (`pe_ipcq._install_neighbors`)로
+  PE_IPCQ에 neighbor table 설치
+- `get_rank()` 항상 `0` (single-driver)
+- `get_world_size()` fallback: 총 PE 수 (rank = PE)
+- `benches/ccl_allreduce.py`: `worker(rank=0, world_size=total_PEs)` 1회 호출
+
+### 풀어야 할 문제
+
+1. **공개 API에서 rank = SIP** — bench worker가 PE 개념을 알지 않도록.
+2. **Multi-worker 실행** — N개 rank가 독립 worker 코드 실행. 1 프로세스 제약
+   하에서 greenlet + barrier 동기화.
+3. **Cross-rank collective submit 동기화** — 첫 rank가 혼자 wait하면 peer 부재로
+   SimPy deadlock. 모든 rank submit 후 drain 보장.
+4. **기존 sideband install 제거** — IpcqInitMsg를 engine.submit으로 일원화.
+   MmuMapMsg 등 다른 control-plane 메시지와 동일 패턴.
+5. **Algorithm / mapper / validator 분리** — 알고리즘 모듈은 kernel 코드만
+   담고, topology / mapping / validation은 registry + 선언.
+
+### Non-problem (이 ADR 밖)
+
+- IPCQ direction addressing fix → **ADR-0025**
+- `DPPolicy.sip`/`num_sips` 제거 → **ADR-0026**
+- Megatron-style TP → **ADR-0027**
+- DTensor → **ADR-0028 (future)**
+- **IO_CPU를 SIP-level control-plane 단일 endpoint로 승격**: 이 ADR에서는
+  invariant으로 채택하지 않음. 현재 KernBench에 해당 원칙이 없고, 단독으로
+  도입하기엔 정당화가 약함. 미래에 control-plane latency 모델링 정밀도 요구가
+  생기면 별도 ADR.
+
+### TODO (이 ADR 구현 이후)
+
+- Tensor Parallelism (ADR-0027)
+- Hierarchical all-reduce 알고리즘 설계 (ADR-0029) — 본 ADR의 mapper /
+  validator registry 인프라를 활용하는 첫 사례
+
+---
+
+## Decision
+
+### D1. rank = SIP (world_size 해석)
+
+```python
+def _resolve_world_size(self) -> int:
+    if "world_size" in self._merged:
+        return int(self._merged["world_size"])
+    defaults = self._cfg_all.get("defaults", {})
+    if "world_size" in defaults:
+        return int(defaults["world_size"])
+    spec = self.ctx.spec or {}
+    return int(spec.get("system", {}).get("sips", {}).get("count", 1))
+```
+
+우선순위: 알고리즘 override > defaults override > SIP count. `ccl.yaml`
+override는 legacy "rank = PE" 테스트 경로로 유지.
+
+### D2. Install 경로 — engine.submit 일원화
+
+`ccl/install.py`의 sideband direct call을 제거하고, `IpcqInitMsg`를
+`engine.submit`으로 보낸다. MmuMapMsg / MemoryWriteMsg 등이 이미 동일 패턴.
+
+```python
+# Backend (AhbmCCLBackend.__init__ 또는 init_process_group 시점)
+from kernbench.ccl.install_plan import build_install_plans
+
+plans = build_install_plans(
+    world_size=self._world_size,
+    algorithm=self._merged["algorithm"],
+    algorithm_config=self._merged,
+    spec=self.ctx.spec,
+)
+self._plans = plans
+
+# Each PE_IPCQ가 자기 neighbor table을 받도록 engine 경유 submit
+handles = []
+for plan in plans:
+    for pe_install in plan.pe_installs:
+        h = self.ctx.submit(IpcqInitMsg(
+            correlation_id=self.ctx.correlation_id,
+            request_id=f"ipcq_init_s{plan.sip}c{pe_install.cube}p{pe_install.pe}",
+            target_sips=(plan.sip,),
+            target_cubes=(pe_install.cube,),
+            target_pe=pe_install.pe,
+            entries=pe_install.neighbors,
+            buffer_kind=plan.buffer_kind,
+            n_slots=plan.n_slots,
+            slot_size=plan.slot_size,
+            # ... (기존 IpcqInitMsg 필드)
+        ))
+        handles.append(h)
+
+# Eager install — init_process_group이 반환하기 전에 완료 보장
+for h in handles:
+    self.ctx.wait(h)
+```
+
+**PE_IPCQ 컴포넌트**는 이미 `IpcqInitMsg`를 main loop에서 처리 (`pe_ipcq.py`
+라인 145-147). 변경 불필요. 유일한 차이는 "message가 sideband Python call이
+아니라 engine queue를 거쳐 도착한다"는 점.
+
+**Correctness invariant (equivalence)**: `init_process_group()`은 모든
+install handle을 `wait()`한 후 반환하므로 launch-before-install 문제는
+구조적으로 없다. 남는 correctness 질문은 단 하나:
+
+> Engine-routed `IpcqInitMsg` 처리가 기존 sideband
+> `pe_ipcq._install_neighbors(msg)` 호출과 **동일한 최종 PE_IPCQ 상태**를
+> 생성하는가.
+
+검증 포인트 (T3 참고):
+
+1. **State equivalence**: `_install_neighbors()` 내부 상태 전이가 engine
+   dispatch path에서도 동일하게 일어나 최종 PE_IPCQ state
+   (`_queue_pairs`, `_installed`, `_credit_inbox` 등)가 일치.
+
+2. **Sideband-only side effect 부재**: 최종 상태가 sideband path에서만 일어나던
+   부수 효과에 의존하지 않음. 반대로 engine path 고유의 부수 효과 (예:
+   engine.submit이 설정하는 request_id / correlation tracking)도 install
+   semantics를 왜곡하지 않음.
+
+3. **Ordering independence**: 서로 다른 PE들의 install message가 engine
+   큐에서 임의 순서로 처리되어도 최종 상태가 동일. 즉 install은 **PE별
+   독립 연산**이어야 하고, cross-PE 순서 의존성이 있으면 안 됨.
+
+4. **Idempotency**: 동일 PE에 대해 `IpcqInitMsg`가 두 번 도착하면? 현재
+   설계 전제는 "per-PE 단 한 번 install". 중복 install 시 동작은 정의되지
+   않음. 보수적 정책:
+   - 최초 install 시 `_installed = True`로 전이
+   - 이후 중복 install msg는 **에러** (raise) 또는 **silent idempotent**
+     (no-op) 둘 중 하나로 명시
+   - Recommend: **raise** (명시적 에러 → 버그 조기 검출). T3에 duplicate
+     install 케이스 추가.
+
+5. **Partial install visibility**: 일부 PE만 install 완료된 중간 상태가
+   외부에 observable한가? 현재 구조에서는 `init_process_group()`의 eager
+   wait-all이 barrier 역할을 하므로 partial state는 bench 코드에 노출되지
+   않음. 단, debugging / introspection API는 중간 상태를 볼 수 있음 (문제
+   아님, 문서화만).
+
+**Timing 영향**: Engine-routed install은 `init_process_group()`이 SimPy 시간을
+소비하게 만든다. 기존 sideband install은 사실상 zero-cost. ADR 계약:
+
+> Benchmarks must not rely on zero-cost initialization.
+> `init_process_group()` consumes simulated time proportional to the number
+> of participating PEs × per-PE install latency. First collective call
+> starts at a well-defined but non-zero sim time.
+
+### D3. Launch 경로 — non-CCL 커널과 동일 primitive
+
+**CCL 커널은 non-CCL 커널과 동일한 `KernelLaunchMsg` submission path를 쓴다.**
+Engine 내부의 IO_CPU/M_CPU transit 같은 것은 **기존 구현 세부이지 CCL-specific
+장치가 아님**. Backend는 plan의 `participating_pes` 목록을 돌면서 `KernelLaunchMsg`를
+submit할 뿐이다. 새 메시지 타입 없음, 새 라우팅 경로 없음.
+
+```python
+# AhbmCCLBackend.all_reduce
+def all_reduce(self, tensor, op="sum"):
+    if op != "sum":
+        raise NotImplementedError(...)
+    if tensor._handle is None or not tensor._handle.shards:
+        raise RuntimeError(...)
+
+    # Validator — global handle 기준 (D8)
+    validator_name = self._merged.get("validator")
+    if validator_name:
+        resolve_validator(validator_name)(tensor._handle, self._world_size, self.ctx.spec)
+
+    rank = self.ctx.distributed.get_rank()
+    plan = self._plans[rank]
+    tensor_view = _tensor_slice_for_sip(tensor._handle, plan.sip)
+
+    # Plan에서 kernel args 계산 (host-side)
+    import importlib
+    mod = importlib.import_module(plan.kernel_module)
+    n_elem = tensor_view.shards[0].nbytes // tensor.itemsize
+    kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
+                             **plan.kernel_config)
+
+    def _submit():
+        out = []
+        for (cube, pe) in plan.participating_pes:
+            h = self.ctx.submit(KernelLaunchMsg(
+                correlation_id=self.ctx.correlation_id,
+                request_id=f"allreduce_r{rank}_c{cube}p{pe}",
+                kernel_ref=KernelRef(name=plan.algorithm_name, kind="builtin"),
+                args=(_tensor_arg_for_pe(tensor_view, cube, pe), *kargs),
+                target_sips=(plan.sip,),
+                target_cubes=(cube,),
+                target_pe=pe,
+            ))
+            out.append(h)
+        return out
+
+    self._barrier.submit_and_drain(self.ctx, rank, _submit)
+```
+
+### D4. Algorithm ABI — 얇게 + 명시적 arg 계약
+
+각 알고리즘 모듈은 **kernel + kernel_args만 필수**.
+
+```python
+# src/kernbench/ccl/algorithms/ring_allreduce.py
+def kernel(t_ptr, n_elem, world_size, tl):
+    """PE-side kernel code.
+
+    Signature convention: first positional arg is the tensor pointer
+    (per-PE slice), subsequent positional args are whatever
+    kernel_args() returns. `tl` is injected by the TLContext runtime.
+    """
+
+def kernel_args(*, n_elem: int, world_size: int, **kw) -> tuple:
+    """Return the tuple of non-tensor positional args.
+
+    Signature contract:
+    - Called keyword-only with n_elem and world_size plus kernel_config.
+    - Returns a tuple (possibly empty) of scalar / metadata args.
+    - The backend constructs the final KernelLaunchMsg.args as:
+          (per_pe_tensor_arg, *kernel_args(...))
+      where per_pe_tensor_arg is a TensorArg containing only the shards
+      local to the receiving PE (derived from tensor_view).
+    """
+    return (n_elem, world_size)
+```
+
+**Arg assembly in backend (reference)**:
+
+```python
+# AhbmCCLBackend.all_reduce (D3에서 발췌)
+kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
+                         **plan.kernel_config)
+for (cube, pe) in plan.participating_pes:
+    pe_tensor_arg = _tensor_arg_for_pe(tensor_view, cube, pe)
+    self.ctx.submit(KernelLaunchMsg(
+        args=(pe_tensor_arg, *kargs),       # tensor first, then kernel_args return
+        target_sips=(plan.sip,),
+        target_cubes=(cube,),
+        target_pe=pe,
+        ...
+    ))
+```
+
+**ccl.yaml**에서 선언적 metadata:
+
+```yaml
+algorithms:
+  ring_allreduce_tcm:
+    module: kernbench.ccl.algorithms.ring_allreduce
+    topology: ring_1d             # kernbench/ccl/topologies.py
+    mapper: leader_only           # kernbench/ccl/mappers.py (신규)
+    validator: single_shard_per_rank   # kernbench/ccl/validators.py (신규)
+    buffer_kind: tcm
+    n_elem: 8
+```
+
+- `topology` (필수)
+- `mapper` (선택, default `"leader_only"`)
+- `validator` (선택)
+
+알고리즘 모듈 자체에는 mapper/validator/participating_pes/neighbor
+생성기가 **들어가지 않음**.
+
+### D5. Mapper + validator — registry key **또는** import path
+
+Host-side framework가 built-in registry 제공. 커스텀 확장은 dot-import path.
+
+```python
+# src/kernbench/ccl/mappers.py (new)
+Mapper = Callable[[dict, int], list[tuple[int, int]]]
+
+def leader_only(spec, rank):
+    """Single leader PE per SIP. Ring/tree/mesh용."""
+    return [(0, 0)]
+
+def all_pes(spec, rank):
+    """Every PE in the SIP. 알고리즘이 intra-SIP 전체 PE를 참여시킬 때 사용
+    (e.g. intra-SIP reduction, intra-SIP broadcast, hierarchical collective
+    의 낮은 레벨 등)."""
+    cm = spec["sip"]["cube_mesh"]
+    pl = spec["cube"]["pe_layout"]
+    n_cubes = cm["w"] * cm["h"]
+    n_pes = pl["pe_per_corner"] * len(pl["corners"])
+    return [(c, p) for c in range(n_cubes) for p in range(n_pes)]
+
+MAPPER_REGISTRY = {"leader_only": leader_only, "all_pes": all_pes}
+
+def resolve_mapper(key_or_path: str) -> Mapper:
+    if key_or_path in MAPPER_REGISTRY:
+        return MAPPER_REGISTRY[key_or_path]
+    if "." in key_or_path:
+        import importlib
+        mod_path, fn_name = key_or_path.rsplit(".", 1)
+        return getattr(importlib.import_module(mod_path), fn_name)
+    raise ValueError(f"unknown mapper: {key_or_path!r}")
+```
+
+Validator도 동일 패턴 (`src/kernbench/ccl/validators.py`). 입력은 **global
+TensorHandle** (D8 참고).
+
+### D6. Host-side install plan builder
+
+```python
+# src/kernbench/ccl/install_plan.py (new; 기존 install.py의 재구성)
+from dataclasses import dataclass
+from typing import Any, Mapping
+
+@dataclass(frozen=True)
+class NeighborTableEntry:
+    direction: str
+    peer_direction: str       # ADR-0025 (NOTE: ADR-0025 Rev 2는 이 필드를 drop — 구현 전 동기화 필요)
+    peer_sip: int
+    peer_cube: int
+    peer_pe: int
+    rx_base_pa: int
+    # ... 기타 IPCQ 설정 ...
+
+@dataclass(frozen=True)
+class PeInstallSpec:
+    cube: int
+    pe: int
+    neighbors: tuple[NeighborTableEntry, ...]
+
+@dataclass(frozen=True)
+class SipInstallPlan:
+    algorithm_name: str                  # human-readable ("ring_allreduce_tcm")
+    sip: int
+    rank: int
+    world_size: int
+    pe_installs: tuple[PeInstallSpec, ...]     # per-PE neighbor tables
+    buffer_kind: str
+    n_slots: int
+    slot_size: int
+    kernel_module: str
+    participating_pes: tuple[tuple[int, int], ...]
+    kernel_config: Mapping[str, Any]
+
+
+def build_install_plans(
+    world_size: int,
+    algorithm: str,
+    algorithm_config: dict,
+    spec: dict,
+) -> list[SipInstallPlan]:
+    """Compose topology + mapper + algorithm into per-SIP plan list."""
+    topo_fn = _resolve_topology(algorithm_config["topology"])
+    mapper = resolve_mapper(algorithm_config.get("mapper", "leader_only"))
+
+    # kernel_config: launch 시 kernel_args에 전달할 algorithm-specific params
+    kernel_config = {
+        k: v for k, v in algorithm_config.items()
+        if k in {"n_elem", "reduce_op", "chunk_size"} or k.startswith("kernel_")
+    }
+
+    plans = []
+    for rank in range(world_size):
+        sip = rank  # identity mapping (non-identity는 open question)
+        pes = mapper(spec, rank)
+        pe_installs = _build_pe_installs(
+            rank=rank, world_size=world_size, sip=sip,
+            pes=pes, topo_fn=topo_fn, algorithm_config=algorithm_config, spec=spec,
+        )
+        plans.append(SipInstallPlan(
+            algorithm_name=algorithm,
+            sip=sip, rank=rank, world_size=world_size,
+            pe_installs=pe_installs,
+            buffer_kind=algorithm_config["buffer_kind"],
+            n_slots=algorithm_config["n_slots"],
+            slot_size=algorithm_config["slot_size"],
+            kernel_module=algorithm_config["module"],
+            participating_pes=tuple(pes),
+            kernel_config=kernel_config,
+        ))
+    return plans
+```
+
+`_build_pe_installs`는 기존 `ccl/install.py`의 neighbor 계산 로직을 재활용
+(ADR-0025의 `reverse_direction` 개선 반영).
+
+**Multi-PE 매퍼와 neighbor 생성 책임**: mapper가 SIP 내 여러 PE를 반환하는
+경우 (`all_pes` 등), PE-level neighbor 그래프는 `_build_pe_installs` 내부에서
+형성된다. 즉 topology 모듈은 rank-level 관계만 제공하고, PE-level 연결은
+builder에서 풀어낸다. 복잡한 multi-level 패턴을 쓰는 알고리즘은 이 책임
+분산이 관리 부담이 될 수 있음 — 관련 논의는 ADR-0029 참고.
+
+### D7. Epoch-based collective barrier
+
+Cross-rank submit 동기화. 각 collective 호출은 독립 epoch. 같은 rank의
+중복 join은 즉시 에러.
+
+```python
+# src/kernbench/runtime_api/distributed.py
+@dataclass
+class _EpochState:
+    participants: set[int] = field(default_factory=set)
+    pending: list = field(default_factory=list)
+    drained: bool = False
+    returned: int = 0
+
+
+class _CollectiveBarrier:
+    """Epoch-based barrier.
+
+    Contract:
+    - Each call joins the earliest non-drained epoch.
+    - Each rank may join a given epoch at most once. Duplicate join raises.
+    - Last arriver (participants == world_size) performs drain and advances
+      _next_epoch. Earlier arrivers yield and re-check drained on resume.
+    - Epoch state is GC'd when returned == world_size (success path).
+    - On failure paths, residual state is acceptable; reset() clears it.
+    """
+
+    def __init__(self, world_size: int):
+        self._world_size = world_size
+        self._next_epoch = 0
+        self._state: dict[int, _EpochState] = {}
+
+    def submit_and_drain(self, ctx, rank: int, submit_fn) -> None:
+        epoch = self._next_epoch
+        state = self._state.setdefault(epoch, _EpochState())
+
+        if rank in state.participants:
+            raise RuntimeError(
+                f"rank {rank} attempted duplicate join to epoch {epoch}"
+            )
+        state.participants.add(rank)
+
+        handles = submit_fn()
+        state.pending.extend(handles)
+
+        is_last = len(state.participants) >= self._world_size
+
+        if is_last:
+            for h in state.pending:
+                ctx.wait(h)
+            state.drained = True
+            self._next_epoch = epoch + 1
+        else:
+            from greenlet import getcurrent
+            g = getcurrent()
+            if g.parent is None:
+                raise RuntimeError("barrier requires a bound worker greenlet")
+            while not state.drained:
+                g.parent.switch()
+
+        state.returned += 1
+        if state.returned >= self._world_size:
+            self._state.pop(epoch, None)
+
+    def reset(self) -> None:
+        """Explicit cleanup on spawn exception unwinding."""
+        self._state.clear()
+        self._next_epoch = 0
+```
+
+### D8. Per-rank tensor view + validator contract
+
+**Validator** (host-side, pre-slice, global handle 기준):
+
+```python
+# src/kernbench/ccl/validators.py
+Validator = Callable[[TensorHandle, int, dict], None]
+
+def single_shard_per_rank(handle, world_size, spec):
+    """Ring 계열: 정확히 world_size개 shard, SIP당 1개."""
+    if len(handle.shards) != world_size:
+        raise ValueError(...)
+    per_sip = {}
+    for s in handle.shards:
+        per_sip[s.sip] = per_sip.get(s.sip, 0) + 1
+    if any(c != 1 for c in per_sip.values()):
+        raise ValueError(...)
+
+def multi_pe_sip_local(handle, world_size, spec):
+    """Multi-PE per SIP layout: 각 SIP에 intra-SIP PE 수만큼 shard 존재.
+    Intra-SIP 전체 PE를 참여시키는 알고리즘이 사용."""
+    cm = spec["sip"]["cube_mesh"]
+    pl = spec["cube"]["pe_layout"]
+    per_sip = cm["w"] * cm["h"] * pl["pe_per_corner"] * len(pl["corners"])
+    if len(handle.shards) != world_size * per_sip:
+        raise ValueError(...)
+
+VALIDATOR_REGISTRY = {...}
+def resolve_validator(key_or_path): ...
+```
+
+Validator는 world 전체의 shard layout 불변량을 본다. Per-rank view는
+backend가 validator 호출 **후** `_tensor_slice_for_sip`로 생성.
+
+**Per-rank tensor view** — SIP-local slice:
+
+```python
+def _tensor_slice_for_sip(handle, sip) -> TensorArg:
+    sip_shards = [s for s in handle.shards if s.sip == sip]
+    if not sip_shards:
+        raise RuntimeError(f"tensor has no shards on SIP {sip}")
+    # Deterministic ordering contract: (cube, pe, offset_bytes) ascending.
+    # Multi-PE mappers (hierarchical 등) rely on this ordering to align
+    # per-PE tensor arg construction with participating_pes enumeration.
+    sip_shards.sort(key=lambda s: (s.cube, s.pe, s.offset_bytes))
+    min_offset = min(s.offset_bytes for s in sip_shards)
+    local_va_base = handle.va_base + min_offset if handle.va_base else 0
+    return TensorArg(
+        shards=tuple(TensorArgShard(...) for s in sip_shards),
+        va_base=local_va_base,
+    )
+```
+
+**Ordering invariant**: slice의 shard는 `(cube, pe, offset_bytes)` 오름차순.
+Backend가 `participating_pes`를 iterate하며 `_tensor_arg_for_pe(view, cube, pe)`를
+구성할 때, 결정론적 ordering을 전제할 수 있다. 특히 `all_pes` mapper +
+hierarchical 알고리즘이 per-PE slice 조합을 순서 의존적으로 해석하는 경우에
+중요.
+
+### D9. Greenlet-local rank registry (+ debug warning)
+
+```python
+class DistributedContext:
+    def __init__(self):
+        self._backend = None
+        self._rank_by_greenlet: dict = {}
+
+    def _bind_rank(self, g, rank: int) -> None:
+        self._rank_by_greenlet[g] = int(rank)
+
+    def get_rank(self) -> int:
+        self._ensure_initialized()
+        from greenlet import getcurrent
+        g = getcurrent()
+        if g not in self._rank_by_greenlet:
+            if os.environ.get("KERNBENCH_DEBUG"):
+                warnings.warn(
+                    "get_rank() called outside a bound greenlet — returning 0. "
+                    "Likely a bug unless running single-driver."
+                )
+            return 0
+        return int(self._rank_by_greenlet[g])
+```
+
+### D10. `torch.ahbm.set_device(rank)` — SIP 바인딩
+
+KernBench 백엔드 이름은 `ahbm` (ADR-0023 D10). Real PyTorch는
+`torch.cuda.set_device(r)`이지만 우리는 CUDA가 아니므로 honestly-named
+namespace를 사용한다.
+
+```python
+class _AhbmNamespace:
+    """torch.ahbm — per-greenlet SIP device binding.
+
+    Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. Since
+    KernBench's backend is 'ahbm' (not CUDA), we expose the equivalent
+    API under ``torch.ahbm`` to avoid pretending to be a CUDA runtime.
+    """
+
+    def __init__(self):
+        self._device_by_greenlet: dict = {}
+
+    def set_device(self, device: int) -> None:
+        from greenlet import getcurrent
+        self._device_by_greenlet[getcurrent()] = int(device)
+
+    def current_device(self) -> int | None:
+        from greenlet import getcurrent
+        return self._device_by_greenlet.get(getcurrent())
+
+# Attached to RuntimeContext as `self.ahbm = _AhbmNamespace()`.
+# Bench code: `torch.ahbm.set_device(rank)` mirrors `torch.cuda.set_device`.
+```
+
+**PyTorch 2.x style 병행 지원**: 최신 PyTorch는 device-agnostic한
+`torch.accelerator` 네임스페이스를 지향 (`torch.accelerator.set_device_index(r)`,
+`torch.accelerator.current_device_index()`). Device vendor에 종속되지 않는
+코드를 쓰려는 사용자를 위해 KernBench도 이 표면을 병행 지원한다.
+
+```python
+class _AcceleratorNamespace:
+    """torch.accelerator — device-agnostic API (PyTorch 2.x style).
+
+    Aliases torch.ahbm for bench code that prefers device-neutral idiom:
+        torch.accelerator.set_device_index(rank)
+        torch.accelerator.current_device_index()
+    """
+
+    def __init__(self, ahbm: _AhbmNamespace):
+        self._ahbm = ahbm
+
+    def set_device_index(self, device: int) -> None:
+        self._ahbm.set_device(device)
+
+    def current_device_index(self) -> int | None:
+        return self._ahbm.current_device()
+
+# RuntimeContext
+self.ahbm = _AhbmNamespace()
+self.accelerator = _AcceleratorNamespace(self.ahbm)   # alias
+```
+
+Bench 작성자는 다음 중 하나를 선택 — `torch.accelerator`는 `torch.ahbm`에 위임하므로
+둘 다 내부적으로 같은 per-greenlet 레지스트리를 공유:
+
+```python
+torch.ahbm.set_device(rank)                   # KernBench-native, explicit backend
+torch.accelerator.set_device_index(rank)      # PyTorch 2.x device-agnostic
+```
+
+### D11. Tensor placement = structural (sip, cube, pe) 좌표
+
+`resolve_dp_policy`가 `target_sip`을 직접 받아 구조적 좌표로 placement 생성.
+세부는 ADR-0026.
+
+```python
+# RuntimeContext._create_tensor
+current_sip = self.ahbm.current_device()          # (D10 naming)
+if current_sip is None:
+    current_sip = 0  # single-driver fallback (D9와 일관)
+placement = resolve_dp_policy(
+    dp, shape=shape_2d, itemsize=itemsize,
+    num_pe=eff_num_pe, num_cubes=eff_num_cubes,
+    target_sip=current_sip,
+)
+```
+
+Post-hoc `pe_index` shifting 제거 — ShardSpec이 `(sip, cube, pe)` 구조적
+좌표 보유.
+
+### D12. `torch.multiprocessing.spawn`-compat surface
+
+Bench 작성자 표면은 real PyTorch `mp.spawn`과 동일:
+
+```python
+# src/kernbench/runtime_api/multiprocessing.py (new)
+def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method="spawn"):
+    """Drop-in for torch.multiprocessing.spawn.
+    Internal: greenlet fan-out + epoch-barrier sync + exception propagation.
+    """
+    ...
+
+# torch namespace에 부착
+torch.multiprocessing = SimpleNamespace(spawn=spawn)
+```
+
+Bench:
+
+```python
+import torch.multiprocessing as mp
+mp.spawn(worker, nprocs=world_size, args=(world_size, torch))
+```
+
+### D13. Scheduler + exception handling
+
+```python
+def spawn(fn, args, nprocs, ...):
+    dist = torch.distributed
+    gs: list[greenlet] = []
+    errors: dict[int, Exception] = {}
+
+    for rank in range(nprocs):
+        def _entry(r=rank):
+            try:
+                fn(r, *args)
+            except Exception as e:
+                errors[r] = e
+                raise
+        g = greenlet(_entry)
+        dist._bind_rank(g, rank)
+        gs.append(g)
+
+    try:
+        while True:
+            alive = [g for g in gs if not g.dead]
+            if not alive:
+                break
+            for g in alive:
+                if not g.dead:
+                    g.switch()
+    except Exception as outer:
+        for other in gs:
+            if not other.dead:
+                try:
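+                    # GreenletExit (throw()의 기본값)은 parent로 전파되지 않는다.
+                    # SystemExit는 parent로 전파되어 아래 `except Exception`을
+                    # 빠져나가므로 cleanup / SpawnException 경로가 누락된다.
+                    other.throw()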
+                except Exception:
+                    pass
+        # Epoch barrier state 명시적 cleanup
+        backend = getattr(dist, "_backend", None)
+        if backend is not None and hasattr(backend, "_barrier"):
+            backend._barrier.reset()
+        raise SpawnException(errors) from outer
+```
+
+**Scheduler contract**:
+- Deterministic round-robin over insertion order (rank 0, 1, ..., N-1).
+- 동기화 지점은 epoch barrier (D7)만. Scheduler 순서에 의존하는 correctness 없음.
+- 예외 발생 시 다른 greenlet 강제 종료 + `SpawnException` 전파.
+
+**Starvation guideline**:
+- 일반적으로 collective barrier가 workers를 동기화. 큰 편차 없음.
+- 극단적 non-collective 루프 대비 cooperative yield 제공:
+  `torch.distributed.cooperative_yield()`.
+
+### D14. Backward compatibility
+
+1. **Single-driver 호출**: `get_rank()` 0 반환 (D9).
+2. **`ccl.yaml` world_size override**: D1 fallback 우회 — legacy "rank = PE"
+   테스트 경로로 사용 가능.
+3. **`DPPolicy.sip="column_wise"` 명시**: ADR-0026 scope.
+4. **`install_ipcq()` compatibility wrapper**:
+
+기존 `ccl/install.py`의 `install_ipcq()` API는 곧바로 제거하지 않는다.
+Thin compatibility wrapper로 남겨 기존 직접 호출자가 점진적으로 migration할
+수 있게 한다.
+
+```python
+# src/kernbench/ccl/install.py (after this ADR)
+def install_ipcq(engine, spec, merged, *, algo_module=None, rank_to_pe=None):
+    """DEPRECATED: legacy host-side PE installer.
+
+    Internally delegates to build_install_plans + engine-routed IpcqInitMsg.
+    Use dist.init_process_group() instead.
+    """
+    from kernbench.ccl.install_plan import build_install_plans
+    import warnings
+    warnings.warn(
+        "install_ipcq() is deprecated; use dist.init_process_group()",
+        DeprecationWarning, stacklevel=2,
+    )
+    plans = build_install_plans(
+        world_size=merged.get("world_size", 1),
+        algorithm=merged["algorithm"],
+        algorithm_config=merged,
+        spec=spec,
+    )
+    handles = []
+    for plan in plans:
+        for pe_install in plan.pe_installs:
+            h = engine.submit(IpcqInitMsg(
+                target_sips=(plan.sip,),
+                target_cubes=(pe_install.cube,),
+                target_pe=pe_install.pe,
+                entries=pe_install.neighbors,
+                buffer_kind=plan.buffer_kind,
+                n_slots=plan.n_slots,
+                slot_size=plan.slot_size,
+            ))
+            handles.append(h)
+    for h in handles:
+        engine.wait(h)
+    return {"world_size": merged.get("world_size", 1), "plans": plans}
+```
+
+Migration 스케줄:
+- Phase 1: wrapper로 유지 + DeprecationWarning
+- Phase 2: 직접 호출자 grep-audit → 각각 `dist.init_process_group()` 또는
+  `build_install_plans()` 직접 사용으로 이관
+- Phase 3: wrapper 제거 (별도 cleanup ADR 또는 PR)
+
+---
+
+## Dependencies
+
+- **ADR-0023** (IPCQ): `IpcqInitMsg` 메시지 타입과 PE_IPCQ 핸들링을 그대로
+  활용. Engine-routed submit으로 전환하는 것이 유일한 변경.
+- **ADR-0025** (IPCQ direction fix): `_build_pe_installs`의 neighbor 계산이
+  2-rank ring 등에서 정확히 동작하려면 필요.
+- **ADR-0003 / 0016** (IO_CPU): IO_CPU는 기존 transit 역할 그대로. 본 ADR에서
+  IO_CPU 역할 변경 없음.
+
+---
+
+## Non-goals
+
+- **IPCQ protocol 수정**: ADR-0023 유지.
+- **DPPolicy 필드 정리**: ADR-0026.
+- **Megatron-style TP**: ADR-0027.
+- **Multi-node (프로세스 간)**: 단일 프로세스.
+- **IO_CPU SIP control-plane 단일 endpoint 원칙 채택**: 본 ADR 범위 밖. 현재
+  KernBench에 이 원칙이 없고, 도입은 별도 ADR.
+- **Hierarchical all-reduce 알고리즘 설계**: ADR-0029. 본 ADR은 그 알고리즘이
+  쓸 framework 인프라 (`all_pes` mapper, `multi_pe_sip_local` validator,
+  registry 확장점)만 제공.
+
+---
+
+## Open questions
+
+### 🔴 Critical — 구현 blocker 가능성 (integration 전 반드시 검증)
+
+- **`IpcqInitMsg`의 engine routing — primary implementation risk**: 현재
+  sideband만 쓰여서 engine routing path가 실사용 검증되지 않은 상태. **본
+  ADR 전체가 "engine routing이 동작한다"는 가정 위에 서 있다**. 이것이
+  실제로 안 되면 D2, D14, T3 등이 전부 영향 받음. 반드시 **ADR 구현 착수
+  전 스파이크 검증**:
+  - `engine.submit(IpcqInitMsg(target_sips=..., target_cubes=..., target_pe=...))`
+    가 PE_IPCQ로 정확히 배달되는지 (기존 `MmuMapMsg` / `MemoryWriteMsg` 라우팅
+    패턴과 비교)
+  - 미지원 시 minor hook: engine의 message-type → component-kind 매핑 테이블에
+    `IpcqInitMsg → "pe_ipcq"` 등록 (localized change, topology builder /
+    message schema 영향 없음)
+  - 결과에 따라 D2 채택 여부가 달라질 수 있음 — 만약 routing 불가 시 sideband
+    path 유지로 fallback 후 본 ADR 범위 재조정
+
+- **Engine-routed install vs sideband equivalence** (D2 검증점 1-5): T3의
+  equivalence test가 실제 동작하는지 스파이크. 특히 ordering independence와
+  idempotency는 기존 테스트에 없는 속성이라 신규 검증 필요.
+
+- **`install_ipcq()` 직접 호출자 audit** (구현 전 필수): deprecated wrapper
+  전략은 적절하지만 실제 migration 리스크는 호출자 목록에 따라 다름. 착수 전
+  grep audit:
+  - Pattern: `install_ipcq(` (cwd 전체)
+  - Scope: `src/`, `tests/`, `benches/`, `scripts/`, `src/kernbench/cli/`
+  - 각 호출자의 예상 migration path (→ `dist.init_process_group` vs
+    `build_install_plans` 직접)를 정리한 후 wrapper 도입
+
+### 🟡 Nice-to-have — scope 경계 관련
+
+- **Install timing 허용치**: SimPy 시간 상 install이 몇 ns~us 소모. 기존
+  sideband는 0ns. 기존 테스트가 t=0 시작을 전제로 하는지 확인 (audit 결과에
+  따라 테스트 교정 필요).
+
+- **`IpcqInitMsg` 배치 가능성**: MmuMapMsg처럼 `target_pe="all"` 브로드캐스트
+  는 IPCQ에서는 부적합 (PE마다 neighbor가 다름). 현재는 per-PE 개별 submit.
+  Per-PE payload를 담는 batched IpcqInitMsg 타입은 future optimization.
+
+- **`_rank_to_sip` 매핑**: 현재 identity. Non-trivial mapping 요구 시 별도.
+
+- **Cooperative yield API 위치**: `torch.distributed.cooperative_yield()`로
+  노출 예정. 실제 필요성은 Phase 2 이후 벤치 추가 시 판단.
+
+(PE-level topology 일원화 관련 중장기 방향은 **ADR-0029** 참고 — 복잡한
+multi-level 알고리즘이 driving force가 되는 framework 진화 방향.)
+
+---
+
+## Test strategy
+
+### T1. Launcher infrastructure
+
+`tests/test_ccl_ddp_launcher.py`:
+- `test_world_size_equals_sip_count` — D1
+- `test_ahbm_set_device_binds_tensor_to_single_sip` — D10/D11
+- `test_get_rank_is_greenlet_local` — D9
+- `test_run_spawns_one_worker_per_rank` — D12/D13
+- `test_get_rank_debug_warning` — D9 warning path
+
+### T2. Install plan builder
+
+`tests/test_ccl_install_plan.py` (new):
+- `build_install_plans` — ring_1d × leader_only 조합 (단일 PE per rank)
+- `build_install_plans` — ring_1d × all_pes 조합 (multi-PE per rank; mapper
+  framework 동작 확인, 알고리즘-무관)
+- Mapper / validator registry resolution (built-in key vs import path vs
+  unknown)
+- Import path fallback (`"pkg.mod.fn"` 형식) 동작 검증
+
+### T3. Engine-routed IpcqInitMsg (equivalence — 핵심 검증)
+
+`tests/test_ipcq_init_routing.py` (new):
+- **Routing**: `engine.submit(IpcqInitMsg)` → 지정 PE_IPCQ가 실제 설치 수행
+- **Equivalence**: 동일한 IpcqInitMsg를 (a) sideband `_install_neighbors`
+  직접 호출, (b) engine.submit 두 경로로 보낸 뒤 PE_IPCQ 최종 state
+  (`_queue_pairs`, `_installed` 등) 동일성 비교
+- **Ordering independence**: 서로 다른 PE의 install msg를 engine 큐에 임의
+  순서로 넣어도 최종 state가 동일
+- **Idempotency (duplicate install)**: 동일 PE에 두 번 install msg → 두
+  번째는 에러 raise (policy: explicit error; D2 검증점 4 참고)
+- **Multi-PE 병렬 install**: per-PE submit이 interference 없이 완료
+- **Install 후 send 성공**: 설치 직후 `IpcqSendCmd` 실행해서 neighbor table
+  state가 실제로 유효한지 확인
+
+### T4. Barrier correctness
+
+`tests/test_collective_barrier.py` (new):
+- Single collective 정상
+- 다중 collective 연속 호출 (epoch 격리)
+- 동일 rank의 duplicate join → RuntimeError
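+- Rank 1이 all_reduce 전 종료 → SpawnException + barrier.reset()
+  (NOTE: rank 1이 예외 없이 정상 return하면 나머지 rank는 barrier에서 계속
+  yield만 하므로, D13 scheduler에 no-progress 감지가 있어야 이 테스트가 성립)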
+- Conditional branch 시 모든 rank 도달하면 정상
+
+### T5. E2E
+
+`tests/test_ccl_allreduce_matrix.py`:
+- `ring_tcm` / `ring_hbm` / `ring_sram` @ ws=SIP_count
+
+### T6. 회귀
+
+기존 `test_ccl_framework`, `test_ccl_install`, `test_ccl_topologies`,
+`test_ccl_mock_runtime`, `test_pe_ipcq`, `test_ipcq_e2e`, 기타 non-CCL
+모두 통과.
+
+---
+
+## Consequences
+
+### Positive
+
+- **새 message 타입 0개**: 기존 `IpcqInitMsg` + `KernelLaunchMsg`만으로 구현.
+- **IO_CPU / engine 변경 없음**: 기존 routing 그대로.
+- **Sideband install convention 제거**: MmuMapMsg 등과 동일 패턴으로 일원화.
+- **Plan state stale 문제 소멸**: Plan은 host 단일 소유.
+- **Bench = real PyTorch DDP** (공개 API 관점).
+- **Algorithm ABI 경량**: `kernel` + `kernel_args`만 필수.
+- **Epoch-based barrier**: interleaved collective 안전.
+- **Control/data plane 분리**: data plane(PE_IPCQ)은 ADR-0023 유지, control
+  plane은 host-driven.
+- **장기 확장성**: Megatron TP, DTensor 도입의 기반이 됨.
+
+### Negative
+
+- 신규 모듈: `install_plan.py`, `mappers.py`, `validators.py`,
+  `multiprocessing.py`.
+- Engine이 `IpcqInitMsg`를 엔진-path로 라우팅할 수 있는지 구현 시 확인 필요
+  (minor hook 가능성).
+- Install이 SimPy 시간을 소모 (positive로도 볼 수 있으나, 기존 sideband 시점
+  0ns 전제인 테스트가 있으면 교정 필요).
+
+### Neutral
+
+- IPCQ PE-level protocol (ADR-0023) 불변.
+- `DPPolicy` 필드 변경은 ADR-0026.
+- IO_CPU 역할 불변 (기존 transit 그대로).
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `src/kernbench/runtime_api/distributed.py` | D1/D2/D3/D7/D9: world_size fallback, rank_to_sip, plan 소유, engine-routed install (D2) / launch (D3), epoch barrier |
+| `src/kernbench/runtime_api/context.py` | D10/D11: `_AhbmNamespace` / `_AcceleratorNamespace`, `ctx.ahbm` / `ctx.accelerator`, `_create_tensor`가 `target_sip` 전달 |
+| `src/kernbench/runtime_api/multiprocessing.py` (new) | D12/D13: `spawn` + scheduler + exception |
+| `src/kernbench/ccl/install_plan.py` (new) | D6: `build_install_plans`, `SipInstallPlan`, `PeInstallSpec`, `NeighborTableEntry` |
+| `src/kernbench/ccl/mappers.py` (new) | D5: `leader_only`, `all_pes`, registry + resolver |
+| `src/kernbench/ccl/validators.py` (new) | D5/D8: validator registry + resolver (`single_shard_per_rank`, `multi_pe_sip_local`) |
+| `src/kernbench/ccl/install.py` | Thin deprecated compat wrapper (D14) |
+| `src/kernbench/ccl/algorithms/ring_allreduce.py` | D4: `kernel` + `kernel_args` 유지 (큰 변화 없음) |
+| `src/kernbench/ccl/algorithms/mesh_allreduce.py` | D4 동일 |
+| `src/kernbench/ccl/algorithms/tree_allreduce.py` | D4 동일 |
+| `ccl.yaml` | 각 알고리즘에 `mapper` / `validator` 선언 추가 |
+| `src/kernbench/sim_engine/engine.py` | (If needed) `IpcqInitMsg` → PE_IPCQ 라우팅 확인 hook |
+| `benches/ccl_allreduce.py` | 새 launcher 기반 rewrite |
+| `tests/test_ccl_ddp_launcher.py` (new) | T1 |
+| `tests/test_ccl_install_plan.py` (new) | T2 |
+| `tests/test_ipcq_init_routing.py` (new) | T3 |
+| `tests/test_collective_barrier.py` (new) | T4 |
+| `tests/test_ccl_allreduce_matrix.py` | T5: ws=SIP_count 단순화 |
@@ -0,0 +1,365 @@
+# ADR-0025: IPCQ Direction Addressing — address-based matching
+
+## Status
+
+Proposed (Revision 2 — Address-based matching; peer_direction field dropped)
+
+## Context
+
+### 목표
+
+ADR-0023의 IPCQ protocol에서 **"어느 direction pair를 통한 전송인가"의 식별**을
+topology / dict-order에 의존하지 않고 **주소 기반**으로 일관되게 한다.
+2-rank bidirectional ring (또는 여러 direction이 동일 peer를 가리키는
+topology 일반)에서 정확히 동작하도록 한다.
+
+### 현재 상태 (ADR-0023 D9 구현)
+
+`src/kernbench/components/builtin/pe_ipcq.py` — `_handle_meta_arrival`:
+
+```python
+def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
+    token = msg.token
+    sender_key = (token.src_sip, token.src_cube, token.src_pe)
+    for d, qp in self._queue_pairs.items():
+        p = qp["peer"]
+        if (p.sip, p.cube, p.pe) == sender_key:
+            qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
+            # ... wake recv waiters ...
+            return
+```
+
+`_credit_worker`도 동일한 "sender-coord-first-match" 패턴.
+
+`src/kernbench/ccl/install.py` — `reverse_direction`:
+
+```python
+def reverse_direction(my_rank: int, peer_rank: int) -> str | None:
+    for d, target in neighbor_table[peer_rank].items():
+        if target == my_rank:
+            return d
+    return None
+```
+
+### 드러난 버그 — 2-rank bidirectional ring
+
+`ring_1d(rank, world_size=2)` → `{"E": 1, "W": 1}` (rank 0). 양쪽 방향이 같은 peer.
+
+**버그 1 (install)**:
+- `reverse_direction(0, 1)` → dict order로 "E" 반환 (틀림, "W"가 맞음 — opposite
+  direction convention)
+- rank 0의 E entry가 `peer.rx_base_pa = rx_base(sip1, cube0, pe0, d="E")`로 설정
+- tl.send(E) → data가 sip1의 E-rx buffer로 landing (should be W-rx)
+
+**버그 2 (runtime)**:
+- 설령 install이 올바른 주소로 설정해도, receiver의 `_handle_meta_arrival`이
+  sender 좌표만으로 direction 매칭 → 첫 direction (E) 승
+- peer_head_cache[E] 증가, peer_head_cache[W]는 불변
+- Kernel의 tl.recv(W)는 peer_head_cache[W] 대기 → 영원히 블록 → IpcqDeadlock
+
+### 근본 원인
+
+두 축에서 동일 문제:
+1. **Install-time pairing**: "내 direction과 peer의 어느 direction이 짝인가"
+   결정이 dict-iteration-order에 의존 → 여러 direction이 같은 peer를 가리킬 때
+   fragile
+2. **Runtime identification**: "어느 qp를 업데이트해야 하는가" 결정이 sender
+   좌표만으로 이루어짐 → direction 중복 시 ambiguous
+
+### 해결 방향 — address-based matching
+
+각 PE의 rx buffer는 **direction별로 고유한 주소 range**에 위치 (rx_base_pa +
+direction_idx × bytes_per_direction). 따라서:
+
+- **Runtime**: sender coord 대신 **dst_addr 범위**로 매칭 → unambiguous
+- **Install**: opposite-direction 우선 선택 heuristic (ring / mesh의 자연스러운
+  대칭성)
+- `peer_direction` 같은 이중 메타데이터 불필요 — **주소가 single source of
+  truth**
+
+이 설계는 **PhysAddr 전환 (ADR-0030)과 독립적**으로 작동. 현재 synthetic
+주소든 PhysAddr든 direction별 range 유일성만 지켜지면 동일하게 적용 가능.
+
+---
+
+## Decision
+
+### D1. Install — `reverse_direction` opposite-preference
+
+`src/kernbench/ccl/install.py`:
+
+```python
+_OPPOSITE_DIR = {"E": "W", "W": "E", "N": "S", "S": "N"}
+
+def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
+    """Find peer's direction that reciprocates my_dir→peer_rank.
+
+    Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
+    pointing back to us. This matters in 2-rank bidirectional rings
+    where both E and W on one side point to the same peer — without
+    the preference, the first-match-wins iteration would route data
+    into the wrong rx slot. Falls back to any direction pointing back
+    for topologies without an opposite convention (tree_binary's
+    parent/child).
+    """
+    nt = neighbor_table[peer_rank]
+    opp = _OPPOSITE_DIR.get(my_dir)
+    if opp is not None and nt.get(opp) == my_rank:
+        return opp
+    for d, target in nt.items():
+        if target == my_rank:
+            return d
+    return None
+```
+
+호출부:
+
+```python
+for d, peer_rank in nbrs.items():
+    peer_dir = reverse_direction(r, peer_rank, d)  # my_dir 전달
+    if peer_dir is None:
+        continue
+    ...
+```
+
+### D2. Runtime — `_handle_meta_arrival` dst_addr 매칭
+
+`src/kernbench/components/builtin/pe_ipcq.py`:
+
+```python
+def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
+    """Match incoming token to the receiver-side direction by dst_addr range.
+
+    Each direction owns a unique rx buffer address range
+    [my_rx_base_pa, my_rx_base_pa + n_slots * slot_size). The token's
+    dst_addr (set by the sender's IPCQ when computing the peer's slot
+    address) falls within exactly one such range, so this matching stays
+    unambiguous even when several directions share a peer (2-rank ring).
+    """
+    token = msg.token
+    dst_addr = token.dst_addr
+    for d, qp in self._queue_pairs.items():
+        base = qp["my_rx_base_pa"]
+        size = qp["n_slots"] * qp["slot_size"]
+        if base <= dst_addr < base + size:
+            qp["peer_head_cache"] = max(qp["peer_head_cache"],
+                                         token.sender_seq + 1)
+            self._arrived_tokens.setdefault(d, []).append(token)
+            waiters = self._recv_waiters.get(d, [])
+            self._recv_waiters[d] = []
+            for ev in waiters:
+                if not ev.triggered:
+                    ev.succeed()
+            any_waiters = self._any_recv_waiters
+            self._any_recv_waiters = []
+            for ev in any_waiters:
+                if not ev.triggered:
+                    ev.succeed()
+            return
+    # Unknown dst_addr — diagnostic log (should not happen under correct install)
+```
+
+Sender 좌표 검사는 **제거**. `dst_addr`가 이미 direction을 결정.
+
+### D3. Credit — `dst_rx_base_pa` 필드 추가
+
+`src/kernbench/common/ipcq_types.py`:
+
+```python
+@dataclass(frozen=True)
+class IpcqCreditMetadata:
+    consumer_seq: int
+    dst_rx_base_pa: int       # NEW: 원 sender의 peer.rx_base_pa와 매칭용
+    # 기존 필드 (diagnostic / log 용도로 유지)
+    src_sip: int
+    src_cube: int
+    src_pe: int
+    src_direction: str
+```
+
+Credit 생성 시 (`_delayed_credit_send`): 자기 direction의 `my_rx_base_pa`를
+`dst_rx_base_pa`로 실어 보냄 (이게 상대방이 sender 당시 썼던 `peer.rx_base_pa`).
+
+수신 측 (`_credit_worker`):
+
+```python
+def _credit_worker(self, env):
+    while True:
+        credit = yield self._credit_inbox.get()
+        for d, qp in self._queue_pairs.items():
+            # peer의 rx_base_pa와 credit의 dst_rx_base_pa가 일치하는 qp 찾기
+            if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
+                qp["peer_tail_cache"] = max(qp["peer_tail_cache"],
+                                              credit.consumer_seq)
+                waiters = self._send_waiters.get(d, [])
+                self._send_waiters[d] = []
+                for ev in waiters:
+                    if not ev.triggered:
+                        ev.succeed()
+                break
+```
+
+Sender 좌표 검사 제거. `dst_rx_base_pa` 매칭으로 unambiguous.
+
+### D4. `IpcqInitEntry`에 `peer_direction` 필드를 **추가하지 않음**
+
+ADR-0025 rev 1에서 제안했던 `IpcqInitEntry.peer_direction`은 **불필요**.
+이유:
+- Meta arrival은 dst_addr로 매칭 (D2)
+- Credit은 dst_rx_base_pa로 매칭 (D3)
+- qp에 peer_direction 저장 필요 없음
+- Install은 rx_base_pa 계산 시 내부적으로만 peer_dir 사용 (`reverse_direction`)
+
+IpcqInitEntry schema 변경 없음. Rev 1 대비 **단순화**.
+
+### D5. `IpcqDmaToken.src_direction` 유지 (diagnostic only)
+
+기존 `src_direction` 필드는 제거하지 않는다. 다음 용도로 유지:
+- Logging / trace: `KERNBENCH_CCL_TRACE=1` 출력의 `(rank, t, dir, nbytes)`
+- Diagnostics: pointer_dump 등에서 direction 표시
+- 미래 확장 여지
+
+Runtime matching은 `dst_addr`만 사용.
+
+### D6. Invariants (ADR-0023 I3 강화)
+
+**I3 (엄격)**: 각 방향 pair `(my_direction, peer_direction)`에 대해 my
+rx_base와 peer rx_base는 **별개의 direction slot**을 가리켜야 함. Install은
+이를 보장해야 한다 (reverse_direction opposite-preference).
+
+**I3.1 (신규)**: 모든 qp에 대해 `qp["my_rx_base_pa"]`와 `qp["peer"].rx_base_pa`는
+서로 disjoint한 주소 range를 점유한다 (다른 direction의 buffer는 절대 겹치지
+않음). 이것이 D2/D3의 주소-기반 매칭의 전제.
+
+Install time에 검증 가능:
+```python
+# ccl/install_plan.py: build_install_plans 끝에 assertion
+all_rx_ranges = set()
+for plan in plans:
+    for pe_install in plan.pe_installs:
+        for entry in pe_install.neighbors:
+            r = (entry.my_rx_base_pa,
+                 entry.my_rx_base_pa + plan.n_slots * plan.slot_size)
+            overlap = any(_ranges_overlap(r, e) for e in all_rx_ranges)
+            assert not overlap
+            all_rx_ranges.add(r)
+```
+
+---
+
+## Dependencies
+
+- **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023의 runtime 매칭 로직 수정
+  (D2, D3) + install heuristic 개선 (D1). IPCQ 프로토콜의 semantic layer
+  변경은 없음.
+- **ADR-0024** (launcher): 2-rank bidirectional ring이 실제 쓰이는 경우가
+  ADR-0024의 ws=SIP_count 모델. 본 ADR이 그 케이스를 작동시킴.
+- **ADR-0030** (PhysAddr transition, stub): **독립적** — ADR-0025의
+  주소-기반 매칭은 현재 synthetic 주소든 PhysAddr이든 동일하게 작동.
+
+---
+
+## Non-goals
+
+- **IPCQ 주소 체계를 PhysAddr로 전환**: ADR-0030 scope. 본 ADR은 주소가 어떻게
+  인코딩되는가와 무관.
+- **Multi-hop routing**: ADR-0023 D5의 single-hop DMA write 전제 유지.
+- **Unidir ring 특수화**: `ring_1d_unidir`는 direction 하나만 있으므로 본 버그
+  무관.
+
+---
+
+## Open questions
+
+- **주소 매칭 성능**: `_handle_meta_arrival`과 `_credit_worker`가 qp를 선형
+  순회 (max 4 direction). 성능 영향 무시 가능 수준. 문제 시 dict lookup으로
+  전환 가능 (`_qp_by_rx_base`).
+- **`IpcqDmaToken.src_direction` 필요성 재평가**: diagnostic 용도로만 남긴
+  필드를 계속 유지할지, 또는 logging 외부로 분리할지. 현재는 유지.
+- **Install-time invariant 검증 cost**: D6의 I3.1 검증은 O((N_PE × N_direction)^2).
+  대형 topology에서 느려질 수 있음 → interval tree 등 자료구조로 개선 가능.
+  단순 구현 먼저.
+
+---
+
+## Test strategy
+
+### T1. Unit — `reverse_direction` opposite-preference
+
+`tests/test_ccl_install.py` (확장):
+- Ring ws=2: `reverse_direction(0, 1, "E")` → "W", `reverse_direction(0, 1, "W")` → "E"
+- Ring ws=4: `reverse_direction(0, 1, "E")` → "W" (자연스러운 opposite)
+- Mesh 2×2: `reverse_direction(r, peer, "N")` → "S", "E" ↔ "W"
+- Tree binary: opposite 없는 direction (parent) → fallback 경로
+- Non-symmetric topology: opposite가 peer에 없고 다른 direction만 있는 경우
+
+### T2. Runtime — `_handle_meta_arrival` dst_addr 매칭
+
+`tests/test_pe_ipcq.py` (확장):
+- 2-rank pair install 후, E direction dst_addr로 meta arrival → E의 `peer_head_cache`
+  증가 (W는 불변)
+- W direction dst_addr로 meta arrival → W의 `peer_head_cache` 증가
+- 잘못된 dst_addr (어느 rx range에도 속하지 않음) → 에러 또는 silent drop
+  (결정 후 명시)
+
+### T3. Credit — `dst_rx_base_pa` 매칭
+
+`tests/test_pe_ipcq.py` (확장):
+- E direction send 후 peer가 consume → credit에 자기 W의 `my_rx_base_pa`
+  담아 송신 → sender의 E direction `peer_tail_cache` 증가
+- W direction도 동일
+
+### T4. E2E — 2-rank bidirectional ring
+
+`tests/test_ipcq_e2e.py`:
+- 2-rank ring_1d로 tl.send(E) + tl.recv(W) pattern이 양방향으로 작동
+- ADR-0024의 `test_ccl_allreduce_matrix.py`에서 ring at ws=2가 통과
+
+### T5. Install invariant — rx_base range disjointness
+
+`tests/test_ccl_install_plan.py` (확장):
+- I3.1 검증: `build_install_plans` 결과에서 모든 qp의 rx_base range가 disjoint
+
+### T6. 회귀
+
+- 기존 ws≥3 ring / mesh / tree 테스트 그대로 통과
+- `test_pe_ipcq`, `test_ipcq_e2e` 기존 케이스 회귀
+
+---
+
+## Consequences
+
+### Positive
+
+- **단순함**: `peer_direction` 이중 메타데이터 제거. 주소가 single source of truth.
+- **Unambiguous matching**: 모든 topology (direction 중복 포함)에서 동작.
+- **Schema 변경 최소**: `IpcqInitEntry` 불변, `IpcqCreditMetadata`에 1 필드 추가.
+- **PhysAddr 전환 (ADR-0030) 독립**: 주소-기반 매칭은 주소 인코딩 방식과 무관.
+- **Diagnostic 유지**: `IpcqDmaToken.src_direction`은 로깅 용도로 존치.
+
+### Negative
+
+- Runtime 매칭이 주소 비교로 바뀌어서 디버깅 시 "왜 peer_head_cache[E]가 아닌
+  W가 업데이트됐나" 같은 질문에 address range를 추적해야 함 (기존엔 direction
+  이름으로 충분). 해결: pointer_dump에 "direction ↔ rx_base_pa" 매핑 포함.
+
+### Neutral
+
+- IPCQ protocol의 semantic layer (sender가 dst_addr 계산, receiver가 수신)는
+  불변.
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `src/kernbench/ccl/install.py` | D1: `reverse_direction`에 `my_dir` 인자 추가, opposite-preference |
+| `src/kernbench/components/builtin/pe_ipcq.py` | D2: `_handle_meta_arrival` dst_addr 매칭 / D3: `_credit_worker` dst_rx_base_pa 매칭 / `_delayed_credit_send`가 `dst_rx_base_pa` 필드 채움 |
+| `src/kernbench/common/ipcq_types.py` | D3: `IpcqCreditMetadata`에 `dst_rx_base_pa` 필드 추가 |
+| `src/kernbench/ccl/install_plan.py` (ADR-0024 신규) | D6: I3.1 invariant 검증 (optional) |
+| `docs/adr/ADR-0023-ipcq-pe-collective.md` | Reference note: runtime 매칭 방식이 ADR-0025에서 바뀜 |
+| `tests/test_ccl_install.py` | T1 |
+| `tests/test_pe_ipcq.py` | T2, T3 |
+| `tests/test_ipcq_e2e.py` | T4 |
+| `tests/test_ccl_install_plan.py` | T5 |
@@ -0,0 +1,476 @@
+# ADR-0026: DPPolicy = Intra-Device Only — sip/num_sips 필드 제거
+
+## Status
+
+Accepted (Revision 5 — Phase 2 landed 2026-04-14, 523 passed + 1 strict xfail)
+
+## Context
+
+### 목표
+
+`DPPolicy`를 **한 device(SIP) 내부의 cube × PE 분산**만 표현하는 순수한
+intra-device 추상화로 명확화한다. SIP 간 분산(TP)은 별도 레이어로 분리
+(ADR-0024의 `torch.ahbm.set_device(rank)` 또는 ADR-0027의 Megatron parallel
+layers가 담당).
+
+### 현재 상태
+
+`src/kernbench/policy/placement/dp.py`:
+
+```python
+@dataclass(frozen=True)
+class DPPolicy:
+    sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
+    cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
+    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
+    num_pes: int | None = None
+    num_cubes: int | None = None
+    num_sips: int | None = None    # ← 제거 대상
+```
+
+`sip` / `num_sips` 필드는 텐서를 SIP 경계 **너머**로 분산하는 경로를 제공함.
+이는:
+
+- **ADR-0024의 launcher 모델과 충돌**: ADR-0024는 "rank = SIP = 1 worker per SIP"
+  모델. 각 worker가 자기 SIP에 텐서를 생성. 텐서가 여러 SIP에 걸치는 경우는
+  Megatron-style TP가 개별 primitive로 처리해야 함.
+- **사용자 의도와 불일치**: "DPPolicy는 한 디바이스 내에서 PE들로 분산하는 방법"
+  (사용자 진술).
+- **개념 혼동**: `DPPolicy.sip="column_wise"`는 실제로 **TP**. 이름이 DP인데
+  하는 일은 TP → 신규 사용자에게 혼란.
+
+### 영향받는 call site (rollback 시점 grep 결과)
+
+**생성 사이트** (`DPPolicy(sip=...` 또는 `num_sips=...`):
+- `tests/test_runtime_api_tensor.py`
+- `benches/ccl_allreduce.py` (ADR-0024 scope 내에서 이미 개편됨)
+- `tests/test_va_offset.py`
+- `benches/va_offset_verify.py`
+- `tests/test_sip_parallel.py`
+
+**참조 사이트** (`dp.sip`, `policy.sip`, `num_sips` 등):
+- `src/kernbench/runtime_api/context.py` (`_create_tensor`, `launch`)
+- `src/kernbench/components/builtin/pe_cpu.py`
+- `src/kernbench/components/legacy/builtin/pe_cpu.py`
+- `src/kernbench/policy/placement/dp.py` (구현 자체)
+- `tests/test_tensor.py`, `tests/test_ipcq_types.py`
+
+**핵심 테스트**: `test_sip_parallel.py`는 이름 그대로 "SIP 병렬성을 DPPolicy로
+표현하는" 테스트. 이 ADR 이후 **새 launcher 모델로 재작성** 필요.
+
+---
+
+## Decision
+
+### D1. `DPPolicy`에서 `sip` + `num_sips` 필드 제거
+
+```python
+@dataclass(frozen=True)
+class DPPolicy:
+    """Intra-device (cube × PE) data-parallel policy.
+
+    SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
+    (ADR-0024 D10) and, for model-level TP, by Megatron-style parallel
+    layers (ADR-0027). DPPolicy does not cross SIP boundaries.
+    """
+    cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
+    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
+    num_pes: int | None = None
+    num_cubes: int | None = None
+```
+
+제거되는 필드: `sip`, `num_sips`.
+
+### D2. `ShardSpec` — structural (sip, cube, pe) 좌표, `pe_index` 완전 제거
+
+현재 `ShardSpec.pe_index`는 **global flat index** (`sip × cubes × pes + cube ×
+pes + pe`). 이는 ADR-0024 D11이 "abstraction leakage"로 지적한 형태.
+
+본 ADR에서 ShardSpec을 **structural 좌표로 재정의**하고, `pe_index`는
+property로도 **남기지 않는다**:
+
+```python
+# src/kernbench/policy/placement/dp.py (after)
+@dataclass(frozen=True)
+class ShardSpec:
+    """Structural shard placement — intra-SIP (cube × PE) coord.
+
+    Global-flat `pe_index` was removed in ADR-0026. Callers must use
+    structural coords (sip, cube, pe) directly. If a flat integer key is
+    needed (e.g. dict lookup), compute it explicitly at the call site.
+    """
+    sip: int              # structural — which SIP this shard lives on
+    cube: int             # local within SIP
+    pe: int               # local within cube
+    offset_bytes: int
+    nbytes: int
+```
+
+**핵심 원칙**:
+- ShardSpec의 정체성은 `(sip, cube, pe)` 3튜플.
+- **`pe_index` property도 없음** — silent semantics drift 차단.
+- Global flat을 기대한 기존 호출자는 `.pe_index` 접근 시 **즉시
+  `AttributeError`** → 반드시 구조적 좌표로 migration.
+- Flat integer key가 필요한 국소 문맥 (예: 내부 dict lookup)은 호출자가
+  명시적으로 `spec.sip * N_CUBES * N_PE + spec.cube * N_PE + spec.pe`를 계산.
+
+**Property 제거 정당화**: KernBench는 사내 프로젝트로 call site가 한정되어
+있음. Silent drift 위험 (의미만 바뀌고 타입은 같은 int) 대비 explicit breakage
+(AttributeError)가 훨씬 안전.
+
+### D3. `resolve_dp_policy`가 `target_sip`을 받아 structural 좌표 생성
+
+ADR-0024 D11의 계약 구현. Post-hoc shifting 없음.
+
+```python
+# src/kernbench/policy/placement/dp.py (after)
+
+@dataclass(frozen=True)
+class _LocalPeShard:
+    """Internal — PE resolver의 반환. Cube 내 local PE 식별자 + payload."""
+    local_pe: int                  # cube-local PE index (0..num_pe-1)
+    offset_bytes: int
+    nbytes: int
+
+
+def resolve_dp_policy(
+    policy: DPPolicy,
+    *,
+    shape: tuple[int, int],
+    itemsize: int,
+    num_pe: int,
+    num_cubes: int = 1,
+    target_sip: int,       # NEW — 어느 SIP에 배치할지 명시
+) -> list[ShardSpec]:
+    """2-level resolution (cube × PE) on a specified SIP.
+
+    Returns ShardSpecs with structural coords (sip=target_sip, cube, pe).
+    No SIP-level split — DPPolicy is intra-device only.
+    """
+    resolver = _PE_RESOLVERS[policy.pe]
+    all_shards: list[ShardSpec] = []
+
+    # Level 1: cube within SIP
+    cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
+
+    for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
+        # Level 2: PE within cube — resolver returns _LocalPeShard (local_pe)
+        local_shards = resolver(shape=cube_shape, itemsize=itemsize,
+                                 num_pe=num_pe)
+
+        for ls in local_shards:
+            all_shards.append(ShardSpec(
+                sip=target_sip,                   # from caller (current_device)
+                cube=cube_id,                     # local within SIP
+                pe=ls.local_pe,                   # local within cube (explicit name)
+                offset_bytes=cube_offset + ls.offset_bytes,
+                nbytes=ls.nbytes,
+            ))
+
+    return all_shards
+```
+
+**내부 resolver** (`column_wise`, `row_wise`, `replicate`)는 `_LocalPeShard`
+리스트 반환 — `local_pe` 필드명으로 **"cube-local PE identifier"임이 명시적**.
+과거 `ShardSpec.pe_index`와 이름이 혼동되던 문제 해소.
+
+**이름 규약 정리** (전체 ADR):
+- `ShardSpec.pe`: 최종 외부 API — cube-local PE (structural coord)
+- `_LocalPeShard.local_pe`: 내부 resolver 단계의 동일 의미
+- `pe_index`: **제거**. 외부/내부 어디에도 남기지 않는다 (silent drift 차단의
+  부가 효과: 이름 재등장 없음).
+
+### D4. `_create_tensor` — 구조적 좌표로 직접 placement
+
+ADR-0024 D11 연속선. Post-hoc shifting 제거, 구조적 좌표를 `resolve_dp_policy`
+호출 시점에 직접 지정.
+
+```python
+# context.py _create_tensor (after)
+current_sip = self.ahbm.current_device()
+if current_sip is None:
+    # Single-driver fallback (ADR-0024 D9와 일관).
+    # Launcher 기반 코드가 set_device()를 빼먹으면 조용히 SIP 0에 박히는
+    # 문제가 있음 → debug mode에서 경고.
+    if os.environ.get("KERNBENCH_DEBUG"):
+        import warnings
+        warnings.warn(
+            "torch.ahbm.current_device() is None; defaulting to SIP 0. "
+            "If this is a multi-rank launcher context, you likely forgot "
+            "torch.ahbm.set_device(rank) inside the worker.",
+            stacklevel=2,
+        )
+    current_sip = 0
+
+placement = resolve_dp_policy(
+    dp,
+    shape=shape_2d,
+    itemsize=itemsize,
+    num_pe=eff_num_pe,
+    num_cubes=eff_num_cubes,
+    target_sip=current_sip,          # ← 구조적 좌표 일차 지정
+)
+
+# placement의 각 ShardSpec은 이미 (sip=current_sip, cube=local, pe=local) 포함.
+# 과거의 post-hoc shifting 블록은 완전히 제거.
+```
+
+**모든** 텐서가 current device SIP에 배치됨. Multi-SIP 텐서를 만들고 싶으면
+ADR-0027의 TP primitive 사용.
+
+**Single-driver fallback의 trade-off**: set_device 없는 호출에서 SIP 0으로
+default는 기존 single-driver 테스트 호환을 위해 유지. `KERNBENCH_DEBUG=1`
+환경에서는 launcher 컨텍스트의 실수로 set_device 누락 시 조용히 잘못된 SIP에
+배치되는 것을 감지할 수 있도록 warning.
+
+### D5. Downstream — allocator lookup은 구조적 tuple key로
+
+기존 `deploy_tensor` (`src/kernbench/runtime_api/tensor.py`):
+
+```python
+for spec in placement:
+    alloc = allocators[spec.pe_index]       # ← AttributeError (property 제거됨)
+```
+
+`pe_index`가 없어졌으므로 구조적 좌표로 **강제** migration:
+
+```python
+for spec in placement:
+    alloc = allocators[(spec.sip, spec.cube, spec.pe)]
+```
+
+`_ensure_allocators`의 dict population도 tuple key로:
+
+```python
+# context.py _ensure_allocators (after)
+for sip_id in sip_range:
+    for cube_id in range(cubes_per_sip):
+        for pe_id in range(pes_per_cube):
+            self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
+                rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
+            )
+```
+
+`_free_tensor`도 동일: 기존 `flat_idx = sip * ... + cube * ... + pe` 계산
+블록 제거, `(shard.sip, shard.cube, shard.pe)` 직접 사용.
+
+**Tuple vs dataclass `PEIdentity`**: Tuple은 단순하고 hashable이라 dict key로
+바로 쓸 수 있으므로 권고. `PEIdentity` 값객체는 명시적 타입이라는 장점이 있지만
+boilerplate가 크고, 현재 용도가 allocator dict key 하나뿐이라 오버엔지니어링. Tuple 유지.
+
+### D6. Migration — 기존 call site
+
+**(A) `DPPolicy(sip=..., num_sips=..., ...)` 사용하던 코드**:
+
+- `DPPolicy(sip="column_wise", cube=..., pe=...)` 패턴 → **해당 bench를 ADR-0024
+  launcher로 재작성**. worker가 `set_device(rank)`로 SIP 선택, DPPolicy는
+  cube/PE만.
+- `DPPolicy(sip="replicate", num_sips=1, ...)` 패턴 → `DPPolicy(cube=..., pe=...)`로
+  축소 (필드가 사라지니 자연스럽게).
+
+**(B) `dp.sip`, `dp.num_sips` 읽던 코드**:
+
+- 제거. `launch()`의 `_compute_local_shape`에서 `dp.sip` 분기 삭제.
+- `pe_cpu.py`가 `dp.sip`을 참조하던 곳도 정리.
+
+**(C) `ShardSpec.pe_index`를 사용하던 코드 — 전부 수정 필요**:
+
+- `.pe_index` 접근은 이제 `AttributeError` 발생 → 모든 call site 수정 필수.
+- Allocator lookup: `allocators[spec.pe_index]` →
+  `allocators[(spec.sip, spec.cube, spec.pe)]`
+- Flat integer가 꼭 필요한 국소 문맥: `spec.sip * N_CUBES * N_PE + spec.cube *
+  N_PE + spec.pe` 명시적 계산. **국소 변수로만 사용하고 공개 API에 노출하지
+  않는다**.
+
+**구현 착수 전 grep audit 체크리스트**:
+
+1. **Property 참조**:
+   - `\.pe_index\b` — 필드/property 접근 모두 (regex)
+   - `pe_index=` — 생성 시점의 키워드 인자
+   - `pe_index:` — dataclass 필드 선언
+2. **Allocator / dict indexing**:
+   - `allocators\[` — dict lookup 패턴. `allocators[spec.pe_index]` 같은
+     것이 걸리는지
+   - `_allocators\[` — 같은 패턴 (prefix _)
+3. **Flat index 수동 계산 블록**:
+   - `flat_idx =`
+   - `pe_index =` (좌변)
+   - `* pes_per_cube +` (전형적 flat 계산 패턴)
+   - `* self._num_cubes * self._pes_per_cube` (global flat 계산)
+4. **Serialization / logging**:
+   - `asdict(.*shard` — dataclass 직렬화 시 `pe_index` 자동 포함 여부
+   - `repr(.*ShardSpec` — 로그 포맷에서 의존하는지
+   - JSON/YAML 저장 포맷에서 `pe_index` 키 사용 여부
+5. **Tests asserting integer PE identity**:
+   - `assert .*pe_index` — 정수 동일성 주장
+   - `spec.pe_index ==` — 비교 (`pe_index` 제거 후에는 `AttributeError` → 구조적 좌표 비교로 교체)
+
+각 match마다 "이 호출자가 global flat / SIP-local / 내부 lookup 중 무엇을
+기대했나"를 판단한 뒤 구조적 좌표로 교체.
+
+**(D) `test_sip_parallel.py`**:
+
+- 이름 유지, 내용은 ADR-0024의 multi-greenlet launcher 기반 재작성.
+- "SIP 병렬성 = rank 별 worker × 각자 DPPolicy" 로 검증.
+
+**(E) `test_va_offset.py`, `benches/va_offset_verify.py`**:
+
+- `num_sips=1`만 쓰는 경우가 대부분. 단순히 필드 제거.
+- SIP offset 테스트가 핵심이면 `set_device(rank)` + 구조적 좌표 관찰로 이식.
+
+### D7. 하위 호환 — 불가 (cleanup ADR)
+
+이 ADR은 **breaking change**.
+
+1. `DPPolicy(sip=...)` 또는 `DPPolicy(num_sips=...)` 호출 → `TypeError`
+2. `ShardSpec.pe_index` 접근 → `AttributeError`
+
+모두 **즉시 명시적 breakage**. Deprecation warning / fallback 경로 없음.
+KernBench는 사내 프로젝트로 call site가 한정되어 있어 한 번에 migration.
+
+**Silent drift 차단**이 property 완전 제거의 주된 이점: global flat을 기대한
+코드가 SIP-local 결과를 받아 조용히 잘못된 인덱싱을 할 가능성 제거.
+
+### D8. 문서 업데이트
+
+- `ADR-0008` (tensor deploy) — DPPolicy 의미 갱신 note, ShardSpec 구조적 좌표
+  전환 명시
+- DPPolicy docstring에 "intra-device only" 명시 (D1 코드 스니펫의 docstring)
+- ShardSpec docstring에 **structural coordinates `(sip, cube, pe)`를 직접
+  사용하며, `pe_index`는 더 이상 제공되지 않음**을 명시 (D2)
+- `docs/ccl-author-guide` 등 튜토리얼에서 `sip=...` 예시 제거
+
+---
+
+## Dependencies
+
+- **ADR-0024** (launcher): `set_device(rank)` 및 current-device scoping이
+  SIP 배치 메커니즘 제공. 본 ADR은 그 위에 서서 DPPolicy를 순수 intra-device로
+  좁힘.
+- **ADR-0027** (Megatron TP): 다중 SIP에 걸친 텐서가 필요한 경우의 대안 경로.
+  이 ADR 적용 후 multi-SIP use case는 ADR-0027로 이관.
+
+---
+
+## Non-goals
+
+- **`DPPolicy.cube` / `pe` 재설계**: 기존 replicate/column_wise/row_wise 의미
+  유지.
+- **Tiling 정책 통합**: `tiled_column_major` / `tiled_row_major`는 그대로.
+- **Multi-device 텐서 추상화 신규**: DTensor-like는 ADR-0028.
+
+---
+
+## Open questions
+
+- **`_create_tensor`의 current_sip 기본값**: set_device 없는 호출에서 rank=0
+  (SIP 0)로 fallback할지, 아니면 error 낼지. 권고는 fallback (기존 single-driver
+  테스트와의 호환).
+- **`test_sip_parallel.py` 재작성 범위**: 기존 단위 테스트의 의도를 유지하며
+  launcher 기반으로 옮기려면 추가 fixture 필요. 별도 작업으로 scope.
+- **`DPPolicy`의 `num_sips=None` 의미**: 필드가 없어지면 `num_sips` 개념 자체가
+  사라짐. Multi-SIP을 표현하고 싶으면 ADR-0027의 TP primitive를 쓰라는 것이
+  명시적 답.
+
+**Resolved (이전 rev에서 open이었던 것들)**:
+- ~~`ShardSpec.pe_index` property 존치 여부~~ → **완전 제거** (D2)
+- ~~`_ensure_allocators` dict key 형식~~ → **tuple `(sip, cube, pe)`** (D5)
+
+---
+
+## Test strategy
+
+### T1. 단위 테스트 갱신
+
+- `tests/test_tensor.py`, `tests/test_ipcq_types.py`, `tests/test_runtime_api_tensor.py`
+  — DPPolicy 생성자 인자 정리, ShardSpec 구조적 좌표 검증
+- `tests/test_va_offset.py` — `num_sips=1` 제거 후 동작 유지
+
+### T2. `resolve_dp_policy` 구조적 좌표 반환
+
+`tests/test_dp_policy.py` (new 또는 확장):
+- `resolve_dp_policy(dp, ..., target_sip=1)` 결과의 모든 ShardSpec이 `sip=1`
+- 각 spec의 `(cube, pe)`가 local (0..num_cubes-1, 0..num_pe-1)
+- 같은 topology에서 `target_sip=0`과 `target_sip=1` 결과가 sip 필드만 다름
+
+### T3. `test_sip_parallel.py` 재작성
+
+SIP 병렬성 검증을 launcher 기반으로:
+
+```python
+def test_sip_parallel_via_launcher(topology):
+    ...
+    def worker(rank, ws, torch):
+        torch.ahbm.set_device(rank)
+        t = torch.zeros((1, 128), dtype="f16",
+                         dp=DPPolicy(cube="column_wise", pe="column_wise"))
+        # verify shard.sip == rank (structural coord)
+
+    spawn(worker, nprocs=n_sips, ...)
+```
+
+### T4. Allocator key migration
+
+`tests/test_allocator_structural_key.py` (new 또는 기존 확장):
+- `PEMemAllocator` dict이 `(sip, cube, pe)` tuple key로 작동
+- `deploy_tensor`가 구조적 좌표로 allocator lookup
+- `_free_tensor`도 동일
+
+### T5. E2E 회귀
+
+ADR-0024의 `test_ccl_allreduce_matrix.py` 그대로 통과.
+
+### T6. 오류 검증
+
+- `DPPolicy(sip="column_wise")` 호출 → `TypeError`. 테스트로 명시.
+- `DPPolicy(num_sips=2)` 호출 → `TypeError`.
+- `spec.pe_index` 접근 → `AttributeError` (property 완전 제거 검증).
+
+---
+
+## Consequences
+
+### Positive
+
+- **개념 분리 명확**: DPPolicy = intra-device, TP = inter-device.
+- **API 단순화**: DPPolicy 생성자 필드 ~33% 축소.
+- **Structural 좌표 일관성**: ShardSpec이 `(sip, cube, pe)` 튜플로 표현 →
+  abstraction leakage 해소 (ADR-0024 D11 계약 충족).
+- **`pe_index` 모호성 제거**: 필드 자체가 사라져 해석 여지가 없음. Flat index가 필요하면 호출자가 명시적으로 계산.
+- **Launcher 모델 일관성**: ADR-0024의 "1 worker per SIP" 모델이 유일한 SIP
+  경계 제어 메커니즘.
+
+### Negative
+
+- **Breaking change (explicit)**: `DPPolicy(sip=...)` → `TypeError`,
+  `spec.pe_index` → `AttributeError`. 모든 호출자 한 번에 수정 필요.
+- **ShardSpec schema 변경**: `pe_index` 단일 필드 → `sip`/`cube`/`pe` 세 필드.
+  Downstream (`deploy_tensor`, `_free_tensor`, `_ensure_allocators`,
+  `allocators` dict key 등) 연쇄 수정.
+- **Silent drift 없음**: property 완전 제거로 runtime에서 즉시 실패 →
+  migration leakage 원천 차단. (Negative가 아니라 explicit tradeoff)
+- `test_sip_parallel.py` 재작성 비용.
+
+### Neutral
+
+- 기존 `cube` / `pe` 필드 의미 불변.
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `src/kernbench/policy/placement/dp.py` | D1: `sip`/`num_sips` 제거 / D2: `ShardSpec`에 `sip`/`cube`/`pe` structural fields 추가, **`pe_index` 완전 제거 (property로도 남기지 않음)** / D3: `resolve_dp_policy`에 `target_sip`, SIP-level 루프 제거 / 내부 resolver의 반환 타입을 `_LocalPeShard` (필드 `local_pe`)로 명확화 (이름 충돌 방지) |
+| `src/kernbench/runtime_api/context.py` | D4: `_create_tensor` `target_sip` 전달 / D5: `_ensure_allocators` dict key → `(sip, cube, pe)` tuple / `launch`의 `dp.sip` 분기 제거 |
+| `src/kernbench/runtime_api/tensor.py` | D5: `deploy_tensor`가 구조적 좌표로 allocator lookup |
+| `src/kernbench/components/builtin/pe_cpu.py` | D6: `dp.sip` 참조 제거 |
+| `src/kernbench/components/legacy/builtin/pe_cpu.py` | D6: 동일 |
+| `benches/ccl_allreduce.py` | ADR-0024 scope에서 이미 처리 |
+| `benches/va_offset_verify.py` | D6: `num_sips=1` 제거 |
+| `tests/test_runtime_api_tensor.py` | D6 |
+| `tests/test_va_offset.py` | D6 |
+| `tests/test_tensor.py`, `tests/test_ipcq_types.py` | D6 |
+| `tests/test_sip_parallel.py` | T3: launcher 기반 재작성 |
+| `tests/test_dp_policy.py` (new 또는 확장) | T2 |
+| `tests/test_allocator_structural_key.py` (new) | T4 |
@@ -0,0 +1,171 @@
+# ADR-0028: DTensor Support — 선언적 분산 텐서 (Stub / Future)
+
+## Status
+
+Stub (Future Work)
+
+## Context
+
+### 목표
+
+**선언적 분산 텐서 추상화**(PyTorch 2.x `DTensor` 스타일)를 KernBench에
+도입하기 위한 **디자인 공간 preliminary exploration**. 본 ADR은 **구현 계획이
+아닌 future 작업의 파일 플레이스홀더 + 초기 질문 목록**이다.
+
+### Megatron-style TP와의 차이 (Why DTensor)
+
+| 관점 | Megatron (ADR-0027) | DTensor (이 ADR) |
+|---|---|---|
+| 표현 | 명시적 parallel layer | 텐서 + placement spec |
+| 호출 형태 | `ColumnParallelLinear(...)` | `distribute_tensor(x, mesh, [Shard(1)])` |
+| Collective 삽입 | 레이어 내부 명시 | 연산 dispatch가 자동 |
+| Learning curve | 낮음 (명시적) | 중~높음 (선언적 의미 이해) |
+| 유연성 | 레이어 단위로 고정 | 레이어 경계 무관, 어디서나 |
+| KernBench에 선행 필요한 것 | launcher (ADR-0024) + TP (ADR-0027) | 왼쪽 항목 + operator dispatch layer 신규 구축 |
+
+DTensor는 operator-level에서 "텐서의 placement를 보고 자동으로 collective
+삽입". KernBench가 이를 지원하려면 **operator dispatch layer에 placement-aware
+rewriting**이 들어가야 한다. 이는 비-trivial.
+
+### 현재 상태
+
+- KernBench는 operator dispatch 레이어가 없음 (`torch.matmul`은 없음; kernel
+  launch로 대체).
+- DPPolicy는 정적 placement metadata를 보유 (ADR-0026 후: intra-device only).
+- ADR-0024 launcher가 rank / device 개념 제공.
+- Megatron-style TP (ADR-0027)가 명시적 대안으로 기능할 것.
+
+---
+
+## Preliminary decision space
+
+### DQ1. PyTorch DTensor API 수용 범위
+
+- `DeviceMesh`: rank들의 논리적 grid.
+- `Placements`: `Shard(dim)`, `Replicate()`, `Partial(reduce_op)`.
+- `distribute_tensor(tensor, device_mesh, placements)`: local tensor → DTensor.
+- Redistribute: `dt.redistribute(new_placements)`로 collective 자동 삽입.
+- Operator forward: `dt @ dt`, `dt + dt` 등 → 적절한 collective 자동 dispatch.
+
+KernBench가 어느 수준까지 지원할지 결정 필요. 최소: `distribute_tensor` +
+`redistribute`. 최대: 모든 operator overloading.
+
+### DQ2. Operator dispatch 레이어
+
+KernBench에서 `dt @ dt`를 정의하려면 Tensor의 `__matmul__`이 placement를
+보고 적절한 action 수행:
+
+- 둘 다 replicated → local matmul
+- A column-sharded, B row-sharded → local matmul + all-reduce (RowParallel)
+- A replicated, B column-sharded → local matmul (ColumnParallel)
+- etc.
+
+이는 Megatron-style의 **자동화된 버전**. Kernel은 기존 matmul kernel 사용.
+
+### DQ3. DeviceMesh와 기존 topology
+
+KernBench topology는 이미 SIP/cube/PE 계층. DTensor의 DeviceMesh는 추상
+`(tp_size, dp_size, ...)` grid. 매핑:
+
+- 1D mesh of size = SIP count → rank = SIP
+- 2D mesh (tp × dp) → SIP을 그룹 분할 (pure TP 대신 mixed parallelism)
+
+초기엔 1D mesh만, DP × TP 2D는 future.
+
+### DQ4. Placement의 intra-device (DP) 통합
+
+KernBench 특이점: 한 rank 내부에서 DPPolicy로 cube/PE에 분산. DTensor는
+device 내부를 보지 않음. 통합:
+
+- DTensor placement = rank (SIP) 간 분산
+- 각 rank의 local tensor는 여전히 DPPolicy로 cube/PE 배치
+- → DTensor wrapper가 local tensor의 DPPolicy도 보관
+
+### DQ5. Collective 자동 삽입 지점
+
+`redistribute` 또는 operator forward 시. ADR-0024의 submit+yield+wait 패턴을
+자동으로 호출하는 형태. `_launch_submit` 내부화.
+
+### DQ6. Autograd
+
+DTensor는 autograd와 상호작용 (backward에서 reverse collective). KernBench가
+backward 지원하기 전까지는 **forward-only DTensor**.
+
+---
+
+## Open questions (to resolve before real design)
+
+1. **우선순위**: Megatron-style(ADR-0027)이 먼저 안착한 후 DTensor를 위에
+   얹는가, 아니면 공통 lower-layer를 먼저 설계하는가?
+2. **호환성 목표**: PyTorch DTensor API와 몇 %까지 일치시키는가? 독자 API vs
+   거의 동일?
+3. **Operator dispatch**: KernBench `Tensor` 클래스에 `__matmul__` 등 연산자
+   overloading을 도입하는가? (현재는 kernel launch만)
+4. **Redistribute 정책**: `Shard(0) → Replicate()` 변환 시 어떤 collective
+   사용? `all_gather`가 없으면 구현 전까지 제약.
+5. **Mesh × DPPolicy interaction**: 하나의 DTensor가 2개 layer 분산을 갖는
+   경우의 metadata 표현.
+6. **Partial placement의 reduce 시점**: 자동 vs 명시 `redistribute` 호출.
+7. **Bench authoring impact**: 기존 Megatron-style bench가 DTensor 기반으로
+   얼마나 쉽게 포팅되는가?
+
+---
+
+## Non-goals (for future real ADR)
+
+- 이번 stub에서 API를 확정하는 것 (future ADR에서 구체화).
+- Implementation timeline. 이번 round에서는 **설계 공간 매핑만**.
+
+---
+
+## Dependencies (potential)
+
+- **ADR-0024** (launcher): rank / device 기반
+- **ADR-0026** (DPPolicy cleanup): DTensor placement와의 분리 명확화
+- **ADR-0027** (Megatron TP): 실용 TP 패턴 경험을 DTensor 설계로 환류
+- **Future ADR** (operator dispatch layer): KernBench Tensor에 operator
+  overloading 도입
+
+---
+
+## Expected consequences (hypothetical)
+
+### Positive
+
+- PyTorch training code 이식이 **매우 쉬워짐** (DTensor 코드 그대로).
+- TP + DP + 더 복잡한 parallelism을 **하나의 추상화**로 표현.
+- Collective 삽입이 자동 → bench 작성자 부담 감소.
+
+### Negative
+
+- Operator dispatch layer 신규 구축 → 상당한 엔지니어링.
+- Implicit behavior 증가 → 디버깅 / 성능 분석 복잡.
+- KernBench의 "명시적 kernel launch" 철학과 tension.
+
+---
+
+## Action
+
+- **Phase 1 (현재)**: 본 stub 유지. Megatron-style (ADR-0027) 먼저 구현 +
+  사용 경험 축적.
+- **Phase 2 (future)**: 사용 경험을 바탕으로 본 ADR을 real design으로 승격.
+  위 Open questions에 대한 답을 제시.
+- **Phase 3 (future)**: Implementation.
+
+현재 구현 작업은 **없음**. 디자인 공간 매핑만.
+
+---
+
+## Affected files
+
+본 ADR은 **stub**이므로 production 변경 없음. Future real ADR에서 갱신될
+파일 후보:
+
+| File | 예상 변경 (future) |
+|------|---|
+| `src/kernbench/dtensor/__init__.py` | 신규 패키지 |
+| `src/kernbench/dtensor/device_mesh.py` | DeviceMesh |
+| `src/kernbench/dtensor/placements.py` | Shard/Replicate/Partial |
+| `src/kernbench/dtensor/api.py` | distribute_tensor, redistribute |
+| `src/kernbench/dtensor/ops/*.py` | Operator dispatch (matmul 등) |
+| `src/kernbench/runtime_api/tensor.py` | Tensor에 `__matmul__` 등 추가 |
@@ -0,0 +1,419 @@
+# ADR-0029: Hierarchical All-Reduce — 3-level intra/inter-SIP 알고리즘
+
+## Status
+
+Proposed
+
+## Context
+
+### 목표
+
+"Rank = SIP" 모델 (ADR-0024) 위에서 각 SIP 내부의 모든 PE를 참여시키는
+**3-level 계층 all-reduce** 알고리즘을 정의한다. 각 레벨이 서로 다른 물리
+연결(intra-cube ring, inter-cube NoC, inter-SIP UCIe)을 활용해 대역폭을
+극대화한다.
+
+### 왜 hierarchical인가
+
+단순 ring/mesh/tree all-reduce는 SIP당 1 PE만 참여 (ADR-0024의 `leader_only`
+mapper). 이는 inter-SIP 단계는 잘 모델링하지만:
+
+- **Intra-SIP PE가 노는 시간이 발생**. Leader PE가 inter-SIP 통신 중이면
+  나머지 7 PE / 16 cube는 유휴.
+- **Intra-cube/inter-cube 연결 대역폭 미활용**. Cube NoC는 매우 빠르지만
+  단일 leader 사용 시 이 자원이 노출되지 않음.
+- **실제 NCCL 등은 hierarchical**: NVLink(intra-node) + InfiniBand(inter-node)
+  의 bandwidth 차이를 활용. KernBench 토폴로지도 동일 구조
+  (intra-cube / inter-cube / inter-SIP의 bandwidth·latency 차이).
+
+### 현재 상태
+
+- `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` 이미 존재
+  (git log `10b33b4` — "Tensor indexing + hierarchical 3-level all-reduce
+  kernel"). PE-level로 world_size = total PE를 가정하는 옛 모델 기반 구현.
+- ADR-0024에 의해 launcher는 rank = SIP로 바뀜.
+- Hierarchical 커널은 **재해석 필요**: 이제 각 worker(1 per SIP)가 자기 SIP의
+  모든 PE를 참여시키고, kernel은 intra-cube → inter-cube → inter-SIP 순으로
+  3-level reduce + broadcast.
+
+### 풀어야 할 문제
+
+1. **ADR-0024 framework 위에 hierarchical 알고리즘 맞추기**
+   - Mapper: `all_pes` (ADR-0024 D5 제공)
+   - Validator: `multi_pe_sip_local` (ADR-0024 D8 제공)
+   - Kernel: 기존 `hierarchical_allreduce.py` 수정 — rank 계산 방식을 SIP 내
+     local (cube, pe)로 바꿈
+2. **PE-level neighbor graph 생성**
+   - Intra-cube: `(sip, cube, pe) ↔ (sip, cube, pe±1 mod N_PE)` (ring 내부)
+   - Inter-cube: `(sip, cube, 0) ↔ (sip, cube±1 mod N_CUBE, 0)` (cube leader만)
+   - Inter-SIP: `(sip, 0, 0) ↔ (sip±1 mod N_SIP, 0, 0)` (SIP leader만)
+3. **Tensor layout**: 각 PE가 1 tile을 소유하고 시작 (`multi_pe_sip_local`
+   validator가 이 layout 강제). DPPolicy(cube="column_wise",
+   pe="column_wise")로 달성 가능.
+4. **PE-level topology 표현 부족** (ADR-0024 D6의 "책임 분산" 이슈 구체화)
+   - Ring/mesh/tree 같은 단순 패턴은 rank-level topology_fn + mapper 조합으로
+     충분.
+   - Hierarchical은 레벨마다 다른 peer 매핑이라 `_build_pe_installs`에서
+     multi-level 해석을 해야 함.
+   - 장기적으로는 topology 모듈이 PE-level을 직접 표현하는 편이 명시적.
+
+### Non-problem (이 ADR 밖)
+
+- Launcher / barrier / rank-to-SIP / mapper-validator registry → ADR-0024
+- IPCQ direction addressing → ADR-0025
+- DPPolicy 필드 정리 → ADR-0026
+- Megatron TP → ADR-0027
+
+---
+
+## Decision
+
+### D1. 알고리즘 구조 — 3-level reduce + 역순 broadcast
+
+```
+Level 1 (intra-cube, E/W ring):
+  각 cube의 N_PE개 PE가 bidirectional ring reduce → cube 내 PE 0에 부분합 집중
+Level 2 (inter-cube within SIP, N/S ring, PE 0만 참여):
+  N_CUBE개 cube-leader가 bidirectional ring reduce → SIP 내 (cube 0, PE 0)에
+  SIP 전체 부분합 집중
+Level 3 (inter-SIP, N_SIP peers, (cube 0, PE 0)만 참여):
+  Ring 또는 pair exchange로 전역 합산 완료
+Broadcast:
+  역순 — Level 3 결과를 (cube 0, PE 0)에서 SIP 내 모든 cube-leader로, 다시
+  각 cube 내 모든 PE로 전파
+```
+
+세부는 기존 `hierarchical_allreduce.py`의 커널 구현과 일치. ADR-0024 이후
+변경점은 **rank 계산 방식**과 **n_elem 해석**뿐:
+
+- 기존 (rank=PE 모델): `rank = cube_id * pes_per_cube + local_pe`, `pe_addr =
+  t_ptr + rank * nbytes`
+- 신규 (rank=SIP 모델): 커널은 SIP-local 좌표 `(cube_id, local_pe)`로만 동작.
+  텐서의 per-PE slice는 backend가 per-PE `TensorArg`로 전달 (ADR-0024 D3).
+  커널 내부 rank 계산 자체가 불필요해짐 — `tl.program_id(0/1)`로 충분.
+
+### D2. Framework integration — ADR-0024 infrastructure 재활용
+
+`ccl.yaml`:
+
+```yaml
+algorithms:
+  hierarchical_allreduce:
+    module: kernbench.ccl.algorithms.hierarchical_allreduce
+    topology: hierarchical_3level        # NEW — D3 참고
+    mapper: all_pes                      # ADR-0024 D5 built-in
+    validator: multi_pe_sip_local        # ADR-0024 D8 built-in
+    buffer_kind: tcm
+    n_elem: 128
+```
+
+Framework 관점에서 hierarchical은 **특별한 알고리즘이 아니라, 특정
+topology / mapper / validator 조합**. 본 ADR은 그 조합과 topology 패턴을
+정의.
+
+### D3. `hierarchical_3level` topology (신규)
+
+`src/kernbench/ccl/topologies.py`에 신규 추가:
+
+```python
+def hierarchical_3level(rank: int, world_size: int, spec: dict) -> dict:
+    """3-level hierarchical neighbor pattern.
+
+    Returns a nested structure describing intra-cube + inter-cube + inter-SIP
+    neighbors. Unlike ring_1d / mesh_2d which are rank → {dir: peer_rank},
+    hierarchical is PE-level and requires spec for cube_mesh / pe_layout.
+    """
+```
+
+반환 스키마 (초안):
+
+```python
+{
+    "intra_cube": {
+        # 각 cube 내 ring neighbors: (cube, pe) → {"E": (cube, pe_e), "W": (cube, pe_w)}
+        ...
+    },
+    "inter_cube": {
+        # cube-leader 간 ring: (cube, 0) → {"N": (cube_n, 0), "S": (cube_s, 0)}
+        ...
+    },
+    "inter_sip": {
+        # SIP-leader 간: rank → {"parent": peer_rank} (또는 ring 방식)
+        ...
+    },
+}
+```
+
+이 구조는 `_build_pe_installs`가 해석하여 각 PE의 neighbor table 엔트리
+(4-direction)에 대응시킨다.
+
+**Rank-level `topologies.py` 현 API와의 관계**: 기존 단순 패턴은
+`(rank → {dir: peer_rank})` 단일 레벨. Hierarchical은 multi-level이므로
+기존 API와 schema가 다름. `_resolve_topology`는 **알고리즘이 어떤 schema를
+쓰는지 선언**하고, builder가 그에 맞춰 해석하도록 확장 필요 (open question).
+
+### D4. PE-level neighbor graph — `_build_pe_installs` 확장
+
+기존 (ring/mesh/tree): topology_fn이 반환한 `(rank → {dir: peer_rank})`를
+각 참여 PE에 그대로 매핑 (leader_only일 경우 peer PE도 leader).
+
+신규 (hierarchical): `hierarchical_3level`의 3단 구조를 per-PE neighbor
+table로 펼침:
+
+```python
+def _build_pe_installs_hierarchical(rank, world_size, sip, pes, topo, spec):
+    """Hierarchical 전용 PE neighbor table 빌더."""
+    result = []
+    for (cube, pe) in pes:
+        entries = []
+        # Level 1: intra-cube ring (E/W)
+        for d, peer in topo["intra_cube"][(cube, pe)].items():
+            entries.append(NeighborTableEntry(direction=d, ...))
+        # Level 2: inter-cube ring (N/S) — cube leader (pe == 0)만
+        if pe == 0:
+            for d, peer in topo["inter_cube"][(cube, 0)].items():
+                entries.append(NeighborTableEntry(direction=d, ...))
+        # Level 3: inter-SIP — SIP leader (cube == 0 and pe == 0)만
+        if cube == 0 and pe == 0:
+            for d, peer_rank in topo["inter_sip"][rank].items():
+                # peer_rank → peer SIP의 (0, 0)
+                entries.append(NeighborTableEntry(
+                    direction=d, peer_sip=peer_rank, peer_cube=0, peer_pe=0, ...))
+        result.append(PeInstallSpec(cube=cube, pe=pe, neighbors=tuple(entries)))
+    return tuple(result)
+```
+
+`build_install_plans`에서 algorithm_config의 `topology`에 따라 적절한 builder
+선택 (기존 simple builder vs hierarchical builder).
+
+### D5. Kernel 재해석 — SIP-local 좌표로
+
+`src/kernbench/ccl/algorithms/hierarchical_allreduce.py`를 ADR-0024 D3에
+맞춰 수정:
+
+```python
+def kernel_args(*, n_elem: int, world_size: int, pes_per_cube: int,
+                cubes_per_sip: int, num_sips: int, **kw) -> tuple:
+    """world_size (= num_sips), pes_per_cube, cubes_per_sip를 스칼라로."""
+    return (n_elem, pes_per_cube, cubes_per_sip, num_sips)
+
+def kernel(t_ptr, n_elem, pes_per_cube, cubes_per_sip, num_sips, tl):
+    """SIP-local 좌표 기반.
+
+    이전 (rank=PE 모델):
+        rank = cube_id * pes_per_cube + local_pe
+        pe_addr = t_ptr + rank * nbytes
+    현재 (rank=SIP 모델):
+        per-PE tensor slice는 backend가 TensorArg로 전달 → t_ptr은 이미 local.
+        intra-cube ring은 tl.program_id(0) 사용.
+        inter-cube ring은 pe_id == 0 조건으로 제한.
+        inter-SIP reduce는 cube_id == 0 and pe_id == 0 조건으로 제한.
+    """
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+
+    # Level 1: intra-cube ring
+    for _ in range(intra_rounds(pes_per_cube)):
+        tl.send(dir="E", src=acc)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+
+    # Level 2: inter-cube (cube leader only)
+    if local_pe == 0:
+        for _ in range(inter_cube_rounds(cubes_per_sip)):
+            tl.send(dir="N", src=acc)
+            recv = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
+            acc = acc + recv
+
+    # Level 3: inter-SIP (SIP leader only)
+    if local_pe == 0 and cube_id == 0:
+        for _ in range(inter_sip_rounds(num_sips)):
+            tl.send(dir="parent", src=acc)
+            recv = tl.recv(dir="parent", shape=(n_elem,), dtype="f16")
+            acc = acc + recv
+
+    # Broadcast (reverse chain)
+    # ...
+    tl.store(t_ptr, acc)
+```
+
+`kernel_args`는 ADR-0024 D4의 keyword-only signature 계약을 따른다.
+
+### D6. Validator — `multi_pe_sip_local`
+
+ADR-0024 D8의 built-in 그대로 활용. `ccl.yaml`에서 `validator:
+multi_pe_sip_local` 지정 시 backend가 각 SIP에 `cubes × pes_per_cube`개
+shard가 있는지 검증.
+
+### D7. Bench — 기본 all-reduce bench 확장
+
+`benches/ccl_allreduce.py`의 worker는 `ccl.yaml`이 `hierarchical_allreduce`를
+선택하면 자동으로:
+
+```python
+# Worker 예
+dp = DPPolicy(cube="column_wise", pe="column_wise")
+tensor = torch.zeros((1, intra_sip_pes * n_elem), dp=dp, name="in")
+# tensor는 각 SIP의 모든 PE에 1 tile씩 분산 (multi_pe_sip_local validator 통과)
+dist.all_reduce(tensor, op="sum")
+```
+
+Worker 코드 자체는 알고리즘 종류를 모름 (`ccl.yaml` 선택에 의존). 단,
+**DPPolicy가 hierarchical 요구와 일치해야** 함 — `cube/pe="column_wise"`
+같은 SIP-내 분산을 하는 DPPolicy여야 `multi_pe_sip_local` 검증 통과. 이
+DPPolicy 선택은 bench 설정 또는 sample bench에서 결정.
+
+---
+
+## Dependencies
+
+- **ADR-0024**: Launcher, `all_pes` mapper, `multi_pe_sip_local` validator,
+  registry + import path. 본 ADR 구현의 전제.
+- **ADR-0025**: IPCQ direction addressing — cube/pe/SIP 간 다중 direction을
+  동시 사용하므로 정확한 direction 매칭 필수.
+- **ADR-0023**: IPCQ protocol (neighbor table, send/recv, credit return).
+- **기존 `hierarchical_allreduce.py`**: 본 ADR은 그 커널의 재해석 + 주변
+  framework integration.
+
+---
+
+## Non-goals
+
+- **ADR-0024 framework 변경**: 재활용만.
+- **Alternative reduce topology (tree-in-tree 등)**: 3-level ring이 첫 구현.
+- **Dynamic level count**: 현재 SIP/cube/PE 3단 고정. 2단 (SIP + PE, cube
+  skip) 또는 4단 이상은 future.
+- **Bandwidth-optimal schedule tuning**: reduce round 수 / chunk size 조정
+  같은 tuning은 별도.
+- **Pipelined hierarchical**: 여러 chunk를 파이프라인으로 겹쳐서 돌리는
+  NCCL-style 최적화는 future.
+
+---
+
+## Open questions
+
+### 🟠 중간 영향 — 구현 시 결정 필요
+
+- **`topologies.py` 스키마 확장**: 기존 `ring_1d` 등은 단일 레벨 `(rank →
+  {dir: peer})`. `hierarchical_3level`은 multi-level. `_resolve_topology`가
+  둘을 모두 반환할 수 있도록 schema를 일반화할지, 아니면 hierarchical 전용
+  return type을 두고 builder가 분기할지.
+  - Option A: 모든 topology를 neighbor-list 형태로 단일화
+    (`[{direction, peer_sip, peer_cube, peer_pe}, ...]`)
+  - Option B: topology 모듈이 `kind` 필드 제공, builder가 분기
+  - 권장: Option A (single source of truth, ADR-0024 Open Q의
+    "PE-level topology 일원화" 방향과 일치)
+
+- **`hierarchical_3level` vs algorithm별 topology 모듈**: 향후 mesh-based
+  hierarchical 등 variant이 생기면? `hierarchical_3level` 같은 이름이 이미
+  topology-specific. 변형은 새 key 추가 (`hierarchical_mesh_3level` 등) 또는
+  알고리즘 모듈에서 topology 생성 override.
+
+### 🟡 Nice-to-have
+
+- **Reduce round 수 최적화**: Bidirectional ring은 `ceil((N-1)/2)` round.
+  Non-power-of-2 group size에서 idle PE 발생 가능.
+- **Non-uniform topology 대응**: cube_mesh가 w != h일 때 inter-cube ring
+  balance.
+- **Single SIP 케이스**: world_size = 1 (SIP 1개)일 때 Level 3 skip. Degenerate
+  case 검증.
+
+### 🟢 Framework evolution 시사점 (ADR-0024로부터 이관)
+
+- **PE-level topology 일원화 (중장기)**: 현 설계는
+  - topology (rank graph 또는 level-separated)
+  - mapper (per-SIP PE set)
+  - `_build_pe_installs` (actual edges)
+
+  의 3단 분산. Hierarchical은 이 분산 구조에 가장 큰 부담을 주는 케이스. 중장기로는
+  `topologies.py`가 PE-level neighbor list를 직접 반환하고 mapper는 단순히
+  "어느 PE가 참여하느냐"만 결정, `_build_pe_installs`는 flat
+  mapping으로 단순화되는 방향이 자연스러움. **본 ADR에서 Option A를 채택**하면
+  이 방향으로 이미 정합.
+
+---
+
+## Test strategy
+
+### T1. Topology generator
+
+`tests/test_hierarchical_topology.py` (new):
+- `hierarchical_3level(rank, world_size, spec)` → 각 level의 neighbor set이
+  예상 구조인지 (intra-cube는 ring, inter-cube는 cube-leader만 참여, inter-SIP은
+  SIP-leader만 참여)
+- 2 SIP × 4 cubes × 4 PEs 같은 작은 토폴로지로 수작업 검증 가능
+- Symmetry: 어떤 PE의 E neighbor는 자신의 W neighbor로 그 PE를 역포인팅
+
+### T2. Install plan — hierarchical × all_pes
+
+`tests/test_ccl_install_plan.py` (확장):
+- `build_install_plans(algorithm="hierarchical_allreduce", mapper="all_pes",
+  validator="multi_pe_sip_local")` 호출 시
+  - 각 SIP의 모든 PE가 `participating_pes`에 포함
+  - PE 0 (cube leader)만 inter-cube neighbor를 가짐
+  - (cube 0, pe 0) (SIP leader)만 inter-SIP neighbor를 가짐
+  - Non-leader PE는 intra-cube neighbor만
+
+### T3. Kernel unit — mock runtime
+
+`tests/test_hierarchical_mock_runtime.py` (new):
+- `run_kernel_in_mock` (kernbench.ccl.testing)을 확장해 multi-level 지원
+- 2 SIP × 2 cubes × 4 PEs (총 16 PE) 토폴로지에서 각 PE의 초기 tile을
+  (global PE index)+1로 채우고 hierarchical all-reduce 실행 (rank = SIP
+  모델이므로 rank가 아닌 PE 단위 index 사용)
+- 모든 PE의 최종 결과가 `sum(1..16)` = 136인지
+
+### T4. E2E — 실제 SimPy backend
+
+`tests/test_ccl_allreduce_matrix.py` (확장):
+- `hierarchical @ ws=SIP_count`: multi_pe_sip_local layout + 3-level 알고리즘
+  전체 stack 통과 검증
+
+### T5. Validator enforcement
+
+- `multi_pe_sip_local` validator가 wrong layout (예: leader_only 스타일 1
+  shard per rank) 입력에 raise
+
+### T6. 회귀
+
+기존 ring/mesh/tree 알고리즘 모두 그대로 통과. 본 ADR은 그들을 건드리지 않음.
+
+---
+
+## Consequences
+
+### Positive
+
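+- **Intra-SIP PE 활용도 증가**: Level 1 (intra-cube)에는 SIP 내 모든 PE가,
+  Level 2 (inter-cube)에는 모든 cube leader가 reduce에 참여하여 leader-only
+  대비 전체 PE 가동률 향상. (Level 간 실행은 순차적 — inter-SIP 통신과의
+  overlap은 Non-goals의 pipelined hierarchical에 해당.)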
+- **Multi-level bandwidth 활용**: cube NoC, UCIe 모두 작동 → 더 정확한 HW 모델.
+- **ADR-0024 framework 검증**: `all_pes` mapper + `multi_pe_sip_local`
+  validator의 첫 non-trivial use case. Framework 설계 타당성 확인.
+- **기존 커널 재활용**: `hierarchical_allreduce.py` 큰 구조 유지, SIP-local
+  좌표만 재해석.
+
+### Negative
+
+- **`topologies.py` schema 확장 필요**: Single-level vs multi-level 표현.
+  해결안(Option A)은 기존 ring/mesh/tree의 마이그레이션 비용 유발.
+- **Validator / mapper 조합 요구**: 사용자가 DPPolicy를
+  `multi_pe_sip_local`에 맞춰 선택해야 함 (bench 설정 복잡도 증가).
+
+### Neutral
+
+- 본 ADR 구현 전까지 `hierarchical_allreduce.py`는 deprecated 상태 유지 또는
+  ADR-0024 matrix test에서 제외. 현재 파일을 곧바로 삭제하지는 않음.
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `src/kernbench/ccl/topologies.py` | D3: `hierarchical_3level` topology 함수 추가. (Option A 채택 시) 기존 topology 출력 format 통일 |
+| `src/kernbench/ccl/install_plan.py` | D4: hierarchical builder 분기 (또는 단일 builder가 level 개수로 dispatch) |
+| `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` | D5: SIP-local 좌표로 kernel 재작성, `kernel_args` keyword-only signature |
+| `ccl.yaml` | D2: `hierarchical_allreduce` 엔트리 추가 (`mapper: all_pes`, `validator: multi_pe_sip_local`, `topology: hierarchical_3level`) |
+| `tests/test_hierarchical_topology.py` (new) | T1 |
+| `tests/test_ccl_install_plan.py` | T2 확장 |
+| `tests/test_hierarchical_mock_runtime.py` (new) | T3 |
+| `tests/test_ccl_allreduce_matrix.py` | T4: hierarchical row 추가 |
@@ -0,0 +1,347 @@
+# ADR-0030: IPCQ Physical Addressing — PhysAddr integration
+
+## Status
+
+Proposed (Blocked on ADR-0031 — PhysAddr PE-resource extension)
+
+## Context
+
+### 목표
+
+IPCQ ring buffer의 주소 체계를 ADR-0023의 **synthetic parallel namespace**
+(`_IPCQ_BASE = 1<<60`)에서 **ADR-0001의 PhysAddr**로 이관한다. Routing /
+allocator / MemoryStore의 정합성을 회복하고, buffer_kind (tcm/hbm/sram)별
+physical backing을 구조적 좌표로 표현한다.
+
+### 현재 상태 (ADR-0023 D2.5)
+
+`src/kernbench/ccl/install.py:52-56`:
+
+```python
+_IPCQ_BASE = 1 << 60
+def _ipcq_base_for_pe(sip, cube, pe):
+    return _IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
+
+def rx_base(s, c, p, d):
+    return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
+```
+
+- **bit 60** 사용 → ADR-0001의 51-bit PhysAddr 공간 밖 (`MAX_51 = (1 << 51) - 1`)
+- `PhysAddr.decode(addr)` → `PhysAddrError("addr must be a 51-bit value")`
+- `IpcqEndpoint.rx_base_pa: int` — 타입이 raw int, 구조 없음
+- `buffer_kind` (tcm/hbm/sram)와 synthetic 주소의 관계가 coupling 없음
+- Allocator (`PEMemAllocator`) 우회 — synthetic unique id per (sip, cube, pe,
+  direction). 진짜 physical allocation이 아님
+
+ADR-0023 D2.5 원문:
+
+> This bypasses the topology's address resolver / PhysAddr encoding and
+> treats IPCQ buffers as a separate, parallel address namespace. Real PA
+> encoding can be plugged in later without changing the rest of the design.
+
+"later"가 이 ADR.
+
+### 왜 지금 다루는가
+
+- ADR-0025 (direction addressing)은 주소-기반 매칭으로 전환. 주소가 correctness에
+  직접 기여 → 주소 체계가 설계 관점에서 더 중요해짐
+- ADR-0001의 "Routing consumes decoded domains, not raw bit-fields" 계약 위반
+  지속 → 기술 부채
+- Routing fabric (cube_noc / UCIe)은 PhysAddr.decode()로 destination을 정함.
+  IPCQ의 synthetic 주소가 fabric routing에서 실제로 어떻게 처리되는지 **검증되지
+  않음** (별도 경로로 배달되는 것으로 추정)
+- TCM / HBM / SRAM의 실제 memory layout과 IPCQ ring buffer 위치가 **disjoint**
+  → allocator가 IPCQ 영역을 모르므로 실수로 겹칠 가능성 (현재는 bit 60로 완전
+  분리되어 문제 없지만 설계 원칙상 건강하지 않음)
+
+### 풀어야 할 문제
+
+1. **IPCQ ring buffer의 PhysAddr 표현**: buffer_kind별로 어떤 PhysAddr factory를
+   쓸지.
+2. **PhysAddr 공간 부족 가능성**: 51-bit 공간에 IPCQ 버퍼를 담을 여유가 있는지.
+3. **Allocator 통합**: `PEMemAllocator`에 IPCQ buffer 영역 예약 기능 추가, 또는
+   기존 pool에서 정상 allocation.
+4. **MemoryStore space naming 정리**: 현재는 `{"tcm", "hbm", "sram"}` 문자열로
+   space 구분. IPCQ buffer도 이 space에 속하면 일반 data와 주소 겹침 방지 필요.
+5. **Routing fabric 통합**: PhysAddr 기반 routing이 IPCQ 토큰을 올바른 SIP의
+   올바른 메모리로 배달.
+6. **ADR-0025와의 정합**: 주소-기반 매칭이 PhysAddr에서도 동일하게 작동.
+
+---
+
+## Decision
+
+### D1. IPCQ ring buffer = PhysAddr factory 사용
+
+각 `buffer_kind`가 해당하는 PhysAddr factory를 호출:
+
+| buffer_kind | PhysAddr factory | 필요한 인자 |
+|---|---|---|
+| `tcm` | `PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)` | PE-local TCM |
+| `hbm` | `PhysAddr.pe_hbm_addr(rack_id, sip_id, cube_id, pe_id, pe_local_hbm_offset, slice_size_bytes)` | PE-local HBM slice |
+| `sram` | `PhysAddr.cube_sram_addr(rack_id, sip_id, cube_id, sram_offset)` | Cube-shared SRAM |
+
+Install plan builder (`build_install_plans` in ADR-0024)가 각 PE의 rx_base를
+계산할 때:
+
+```python
+# ADR-0030 후 install_plan.py (pseudocode)
+def _compute_rx_base(sip, cube, pe, direction_idx, buffer_kind, n_slots, slot_size,
+                     allocator_pool, rack_id=0) -> PhysAddr:
+    bytes_per_direction = n_slots * slot_size
+    offset = direction_idx * bytes_per_direction
+
+    if buffer_kind == "tcm":
+        # TCM base (per-PE) + direction offset
+        tcm_base = allocator_pool.reserve_pe_tcm_for_ipcq(sip, cube, pe,
+                                                          total_bytes=N_DIR * bytes_per_direction)
+        return PhysAddr.pe_tcm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
+                                      pe_id=pe, tcm_offset=tcm_base + offset)
+    elif buffer_kind == "hbm":
+        hbm_base = allocator_pool.reserve_pe_hbm_for_ipcq(sip, cube, pe,
+                                                          total_bytes=...)
+        return PhysAddr.pe_hbm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
+                                      pe_id=pe, pe_local_hbm_offset=hbm_base + offset,
+                                      slice_size_bytes=slice_size)
+    elif buffer_kind == "sram":
+        sram_base = allocator_pool.reserve_cube_sram_for_ipcq(sip, cube,
+                                                               total_bytes=...)
+        return PhysAddr.cube_sram_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
+                                         sram_offset=sram_base + offset)
+```
+
+`IpcqEndpoint.rx_base_pa`는 `PhysAddr.encode()` 결과 (encoded `int`)를 담도록 변경:
+
+```python
+@dataclass(frozen=True)
+class IpcqEndpoint:
+    sip: int
+    cube: int
+    pe: int
+    buffer_kind: str
+    rx_base_pa: int            # PhysAddr.encode() 결과 (51-bit)
+    rx_base_va: int
+    n_slots: int
+    slot_size: int
+```
+
+타입은 int 유지 (encoded form), 단 **반드시 PhysAddr.decode()로 복원 가능**한
+값임을 invariant으로 둔다. 디코더 호출자는 `PhysAddr.decode(rx_base_pa)`로
+구조적 좌표 획득.
+
+### D2. Allocator 확장 — IPCQ 예약 API
+
+`PEMemAllocator`에 IPCQ 전용 예약 기능 추가:
+
+```python
+class PEMemAllocator:
+    def reserve_ipcq_tcm(self, total_bytes: int) -> int:
+        """Reserve TCM region for IPCQ ring buffers at this PE.
+        Returns tcm_offset (to be used in PhysAddr.pe_tcm_addr)."""
+        # TCM에서 `total_bytes` 연속 영역 예약.
+        # Tensor allocation과 겹치지 않도록.
+
+    def reserve_ipcq_hbm(self, total_bytes: int) -> int: ...
+    # cube-level allocator도 유사
+```
+
+Install plan 빌더가 각 PE allocator에서 예약. 예약 결과(offset)를 PhysAddr
+factory에 전달.
+
+**기존 `_ipcq_base_for_pe` / `_IPCQ_BASE` 제거**.
+
+### D3. MemoryStore space 통합
+
+현재 `MemoryStore`는 `{space_name: {addr: ndarray}}` 구조. IPCQ buffer는 일반
+tensor 데이터와 같은 space (tcm/hbm/sram)를 공유하게 됨. 주소 유일성은 ADR-0001의
+PhysAddr 계층 보장.
+
+Backward compatibility: 기존 IPCQ address (synthetic)을 쓰는 code path는
+**제거**하고, 모두 PhysAddr.encode() 결과만 사용. 이 자체는 API 변경이 아니라
+값 변경.
+
+### D4. Routing fabric 통합
+
+IPCQ DMA write (`IpcqDmaToken`의 `src_addr → dst_addr`)이 PhysAddr encoding을
+사용하므로 **routing fabric이 `PhysAddr.decode(dst_addr)`로 destination
+SIP/cube/PE를 정확히 찾을 수 있음**. Fabric routing 로직 변경 없음 (기존에도
+PhysAddr.decode를 쓰는 것으로 추정).
+
+**검증 필요**: 현재 fabric이 bit 60 synthetic 주소를 어떻게 라우팅하는지 확인.
+별도 경로가 있다면 제거, PhysAddr 경로로 통합.
+
+### D5. ADR-0025와의 정합
+
+ADR-0025의 주소-기반 매칭 (dst_addr로 direction 식별)은 PhysAddr.encode()
+결과를 비교하는 것으로 자연스럽게 호환. 변경 없음.
+
+다만 debug / diagnostic 향상 가능:
+
+```python
+# pointer_dump 등에서
+print(f"E: rx_base_pa={PhysAddr.decode(qp.peer.rx_base_pa)}")
+# 출력 예: PhysAddr(sip=1, cube=0, pe=0, kind="pe_resource", unit_type=PE, ...)
+```
+
+이전 synthetic 주소는 decode 불가 → diagnostic 질 저하. PhysAddr 전환으로 개선.
+
+### D6. ADR-0023 D2.5 amendment
+
+ADR-0023의 "bypasses PhysAddr encoding" 문구를 **Accepted fallback → now
+replaced by ADR-0030**으로 수정. 본 ADR이 적용되면 ADR-0023 D2.5의 "Real PA
+encoding can be plugged in later" 약속이 이행된 것.
+
+---
+
+## Migration strategy
+
+단계적 전환 (한 PR로 하지 않는다):
+
+### Phase 1: PhysAddr 공간 재검토
+- 51-bit PhysAddr 공간에 IPCQ ring buffer가 실제로 들어갈 수 있는지 확인.
+- 각 buffer_kind (tcm/hbm/sram)별 factory가 제공하는 `local_offset` 범위가
+  IPCQ 요구 (4 direction × n_slots × slot_size)를 수용 가능한지.
+- 부족하면 PhysAddr layout 자체 확장 (ADR-0001 amendment 별도 필요).
+
+### Phase 2: Allocator API 확장
+- `PEMemAllocator.reserve_ipcq_*` 메소드 추가.
+- 기존 tensor allocation과 영역 충돌 방지.
+
+### Phase 3: Install plan builder 전환
+- `_ipcq_base_for_pe` 제거, PhysAddr factory 호출로 대체.
+- `IpcqEndpoint.rx_base_pa`가 PhysAddr.encode() 결과 (51-bit).
+
+### Phase 4: Routing fabric 검증
+- IPCQ DMA token이 fabric 정상 경로로 배달되는지 확인.
+- 별도 fast-path가 있다면 제거, 통합.
+
+### Phase 5: MemoryStore space 검증
+- IPCQ buffer 주소가 기존 tensor 주소와 겹치지 않는지.
+- Allocator 레벨에서 이미 예약했으므로 정상적으로 분리되어야 함.
+
+### Phase 6: ADR-0023 D2.5 업데이트 + 기존 sideband path 제거 (마이그레이션 마무리)
+
+---
+
+## Dependencies
+
+- **ADR-0031** (PhysAddr PE-resource extension) — **Blocker**: PhysAddr가 PE
+  resource (특히 IPCQ ring buffer)를 충분히 표현할 수 있도록 schema 확장이
+  선행되어야 함. 본 ADR은 ADR-0031 완료 후에만 실행 가능.
+- **ADR-0001** (PhysAddr layout): 본 ADR의 기반. 51-bit 공간 / factory API의
+  ADR-0031 확장본을 사용.
+- **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023 D2.5의 "later" 약속 이행.
+  D9 piggyback / credit return 프로토콜 자체는 불변.
+- **ADR-0024** (launcher + install_plan.py): `build_install_plans`가 PhysAddr
+  factory를 호출하게 됨.
+- **ADR-0025** (direction addressing): 주소-기반 매칭이 PhysAddr에서도 동일하게
+  작동. 변경 없음.
+
+---
+
+## Non-goals
+
+- **ADR-0001 PhysAddr layout 자체 변경**: 51-bit 공간과 segment 구조는 유지.
+  부족 시 별도 ADR.
+- **IPCQ protocol semantic 변경**: ADR-0023 D9 piggyback 등 프로토콜 로직 유지.
+- **Allocator 전반 재설계**: IPCQ 예약 API 추가만.
+
+---
+
+## Open questions
+
+### 🔴 Critical — Migration 전 반드시 검증
+
+- **PhysAddr 51-bit 공간에 IPCQ 버퍼가 실제로 들어가는가**: 각 PE의 TCM
+  영역에서 `4 direction × n_slots (default 4) × slot_size (default 4KB)` =
+  64KB가 PE TCM 공간에 수용 가능. TCM size (e.g., 16MB) 대비 충분. HBM도 여유
+  많음. SRAM은 cube 공유라 direction × PE 곱이 있음 — 별도 검증 필요.
+- **Routing fabric의 현재 IPCQ 주소 처리**: 현재 synthetic 주소가 fabric에서
+  어떻게 routing되는지 trace 필요. `PhysAddr.decode()`로 판독 불가한 값이
+  fabric에서 정상 배달된다면 어떤 경로를 쓰는지 조사.
+
+### 🟡 Nice-to-have
+
+- **IPCQ 전용 kind / sub_offset 인코딩**: `UnitType.PE`의 sub_offset 공간을
+  IPCQ와 공유. 충돌 방지를 위해 IPCQ 전용 sub-space 정의할지 여부.
+- **Debug tool**: `pointer_dump`를 PhysAddr 포매팅으로 개선.
+
+---
+
+## Test strategy
+
+### T1. PhysAddr round-trip
+
+`tests/test_ipcq_physaddr.py` (new):
+- `PhysAddr.pe_tcm_addr(...)` → encode → decode → 동일 필드 복원
+- TCM / HBM / SRAM 각 factory에 대해
+
+### T2. Allocator 예약
+
+`tests/test_ipcq_alloc.py` (new):
+- `PEMemAllocator.reserve_ipcq_tcm` → 반환된 offset이 valid TCM 영역
+- 중복 예약 → 에러 또는 non-overlapping offset
+- Tensor allocation과 충돌 없음
+
+### T3. Install plan PhysAddr integration
+
+`tests/test_ccl_install_plan.py` (확장):
+- `build_install_plans` 결과의 `rx_base_pa`가 PhysAddr.decode() 가능
+- Decoded 좌표가 plan의 (sip, cube, pe)와 일치
+- I3.1 invariant (ADR-0025 D6) — rx_base range disjointness가 PhysAddr에서도 성립
+
+### T4. Routing — IPCQ DMA fabric traversal
+
+`tests/test_ipcq_routing.py` (new):
+- Cross-SIP IPCQ send → fabric이 `PhysAddr.decode(dst_addr)`로 destination SIP
+  정확히 판단 → 올바른 MemoryStore에 write
+- UCIe 경로 / cube_noc 경로 모두 검증
+
+### T5. 회귀
+
+- 기존 IPCQ E2E 테스트 (ring, mesh, tree) 모두 통과
+- ADR-0024, ADR-0025 통합 테스트 통과
+
+---
+
+## Consequences
+
+### Positive
+
+- **ADR-0001 정합성 회복**: routing과 addressing이 단일 체계.
+- **buffer_kind 명확**: TCM/HBM/SRAM이 구조적 좌표로 구분.
+- **Debug 향상**: PhysAddr.decode()로 사람이 읽을 수 있는 좌표.
+- **Allocator 통합**: IPCQ 영역이 정상 예약 → tensor와의 충돌 리스크 사전 차단.
+- **Fabric routing 일원화**: 별도 경로 없이 기존 PhysAddr-based routing 재활용.
+
+### Negative
+
+- **Migration 복잡도**: 6 Phase 단계적 전환 필요. 각 Phase마다 regression 리스크.
+- **PhysAddr 공간 검증 부담**: Phase 1에서 TCM/HBM/SRAM 공간이 IPCQ 요구를
+  수용하는지 실측 필요.
+- **Routing fabric 검증**: 현재 fabric이 synthetic 주소를 어떻게 처리하는지
+  조사 필요.
+
+### Neutral
+
+- IPCQ protocol semantic (ADR-0023 D9 등) 불변.
+- ADR-0025의 direction addressing 로직 불변.
+
+---
+
+## Affected files
+
+| File | Change |
+|------|--------|
+| `src/kernbench/ccl/install.py` | `_IPCQ_BASE`, `_ipcq_base_for_pe` 제거 |
+| `src/kernbench/ccl/install_plan.py` (ADR-0024) | D1: PhysAddr factory 호출로 rx_base 계산 |
+| `src/kernbench/policy/address/allocator.py` (or similar) | D2: IPCQ 예약 API (`reserve_ipcq_tcm` 등) |
+| `src/kernbench/common/ipcq_types.py` | D1: `IpcqEndpoint.rx_base_pa` 문서화 — PhysAddr.encode 결과 |
+| `src/kernbench/sim_engine/memory_store.py` | D3: IPCQ buffer가 기존 space와 공유되는지 검증 |
+| `src/kernbench/sim_engine/engine.py` | D4: IPCQ token routing이 PhysAddr-based fabric 경로 사용 |
+| `src/kernbench/ccl/diagnostics.py` | D5: pointer_dump를 PhysAddr 포매팅으로 개선 |
+| `docs/adr/ADR-0023-ipcq-pe-collective.md` | D6: D2.5 amendment note |
+| `tests/test_ipcq_physaddr.py` (new) | T1 |
+| `tests/test_ipcq_alloc.py` (new) | T2 |
+| `tests/test_ccl_install_plan.py` | T3 확장 |
+| `tests/test_ipcq_routing.py` (new) | T4 |
@@ -0,0 +1,257 @@
+# ADR-0031: PhysAddr PE-Resource Extension
+
+## Status
+
+Stub (Blocker for ADR-0030 — specific range allocations TBD)
+
+## Context
+
+### 목표
+
+ADR-0001의 `PhysAddr` schema를 **PE 내부의 다양한 resource**를 체계적으로
+표현할 수 있도록 확장한다. ADR-0030 (IPCQ PhysAddr integration) 및 향후의
+PE-local resource 추가 (scratchpad, register file, status register, 등)의
+기반을 제공한다.
+
+### 현재 상태 (ADR-0001)
+
+51-bit PhysAddr layout:
+
+```
+[50:47] rack_id  (4)
+[46:43] sip_id   (4)
+[42:38] sip_seg  (5)   # cube_id
+[37:0]  local_offset (38)
+```
+
+`local_offset` (38 bits) 내부:
+
+- `[37]` selector: 1 = HBM window (128GB), 0 = PE resource window
+- PE resource window는 `unit_type` (3 bits: PE | MCPU | SRAM) +
+  `pe_id` (4 bits) + `ext` (1 bit) + `sub_offset` (29 bits)
+
+Factory API:
+- `PhysAddr.hbm_addr(...)` — HBM generic
+- `PhysAddr.pe_hbm_addr(...)` — PE-local HBM slice
+- `PhysAddr.pe_tcm_addr(...)` — PE TCM (via `UnitType.PE` + `sub_offset`)
+- `PhysAddr.cube_sram_addr(...)` — Cube-shared SRAM
+
+### 풀어야 할 문제
+
+1. **PE 내부 resource 구분의 명시적 체계 부재**: 현재 `local_offset` (38 bits)
+   이 평면 공간으로 취급되고, PE TCM / IPCQ ring / scratchpad / 향후 register
+   file 등이 관습적 offset 범위로만 구분됨. Schema 레벨에서 명확하지 않음.
+2. **IPCQ 주소의 PhysAddr 표현 부재**: ADR-0030이 IPCQ ring buffer를 PhysAddr로
+   표현하려면 "이 주소가 IPCQ 영역"을 decode 가능해야 함. 현재는 불가.
+3. **향후 PE resource 확장 경로**: register file, performance counter 등
+   추가 시 일관된 위치 할당 규칙 필요.
+
+### 설계 방향 — local_offset을 PE 컴포넌트별 range로 분할
+
+`local_offset` (38 bits = 256GB per cube segment)을 **PE 컴포넌트마다 고정
+range**로 나누어 할당한다. 각 range는 해당 컴포넌트 전용 주소 공간이며,
+`PhysAddr.decode()`가 주소가 어느 range에 속하는지 판별해 해당하는 `kind` /
+`unit_type` / `sub_type` 필드를 채운다.
+
+개념적 구조 (구체적 bit 할당은 **TBD**):
+
+```
+local_offset [37:0]  (38 bits total)
+├── HBM window           [37] = 1    (기존 128GB)
+├── PE component ranges  [37] = 0
+│   ├── TCM              [range_1]
+│   ├── IPCQ rings       [range_2]
+│   ├── Scratchpad       [range_3]
+│   ├── Register file    [range_4]
+│   ├── (reserved)       ...
+│   └── Sideband / status [range_N]
+```
+
+### 왜 range-based partition인가
+
+- **Schema-level 명시성**: 주소 하나 보고 어느 컴포넌트의 자원인지 decode 가능.
+  "Routing consumes decoded domains" (ADR-0001 D5) 계약 충족.
+- **Unit type enum 확장보다 유연**: 3-bit `UnitType` 공간을 고갈시키지 않고
+  세분화 가능. 미래 추가 컴포넌트도 빈 range 할당.
+- **Allocator 통합 자연**: 각 PE-level allocator가 관리하는 하위 pool을
+  address range와 1:1 매칭 (e.g., `reserve_ipcq_tcm()` → IPCQ range 안에서만
+  할당).
+- **Decode routing 단순**: `PhysAddr.decode(addr)`가 range table을 참조해
+  `kind` + sub-field를 채움. 기존 HBM selector bit 패턴의 일반화.
+
+### 왜 지금 다루는가
+
+- ADR-0030 (IPCQ PhysAddr 통합)이 이 확장에 **의존**. ADR-0030 단독 진행 시
+  `sub_offset` 공간을 불투명하게 재사용하게 되어 ADR-0001 계약 미충족.
+- PE 내부 자원이 더 추가될 가능성 — 지금 구조를 정리해두면 일관된 확장 경로 확보.
+
+---
+
+## Decision (pending specific range allocation)
+
+### D1. Range-based local_offset partition — approach
+
+`local_offset`을 고정 byte range로 분할하고, 각 range를 PE 컴포넌트에 할당한다.
+주소의 어느 range에 속하는가로 `kind` / component type을 결정.
+
+```python
+# src/kernbench/policy/address/phyaddr.py (conceptual, post-extension)
+@dataclass(frozen=True)
+class PeResourceRange:
+    name: str                # e.g. "tcm", "ipcq", "scratchpad", "regfile"
+    start_offset: int        # local_offset 내 시작
+    end_offset: int          # exclusive
+    byte_size: int           # end - start
+
+PE_RESOURCE_MAP: tuple[PeResourceRange, ...] = (
+    # TBD — 구체적 range 할당은 사용자가 별도 업데이트
+)
+```
+
+`PhysAddr.decode(addr)`의 PE resource 경로는:
+
+```python
+def decode_pe_resource(local_offset: int) -> dict:
+    for r in PE_RESOURCE_MAP:
+        if r.start_offset <= local_offset < r.end_offset:
+            return {
+                "kind": "pe_resource",
+                "component": r.name,                 # NEW: "tcm"/"ipcq"/...
+                "component_offset": local_offset - r.start_offset,  # within range
+            }
+    raise PhysAddrError(f"local_offset {local_offset} not in any PE range")
+```
+
+### D2. Specific range allocations — **TBD**
+
+> 사용자가 구체적 byte 할당을 별도로 정의한 뒤 본 ADR에 업데이트.
+>
+> 필요 정보:
+> - 각 컴포넌트 (TCM, IPCQ, scratchpad, regfile, ...)의 이름 / byte size
+> - `local_offset` 내 시작 offset (align 고려)
+> - 현재 하드웨어 사양 / 시뮬레이션 요구 반영
+
+이 섹션이 채워진 뒤 ADR status: **Stub → Proposed → Accepted** 승격.
+
+### D3. Factory API — per-component 함수
+
+기존 `PhysAddr.pe_tcm_addr(...)` 패턴을 일반화:
+
+```python
+# 기존 (이미 존재)
+PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)
+
+# 신규 (ADR-0031 후 추가)
+PhysAddr.pe_ipcq_addr(rack_id, sip_id, cube_id, pe_id, ipcq_offset)
+PhysAddr.pe_scratchpad_addr(...)
+PhysAddr.pe_regfile_addr(...)
+# ...
+```
+
+각 factory는 해당 컴포넌트의 range 내에서 `component_offset`만 받아 최종
+PhysAddr encoding. 호출자는 어느 range인지 몰라도 됨.
+
+### D4. Backward compatibility
+
+- 기존 `pe_tcm_addr()` signature / semantic 유지.
+- 내부 인코딩만 신규 range table을 참조하도록 변경.
+- 기존 `UnitType.PE` decoding 경로는 `PE_RESOURCE_MAP`의 "tcm" range에
+  대응하도록 매핑 → 기존 코드에는 transparent.
+- 기존 코드가 `PhysAddr.decode(addr).unit_type == UnitType.PE`를 체크하는
+  경우는 여전히 유효 (TCM 주소는 계속 PE unit_type).
+
+---
+
+## Open questions
+
+### 🔴 Pending user input (ADR 승격 blocker)
+
+- **D2의 specific range allocation**: 사용자가 구체적 byte 할당 테이블을
+  제공해야 Stub → Proposed 승격 가능. 필요 정보:
+  - 컴포넌트 목록 (TCM, IPCQ, scratchpad, regfile 등)
+  - 각 컴포넌트의 byte size / 시작 offset
+  - Alignment 요구사항 (4KB / page-aligned 등)
+
+### 🟡 설계 세부 — range allocation 결정 과정에서 함께 결정
+
+- **총 local_offset space 배분**: HBM window (bit 37 = 1, 128GB)을 유지할지,
+  아니면 PE resource space를 확장하기 위해 HBM window 축소할지.
+- **Range padding / reserved space**: 미래 컴포넌트 추가를 위한 "reserved"
+  range 몇 개를 미리 확보할지.
+- **Address alignment**: 각 range의 시작 offset이 특정 alignment (page /
+  cache line) 만족해야 하는지.
+- **Diagnostic / debug 포맷**: `PhysAddr.decode()` 출력에서 component 이름 +
+  component_offset을 사람이 읽기 좋게 표시 (e.g., "IPCQ ring sip=0 cube=0 pe=3
+  offset=0x1234").
+- **기존 `UnitType` enum의 role**: Range-based 접근 후에도 `unit_type` 필드
+  유지할지 (decode 결과에 `component` 추가), 또는 enum 대체할지.
+
+### 🟢 ADR-0030 연동 질문
+
+- **IPCQ range 내 direction/slot 표현**: PhysAddr는 `component_offset` 단위
+  까지만 표현. "direction=E, slot=2"는 IPCQ range 내 offset 계산으로 도출
+  (`direction_idx * slot_region_size + slot_idx * slot_size`) — 이 공식은
+  ADR-0030 scope에서 구체화.
+- **Allocator pool 구조**: `PEMemAllocator`가 여러 range (TCM, IPCQ,
+  scratchpad)를 개별 pool로 관리할지, 단일 pool에서 kind별 reserved만 관리
+  할지. Range-based schema면 개별 pool이 자연스러움.
+
+---
+
+## Non-goals (this ADR)
+
+- **51-bit 전체 layout 재작성**: 본 ADR은 `local_offset` (38 bits) 내부의
+  subdivision만 다룬다. Rack / SIP / cube segment 같은 상위 bit 구조는
+  불변.
+- **`UnitType` enum 재설계**: range-based 접근으로 대체 가능하지만, 기존 enum
+  (PE / MCPU / SRAM)은 backward compat 위해 유지.
+- **Dynamic range allocation**: runtime에 range 크기 바꾸는 기능 불필요. 모든
+  range는 컴파일 / 설정 시점에 고정.
+- **Multi-process / multi-rack partitioning**: PE 내부 resource만 다룸.
+
+---
+
+## Action
+
+### Phase 1 — User 입력: specific range allocation (**Blocker**)
+- 사용자가 정의한 PE 컴포넌트별 byte range를 D2에 기입:
+  - `PE_RESOURCE_MAP` 테이블 내용 (name, start_offset, byte_size per 컴포넌트)
+  - 각 컴포넌트의 hardware spec 근거 note
+
+### Phase 2 — ADR Stub → Proposed 승격
+- D2 채워지면 status 변경.
+- Open questions의 "🔴 Pending user input" 블록 제거.
+- ADR-0001에 amendment note 초안 작성.
+
+### Phase 3 — 구현
+- `PhysAddr` range-based decode 구현.
+- 신규 factory 함수 (`pe_ipcq_addr`, `pe_scratchpad_addr` 등 컴포넌트별)
+  추가.
+- 기존 `pe_tcm_addr` 내부 인코딩만 신규 range table 참조하도록 수정
+  (signature 불변).
+- 기존 코드 경로 회귀 확인.
+
+### Phase 4 — ADR-0030 unblock
+- ADR-0030 "Blocked" 상태 해제.
+- Install_plan builder가 `pe_ipcq_addr(...)` 등 확장된 factory 호출하도록
+  수정.
+
+---
+
+## Dependencies
+
+- **ADR-0001** (PhysAddr layout): 본 ADR은 ADR-0001의 확장.
+- **ADR-0023** (IPCQ protocol): IPCQ ring buffer의 주소 체계를 PhysAddr로
+  통합할 수 있게 하는 기반.
+- **ADR-0030** (IPCQ PhysAddr integration): 본 ADR에 blocked.
+
+---
+
+## Affected files (future, after promotion to Proposed)
+
+| File | Change |
+|------|--------|
+| `src/kernbench/policy/address/phyaddr.py` | Range table (`PE_RESOURCE_MAP`), range-based decode, 신규 component-specific factory들 (`pe_ipcq_addr` 등), 기존 `pe_tcm_addr` 내부 인코딩 갱신 |
+| `src/kernbench/policy/address/allocator.py` | Range-aware pool 분리 (TCM pool / IPCQ pool / scratchpad pool 등 per-PE) |
+| `docs/adr/ADR-0001-physaddr-layout.md` | Amendment note: range-based PE resource partition |
+| `tests/test_phyaddr.py` | Range table 검증, 각 factory의 encode/decode round-trip, 기존 `pe_tcm_addr` 회귀 |
@@ -0,0 +1,592 @@
+# CCL Algorithm Author Guide (English)
+
+This document is a step-by-step guide for engineers writing CCL
+(Collective Communication Library) algorithms in kernbench. The
+internal system design and component structure live in
+[ADR-0023](adr/ADR-0023-ipcq-pe-collective.md).
+
+The goal here is to clearly separate **what an algorithm author has to
+touch** from **what they can leave alone**, and to get a first
+algorithm running through the shortest possible path.
+
+---
+
+## 0. Five-minute tour
+
+| Things you touch | Location |
+|------------------|----------|
+| Algorithm module (kernel + optional `neighbors()`) | `src/kernbench/ccl/algorithms/<algo>.py` |
+| Algorithm registration | `ccl.yaml` |
+| Host bench (rank count, init, launch, verify) | `benches/<your_bench>.py` |
+| (Optional) unit test | `tests/test_<algo>.py` |
+
+| Things you do NOT touch | Location |
+|--------------------------|----------|
+| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
+| Framework (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
+| PE_IPCQ / PE_DMA components | `src/kernbench/components/builtin/` |
+| Backend implementation (`install_ipcq`) | `src/kernbench/runtime_api/distributed.py` and `src/kernbench/ccl/install.py` |
+
+Workflow:
+1. Write a `kernel` function in the algorithm module.
+2. Register an entry in `ccl.yaml`.
+3. Write a host bench using `torch.distributed.init_process_group` /
+   `torch.distributed.all_reduce` (the unified `benches/ccl_allreduce.py`
+   handles the common case).
+4. (Optional) Run the mock runtime for fast unit tests (a few ms).
+5. `kernbench run --bench <name> --verify-data` for full SimPy verification.
+
+---
+
+## 1. Hello World — the simplest send/recv
+
+Each PE sends its tile to its E neighbor once and receives a tile from
+its W neighbor once. The reference code lives in
+[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py).
+
+### Step 1: write the kernel
+
+New file `src/kernbench/ccl/algorithms/hello_send.py`:
+
+```python
+"""Hello world: send your tile to the next rank, receive from the previous one."""
+
+
+def kernel(t_ptr, n_elem, tl):
+    # Global rank is computed from program_id(0/1) (ADR-0022).
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+
+    nbytes = n_elem * 2  # f16
+    pe_addr = t_ptr + rank * nbytes
+
+    # Load our slice and send it east.
+    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    tl.send(dir="E", src=src)
+
+    # Receive from west and store directly back into our slice.
+    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+    tl.store(pe_addr, recv)
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    """Positional kernel args used by the ahbm backend (after t_ptr)."""
+    return (n_elem,)
+```
+
+Key points:
+
+- **Global rank is computed from `program_id(axis=0)` and `program_id(axis=1)`**
+  (`rank = cube_id * pes_per_cube + local_pe`). TL has no
+  contractually-supported `tl.rank` / `tl.world_size`. If the kernel
+  needs `world_size` or any other algorithm parameter, the host passes
+  it as an ordinary `torch.launch` argument.
+- **`tl.send` takes a `TensorHandle`.** PE_IPCQ reads
+  `addr`/`space`/`shape`/`dtype`/`nbytes` from the handle to issue an
+  `IpcqDmaToken` to PE_DMA.
+- **`tl.recv` requires `shape` and `dtype`.** The returned TensorHandle
+  points at the IPCQ ring slot and can be used directly as a `dst`
+  handle (e.g. `tl.store(pe_addr, recv)`). Phase 2's `dma_write` replay
+  handles the (slot → hbm) copy, so user code never has to touch
+  `recv.data`.
+
+### Step 2: register in `ccl.yaml`
+
+```yaml
+algorithms:
+  hello_send:
+    module: kernbench.ccl.algorithms.hello_send
+    topology: ring_1d
+    buffer_kind: tcm
+    world_size: 8
+```
+
+`world_size` here is optional. If absent, `AhbmCCLBackend` derives it
+from the topology spec (`sips × cubes_per_sip × pes_per_cube`).
+
+### Step 3: write a host bench (optional — the unified bench may suffice)
+
+For most CCL benchmarks the existing `benches/ccl_allreduce.py` is
+sufficient: it reads `ccl.yaml`, picks the algorithm, sets up the
+process group, and runs the collective. If your algorithm needs custom
+host logic, write a new bench file along the same lines.
+
+The host code looks like a real PyTorch DDP worker:
+
+```python
+"""benches/ccl_hello.py"""
+from __future__ import annotations
+
+import numpy as np
+
+from kernbench.policy.placement.dp import DPPolicy
+
+
+N_ELEM = 8
+
+
+def worker(rank: int, world_size: int, torch) -> None:
+    """Per-rank business logic — mirrors a real PyTorch DDP worker."""
+    dp = DPPolicy(
+        cube="replicate", pe="column_wise",
+        num_cubes=1, num_pes=world_size,
+    )
+    tensor = torch.zeros(
+        (1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in",
+    )
+
+    # Per-rank initialization via the real PyTorch idiom.
+    init = np.zeros((1, world_size * N_ELEM), dtype=np.float16)
+    for r in range(world_size):
+        init[0, r * N_ELEM : (r + 1) * N_ELEM] = float(r + 1)
+    tensor.copy_(torch.from_numpy(init))
+
+    # The collective itself.
+    torch.distributed.all_reduce(tensor, op="sum")
+
+    # Verify on rank 0 (real PyTorch DDP idiom).
+    if rank == 0:
+        result = tensor.numpy()
+        for r in range(world_size):
+            expected = float(((r - 1) % world_size) + 1)
+            slice_r = result[0, r * N_ELEM : (r + 1) * N_ELEM]
+            print(
+                f"  rank {r}: got {float(slice_r.mean()):.1f}, "
+                f"expected {expected:.1f}"
+            )
+
+
+def run(torch) -> None:
+    """CLI entry point. Initializes dist, dispatches to worker."""
+    dist = torch.distributed
+    dist.init_process_group(backend="ahbm")
+    worker(
+        rank=dist.get_rank(),
+        world_size=dist.get_world_size(),
+        torch=torch,
+    )
+```
+
+### Step 4: unit test (optional but strongly recommended)
+
+`tests/test_hello_send.py`:
+
+```python
+import numpy as np
+
+from kernbench.ccl.algorithms.hello_send import kernel
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def test_hello_send_4_ranks():
+    n_elem = 8
+    inputs = [
+        np.full((n_elem,), float(r + 1), dtype=np.float16)
+        for r in range(4)
+    ]
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem,),
+    )
+    # rank r should now hold rank (r-1) % 4's data.
+    for r in range(4):
+        assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
+```
+
+`run_kernel_in_mock` runs every rank concurrently in pure Python (no
+SimPy), so a unit test like this finishes in **milliseconds**. It only
+verifies algorithmic correctness — no latency, no DMA, no fabric.
+
+### Step 5: SimPy validation
+
+```bash
+kernbench run --topology topology.yaml --bench ccl_hello --verify-data
+```
+
+Phase 1 runs the SimPy simulation + MemoryStore data movement, Phase 2
+replays the op_log for correctness. In the bench's `print` lines, the
+`got` value should match the `expected` value for every rank.
+
+---
+
+## 2. Ring all-reduce — the second algorithm
+
+Slightly more complex. Each PE runs `world_size - 1` rounds, sending
+its current tile east and accumulating the tile received from the west.
+After all rounds, every PE holds the global sum.
+
+The reference implementation lives in
+[`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py).
+The core flow:
+
+```python
+"""Ring all-reduce."""
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+    nbytes = n_elem * 2
+    pe_addr = t_ptr + rank * nbytes
+
+    # The handle points at HBM[pe_addr]. In greenlet mode .data is
+    # populated, but the kernel never has to touch .data directly.
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    current = acc  # source for the first send
+
+    for _step in range(world_size - 1):
+        tl.send(dir="E", src=current)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        # TensorHandle operator overload → MathCmd → PE_MATH dispatch.
+        # Phase 1 only models timing; Phase 2 DataExecutor replays the
+        # actual numpy accumulation.
+        acc = acc + recv
+        current = recv  # forward the received slot to the next round
+
+    # Store the final accumulator back to HBM. Source is acc (a PE-local
+    # scratch addr); dst is HBM. The op_log dma_write entry records both
+    # ends so Phase 2 copies the math result into HBM at verify time.
+    tl.store(pe_addr, acc)
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    return (n_elem, world_size)
+```
+
+Four key points:
+
+1. **Accumulation goes through TensorHandle operators.** `acc + recv`
+   emits a `MathCmd` and dispatches it through PE_MATH — i.e. the
+   real hardware path, so the latency model stays accurate. Per
+   ADR-0020 D3, Phase 1 only simulates timing; Phase 2's `DataExecutor`
+   replays the op_log and runs the actual numpy accumulation.
+2. **Use `current = recv` to forward.** Each round must update the send
+   source to the just-received slot handle so the same data circulates
+   exactly once around the ring. Setting `current = acc` would resend
+   the cumulative sum, inflating the result.
+3. **`tl.store(pe_addr, acc)` exactly once at the end.** Do not use a
+   store→reload pattern in the middle. `acc` lives in PE-local scratch;
+   the op_log records `(src=scratch, dst=hbm)` and Phase 2 first runs
+   math (filling scratch) then copies via the dma_write snapshot.
+4. **`world_size` is passed by the host explicitly.** TL only knows the
+   topology slot count (e.g. `num_programs(axis=0)` is "PEs per cube"),
+   not the participating CCL group size. The host bench knows
+   `world_size` and forwards it as an explicit kernel argument.
+
+For registration in `ccl.yaml` and wiring through the unified bench,
+look at the existing `ring_allreduce_tcm/_hbm/_sram` entries plus
+[`benches/ccl_allreduce.py`](../benches/ccl_allreduce.py). Mock unit
+tests live in
+[`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
+and follow the `kernel_args=(n_elem, world_size)` convention.
+
+---
+
+## 3. `neighbors()` override — custom topology
+
+Most algorithms are happy with the builtin topologies (`ring_1d`,
+`mesh_2d`, `tree_binary`, `ring_1d_unidir`, `none`). If you want to
+modify a builtin or define a brand-new connectivity pattern, define a
+`neighbors()` function in your algorithm module.
+
+### Signature
+
+```python
+def neighbors(
+    rank: int, world_size: int, neighbor_map: dict[str, int],
+) -> dict[str, int] | None:
+    """Override the neighbor map produced by the builtin topology.
+
+    Args:
+        neighbor_map: the mapping the ccl.yaml ``topology`` field built.
+                      For ring_1d this is {"E": (rank+1)%ws, "W": (rank-1)%ws}.
+                      The dict is mutable — modify in place if you want.
+
+    Returns:
+        dict: the new neighbor map (or the modified-in-place dict).
+        None: do not override; use neighbor_map as-is.
+    """
+    return None
+```
+
+### Pattern A: tweak a builtin
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # Only even ranks use W; remove W from odd ranks.
+    if rank % 2 == 1:
+        neighbor_map.pop("W", None)
+    return neighbor_map
+```
+
+### Pattern B: replace entirely (skip-connection ring)
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    return {"E": (rank + 2) % world_size}
+```
+
+### Pattern C: keep builtin
+
+Either omit `neighbors` entirely or return None:
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    return None  # explicit "use the builtin"
+```
+
+---
+
+## 4. PE kernel API reference (ADR-0023 D4)
+
+### IPCQ API
+
+| API | Description | Blocking? |
+|-----|-------------|-----------|
+| `tl.send(dir, src=TensorHandle)` | Send to a peer in the given direction. | Yes (waits if peer slots are full) |
+| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | Same, keyword form. | Yes |
+| `tl.recv(dir, shape=..., dtype=...)` | Blocking recv from one direction. | Yes |
+| `tl.recv(shape=..., dtype=...)` | Round-robin recv across all four directions. | Yes |
+| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | Non-blocking recv. | No |
+| `tl.wait(future)` | Wait for a non-blocking recv future → returns the resolved TensorHandle. | Yes |
+
+### Existing TL API (ADR-0020/0022, unchanged)
+
+| API | Description |
+|-----|-------------|
+| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; in greenlet mode `.data` carries the ndarray. |
+| `tl.store(addr, handle)` | DMA write — when `handle.data` is set the runner propagates it to MemoryStore. |
+| `tl.composite(op, ...)` | Submit a GEMM/Math composite (non-blocking). |
+| `tl.program_id(axis=0)` | Local PE id within the cube. |
+| `tl.program_id(axis=1)` | Cube id (ADR-0022). |
+| `tl.num_programs(axis=0/1)` | Topology slot counts (NOT the participating-rank count). |
+
+### Two recv modes
+
+The default is `return_slot` (zero-copy): the IPCQ slot address is
+returned in `handle.addr`. To force a copy into a custom destination,
+pass `dst_addr` + `dst_space`:
+
+```python
+recv = tl.recv(
+    dir="W", shape=(8,), dtype="f16",
+    dst_addr=my_scratch_addr,
+    dst_space="hbm",
+)
+# After this call recv.addr == my_scratch_addr (copy_to_dst mode).
+```
+
+---
+
+## 5. Helpers (`kernbench.ccl.helpers`)
+
+Convenience helpers to keep algorithm code short:
+
+```python
+from kernbench.ccl.helpers import chunked, ring_step, tree_step
+```
+
+### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
+
+Split a tile of `n_elem` elements into `n_chunks` equal-size views.
+Each `Chunk` has `addr`, `n_elem`, `nbytes` fields.
+
+```python
+chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
+# chunks[0..3] are 16-element views with consecutive addresses.
+```
+
+### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
+
+Per-step chunk indices for a ring algorithm (reduce-scatter / all-gather):
+
+```python
+for step in range(world_size - 1):
+    send_idx, recv_idx = ring_step(rank, step, world_size)
+    tl.send(
+        dir="E", src_addr=chunks[send_idx].addr,
+        nbytes=chunks[send_idx].nbytes,
+        shape=(chunks[send_idx].n_elem,), dtype="f16",
+    )
+    recv = tl.recv(
+        dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16",
+    )
+    # accumulate ...
+```
+
+### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
+
+Parent / children rank ids for a binary tree:
+
+```python
+info = tree_step(rank, world_size)
+if info["parent"] is None:
+    print(f"rank {rank} is the root")
+for child in info["children"]:
+    ...
+```
+
+---
+
+## 6. Unit testing — Mock runtime
+
+`kernbench.ccl.testing.run_kernel_in_mock` runs an algorithm without
+SimPy for fast feedback.
+
+### Basic usage
+
+```python
+import numpy as np
+
+from kernbench.ccl.testing import run_kernel_in_mock
+from kernbench.ccl.algorithms.my_algo import kernel
+
+
+def test_my_algo():
+    n_elem = 16
+    inputs = [np.arange(n_elem, dtype=np.float16) + r for r in range(4)]
+    expected = sum(inputs)
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem, 4),  # positional args after t_ptr
+    )
+    for r in range(4):
+        assert np.allclose(outputs[r], expected, rtol=1e-3)
+```
+
+### Behavior
+
+- All ranks run their kernels concurrently as cooperative greenlets.
+- `tl.send` / `tl.recv` are serviced by in-memory FIFOs (no DMA, no
+  latency).
+- Each rank's last `store` is what the helper returns as a numpy array.
+
+### Limitations
+
+- No latency or performance numbers (it is not a simulation).
+- No PE_DMA, fabric, or BW model.
+- Correctness only.
+- One cube assumed: `program_id(axis=1)` is always 0.
+
+---
+
+## 7. Debugging
+
+### CCL trace
+
+```bash
+KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
+    --bench ccl_allreduce --verify-data
+```
+
+Per-rank send/recv events appear on stdout:
+
+```
+[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
+[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
+```
+
+### Pointer dump
+
+`kernbench.ccl.diagnostics.pointer_dump(engine)` returns a multi-line
+dump of every PE_IPCQ ring buffer's `my_head`, `my_tail`,
+`peer_head_cache`, `peer_tail_cache`. When something hangs, this shows
+which rank is stuck and on what.
+
+### Deadlock detection
+
+When the SimPy schedule empties because of unmatched send/recv pairs,
+the engine raises `IpcqDeadlock` and embeds the pointer dump in the
+message (ADR-0023 D14 F3). Wait-for-graph visualization is future
+work.
+
+---
+
+## 8. Common mistakes
+
+### 1. Using a direction that wasn't installed
+
+`topology: ring_1d` only installs E and W. Trying:
+
+```python
+tl.send(dir="N", ...)   # → IpcqInvalidDirection
+```
+
+Fix: switch to `topology: mesh_2d`, or add N/S in a `neighbors()` override.
+
+### 2. `send` without a matching `recv`
+
+```python
+def kernel(..., tl):
+    for _ in range(100):
+        tl.send(dir="E", ...)
+    # The peer never recvs → ring buffer fills → backpressure → deadlock.
+```
+
+Fix: every `send` needs a matching `recv` on the receiver side.
+Otherwise `IpcqDeadlock` is raised.
+
+### 3. dtype/shape mismatch
+
+By default mismatches are not validated. The author is responsible for
+consistency. Set `strict_validation: true` on a PE_IPCQ node's attrs to
+enable D14 F2 strict mode and catch them immediately.
+
+### 4. Assuming round-robin recv fairness
+
+`tl.recv()` (no direction) checks the directions in round-robin order
+and returns the first slot it finds, but **arrival order is not
+predictable**. If your algorithm depends on a particular direction,
+name it explicitly: `tl.recv(dir="N", ...)`.
+
+### 5. Confusing `num_programs` with the CCL group size
+
+`tl.num_programs(axis=0/1)` reports topology slot counts, not the
+number of ranks participating in the collective. The host bench knows
+`world_size` and must pass it through as a kernel argument.
+
+### 6. Overwriting the send source before it's actually sent
+
+PE_DMA snapshots the source data into the IpcqDmaToken at send time,
+preserving in-flight semantics. Even so, the safest pattern is to call
+`tl.send` first and only overwrite the data at the source address
+afterwards. If you overwrite it before `tl.send` makes it into the
+PE_DMA queue, the snapshot will pick up the wrong data.
+
+---
+
+## 9. Next steps
+
+- Try other topologies (`mesh_2d`, `tree_binary`).
+- Faster algorithms (recursive halving / doubling).
+- Compare `buffer_kind` (tcm/hbm/sram) and `backpressure` (poll/sleep)
+  modes for latency.
+- Larger-scale validation through the unified `ccl_allreduce` bench
+  with different `ccl.yaml` overlays.
+
+If you add a new algorithm or pattern, please send a PR.
+
+---
+
+## References
+
+- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective design.
+- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1).
+- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution.
+- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor.
+
+Existing algorithm examples:
+
+- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — simplest send/recv
+- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
+- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
+- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
@@ -0,0 +1,537 @@
+# CCL Algorithm Author Guide
+
+이 문서는 kernbench에서 CCL (Collective Communication Library) 알고리즘을
+직접 작성하는 사람을 위한 step-by-step 가이드이다. 시스템 내부 설계와
+컴포넌트 구조는 [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md)에 있다.
+
+본 가이드는 알고리즘 작성자가 **자신이 만져야 할 곳**과 **만지지 않아도 될 곳**을
+명확히 분리하고, 가장 짧은 경로로 첫 알고리즘을 동작시키는 것을 목표로 한다.
+
+---
+
+## 0. 5분 요약
+
+| 만지는 것 | 위치 |
+|----------|------|
+| 알고리즘 모듈 (kernel + 선택적 neighbors) | `src/kernbench/ccl/algorithms/<algo>.py` |
+| 알고리즘 등록 | `ccl.yaml` |
+| 호스트 bench (PE 수, 메모리 init, launch, 검증) | `benches/<your_bench>.py` |
+| (선택) 단위 테스트 | `tests/test_<algo>.py` |
+
+| 만지지 않는 것 | 위치 |
+|---------------|------|
+| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
+| 프레임워크 (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
+| PE_IPCQ / PE_DMA 컴포넌트 | `src/kernbench/components/builtin/` |
+| backend 구현 (install_ipcq) | `src/kernbench/runtime_api/distributed.py` 및 `src/kernbench/ccl/install.py` |
+
+흐름:
+1. 알고리즘 모듈에 `kernel` 작성
+2. `ccl.yaml`에 entry 등록
+3. 호스트 bench에서 `install_ipcq` + `launch`
+4. (선택) mock runtime으로 단위 테스트 (수 ms)
+5. `kernbench run --bench <name> --verify-data`로 SimPy 검증
+
+---
+
+## 1. Hello World — 가장 단순한 send/recv
+
+각 PE가 자기 데이터를 E 방향 이웃에 한 번 보내고, W 방향에서 한 번 받는
+가장 단순한 알고리즘이다. 실제 동작 코드는
+[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py)
+에 있다.
+
+### Step 1: kernel 작성
+
+새 파일 `src/kernbench/ccl/algorithms/hello_send.py`:
+
+```python
+"""Hello world: 자기 데이터를 다음 rank에 보내고 이전 rank에서 받기."""
+def kernel(t_ptr, n_elem, tl):
+    # 글로벌 rank는 program_id(0/1)에서 계산 (ADR-0022)
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+
+    nbytes = n_elem * 2  # f16
+    pe_addr = t_ptr + rank * nbytes
+
+    # 자기 슬라이스를 로드해서 E로 보낸다.
+    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    tl.send(dir="E", src=src)
+
+    # W 방향에서 받아서 그대로 자기 슬라이스에 store한다.
+    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+    tl.store(pe_addr, recv)
+```
+
+핵심 포인트:
+
+- **글로벌 rank는 `program_id(axis=0)`과 `program_id(axis=1)`에서 계산**
+  (`rank = cube_id * pes_per_cube + local_pe`). TL에는 `tl.rank` /
+  `tl.world_size` 같은 약속되지 않은 확장이 없다. 커널에 `world_size` 같은
+  알고리즘 파라미터가 필요하면 호스트가 `torch.launch`의 일반 인자로
+  전달한다.
+- **`tl.send`는 `TensorHandle`을 받는다.** 핸들의 `addr`/`space`/`shape`/`dtype`/`nbytes`를
+  PE_IPCQ가 읽어 PE_DMA에 IpcqDmaToken을 발행한다.
+- **`tl.recv`는 `shape`와 `dtype`이 필수.** 반환된 TensorHandle은 IPCQ ring slot을
+  가리키며, `tl.store(pe_addr, recv)`처럼 dst 핸들로 그대로 사용할 수 있다.
+  Phase 2 dma_write replay가 (slot, hbm) 복사를 수행하므로 numpy `.data`를
+  직접 만질 필요가 없다.
+
+### Step 2: ccl.yaml 등록
+
+`ccl.yaml`의 `algorithms` 섹션에 entry를 추가한다. (defaults.algorithm은 호스트
+bench가 `install_ipcq(algorithm=...)`로 명시 전달해도 되므로 꼭 바꿀 필요는 없다.)
+
+```yaml
+algorithms:
+  hello_send:
+    module: kernbench.ccl.algorithms.hello_send
+    topology: ring_1d
+    buffer_kind: tcm
+```
+
+### Step 3: 호스트 bench 작성
+
+새 파일 `benches/ccl_hello.py`:
+
+```python
+"""Hello-world ring rotation bench (각 PE가 W 이웃의 데이터를 1번 받음)."""
+import numpy as np
+
+from kernbench.ccl.algorithms import hello_send
+from kernbench.policy.placement.dp import DPPolicy
+
+ALGORITHM = "hello_send"
+N_ELEM = 8
+WORLD_SIZE = 8
+
+
+def run(torch):
+    plan = torch.install_ipcq(algorithm=ALGORITHM)
+
+    a = torch.zeros(
+        (1, WORLD_SIZE * N_ELEM), dtype="f16",
+        dp=DPPolicy(
+            cube="replicate", pe="column_wise",
+            num_cubes=1,
+        ),
+        name="hello_in",
+    )
+
+    store = torch.engine.memory_store
+    base = a._handle.va_base or a._handle.shards[0].pa
+    nbytes = N_ELEM * 2
+    for r in range(WORLD_SIZE):
+        store.write("hbm", base + r * nbytes,
+                    np.full((N_ELEM,), float(r + 1), dtype=np.float16))
+
+    torch.launch(ALGORITHM, hello_send.kernel, a, N_ELEM)
+
+    # rank r은 rank (r-1)%ws의 데이터를 가져야 한다.
+    for r, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
+        result = store.read("hbm", base + r * nbytes, shape=(N_ELEM,), dtype="f16")
+        prev = float(((r - 1) % WORLD_SIZE) + 1)
+        ok = np.allclose(result, prev)
+        print(f"  [{'OK ' if ok else 'FAIL'}] rank {r} got {float(result.mean()):.1f}, "
+              f"expected {prev:.1f}")
+```
+
+### Step 4: 단위 테스트 (선택, 강력 추천)
+
+`tests/test_hello_send.py`:
+
+```python
+import numpy as np
+from kernbench.ccl.algorithms.hello_send import kernel
+from kernbench.ccl.testing import run_kernel_in_mock
+
+
+def test_hello_send_4_ranks():
+    n_elem = 8
+    inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)]
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem,),
+    )
+
+    # rank r은 rank (r-1) % 4의 데이터를 받아야 함
+    for r in range(4):
+        assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
+```
+
+`run_kernel_in_mock`는 SimPy 없이 순수 Python으로 모든 rank를 동시 실행하므로
+**ms 단위로 끝난다**. 알고리즘 logic 정합성만 검증.
+
+### Step 5: 시뮬 검증
+
+```bash
+kernbench run --topology topology.yaml --bench ccl_hello --verify-data
+```
+
+Phase 1에서 SimPy 시뮬레이션 + MemoryStore 데이터 이동, Phase 2에서 op_log
+정합성 replay. 호스트 bench의 `print` 검증이 모든 rank에 대해 OK여야 한다.
+
+---
+
+## 2. Ring All-Reduce — 두 번째 알고리즘
+
+조금 더 복잡한 예제. Ring all-reduce는 N-1 라운드 동안 각 PE가 자기 데이터를
+E로 보내고 W에서 받아 누적한다. 최종적으로 모든 PE가 글로벌 sum을 갖는다.
+
+실제 동작 코드는 [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py)
+참조. 핵심 흐름:
+
+```python
+"""Ring all-reduce."""
+
+
+def kernel(t_ptr, n_elem, world_size, tl):
+    # rank
+    local_pe = tl.program_id(axis=0)
+    cube_id = tl.program_id(axis=1)
+    pes_per_cube = tl.num_programs(axis=0)
+    rank = cube_id * pes_per_cube + local_pe
+    nbytes = n_elem * 2
+    pe_addr = t_ptr + rank * nbytes
+
+    # HBM의 자기 슬라이스를 가리키는 TensorHandle. greenlet 모드에선 .data가
+    # 채워지지만 커널은 .data를 직접 만질 필요가 없다.
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+    current = acc  # 첫 라운드 send 출처
+
+    for _step in range(world_size - 1):
+        tl.send(dir="E", src=current)
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        # TensorHandle 연산자 오버로드 → MathCmd → PE_MATH 디스패치.
+        # Phase 1은 타이밍만, Phase 2 DataExecutor가 실제 numpy 누적을 수행한다.
+        acc = acc + recv
+        current = recv  # 다음 라운드는 직전에 받은 슬롯을 다시 forward
+
+    # 최종 누적값을 자기 슬라이스에 store. 출처는 acc(=PE-local scratch addr)
+    # 이고 dst는 HBM. op_log dma_write가 (scratch, hbm) 복사 정보를 기록하므로
+    # Phase 2가 검증 시점에 HBM[pe_addr]에 정답을 채워준다.
+    tl.store(pe_addr, acc)
+```
+
+네 가지 포인트:
+
+1. **누적은 TensorHandle 연산자**: `acc + recv`는 `MathCmd`를 emit하고
+   PE_MATH로 디스패치된다 — 실제 하드웨어 경로를 거치므로 latency 모델이
+   정확하다. ADR-0020 D3대로 Phase 1은 타이밍만 시뮬레이션하고, Phase 2
+   `DataExecutor`가 op_log를 재실행하면서 numpy 누적을 수행한다.
+2. **`current = recv`로 forward**: 매 라운드의 send 출처를 직전에 받은 슬롯
+   핸들로 갱신해야 같은 데이터가 ring을 순회하면서 누적이 한 번씩 일어난다.
+   `current = acc`로 두면 누적값이 다시 송출되어 결과가 부풀려진다.
+3. **`tl.store(pe_addr, acc)` 한 번이면 끝**: 중간에 store→reload 패턴은
+   금지다. acc는 PE-local scratch에 살고, op_log가 (src=scratch, dst=hbm)
+   메타데이터를 기록한다. Phase 2가 math를 먼저 실행해 scratch를 채운 뒤
+   dma_write 스냅샷으로 HBM에 복사한다.
+4. **`world_size`는 호스트가 명시 전달**: TL은 topology slot 수만 안다 (예:
+   `num_programs(axis=0)`은 cube당 PE 수). 실제 참여하는 CCL group 크기는 bench가
+   알고 호스트→kernel 인자로 넘긴다.
+
+`ccl.yaml` 등록 + 호스트 bench는 [`benches/ccl_allreduce_tcm.py`](../benches/ccl_allreduce_tcm.py)
+참조. mock 단위 테스트는 [`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
+를 그대로 따라하면 된다 (`kernel_args=(n_elem, world_size)` 인자 형태).
+
+---
+
+## 3. neighbors() override — Custom topology
+
+대부분의 알고리즘은 builtin topology(`ring_1d`, `mesh_2d`, `tree_binary`,
+`ring_1d_unidir`, `none`)로 충분하다. builtin을 변형하거나 새로 만들고 싶으면
+알고리즘 모듈에 `neighbors()`를 정의한다.
+
+### 시그니처
+
+```python
+def neighbors(rank: int, world_size: int, neighbor_map: dict[str, int]) -> dict[str, int] | None:
+    """builtin topology가 만든 neighbor_map을 override.
+
+    Args:
+        neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑.
+                      예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws}
+                      mutable dict — 직접 수정 가능.
+
+    Returns:
+        dict: neighbor_map을 override한 결과 (또는 수정한 그 dict)
+        None: override 안 함, neighbor_map 그대로 사용
+    """
+    return None
+```
+
+### Pattern A: builtin을 base로 일부만 수정
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # 짝수 rank만 W 방향 사용 (홀수 rank는 W 제거)
+    if rank % 2 == 1:
+        neighbor_map.pop("W", None)
+    return neighbor_map
+```
+
+### Pattern B: 완전히 새로 작성 (skip-connection ring)
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    # neighbor_map은 무시하고 새로 작성
+    return {"E": (rank + 2) % world_size}
+```
+
+### Pattern C: builtin 사용, override 없음
+
+`neighbors()` 함수를 정의하지 않거나 None을 반환:
+
+```python
+def neighbors(rank, world_size, neighbor_map):
+    return None  # 명시적으로 builtin 사용
+```
+
+---
+
+## 4. PE 커널 API 레퍼런스 (ADR-0023 D4)
+
+### IPCQ API
+
+| API | 설명 | Blocking? |
+|-----|------|-----------|
+| `tl.send(dir, src=TensorHandle)` | direction으로 데이터 send | Yes (peer slot full 시 wait) |
+| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | 동일, keyword 형태 | Yes |
+| `tl.recv(dir, shape=..., dtype=...)` | 특정 방향에서 blocking recv | Yes |
+| `tl.recv(shape=..., dtype=...)` | 4방향 round-robin recv (방향 미지정) | Yes |
+| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | non-blocking recv | No |
+| `tl.wait(future)` | non-blocking future 완료 대기 → TensorHandle | Yes |
+
+### 기존 TL API (ADR-0020/0022, 그대로 사용 가능)
+
+| API | 설명 |
+|-----|------|
+| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; greenlet 모드에서 `.data`에 ndarray |
+| `tl.store(addr, handle)` | DMA write — handle.data가 있으면 MemoryStore에 propagate |
+| `tl.composite(op, ...)` | GEMM/Math compute 비동기 submit |
+| `tl.program_id(axis=0)` | cube 내 local PE id |
+| `tl.program_id(axis=1)` | cube id (ADR-0022) |
+| `tl.num_programs(axis=0/1)` | topology 슬롯 수 (참여 ranks 수가 아님) |
+
+### `recv` 두 가지 모드
+
+기본은 `return_slot` (zero-copy): IPCQ slot 주소가 그대로 handle.addr에 들어온다.
+slot 데이터를 별도 위치로 복사하고 싶으면 `dst_addr` + `dst_space`를 명시:
+
+```python
+recv = tl.recv(
+    dir="W", shape=(8,), dtype="f16",
+    dst_addr=my_scratch_addr,
+    dst_space="hbm",
+)
+# 이제 recv.addr == my_scratch_addr (copy_to_dst 모드)
+```
+
+---
+
+## 5. Helpers (`kernbench.ccl.helpers`)
+
+알고리즘 코드를 짧게 유지하기 위한 헬퍼들:
+
+```python
+from kernbench.ccl.helpers import chunked, ring_step, tree_step
+```
+
+### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
+
+총 `n_elem` 개의 element를 `n_chunks` 등분한 view 리스트를 반환. 각 `Chunk`는
+`addr`, `n_elem`, `nbytes` 필드를 가진다.
+
+```python
+chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
+# chunks[0..3] 각각 16 element view, addr이 연속
+```
+
+### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
+
+Ring algorithm의 step별 chunk 인덱스 (reduce-scatter / all-gather):
+
+```python
+for step in range(world_size - 1):
+    send_idx, recv_idx = ring_step(rank, step, world_size)
+    tl.send(dir="E", src_addr=chunks[send_idx].addr,
+            nbytes=chunks[send_idx].nbytes,
+            shape=(chunks[send_idx].n_elem,), dtype="f16")
+    recv = tl.recv(dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16")
+    # accumulate ...
+```
+
+### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
+
+Binary tree의 parent/children rank:
+
+```python
+info = tree_step(rank, world_size)
+if info["parent"] is None:
+    print(f"rank {rank} is the root")
+for child in info["children"]:
+    ...
+```
+
+---
+
+## 6. 단위 테스트 — Mock Runtime
+
+`kernbench.ccl.testing.run_kernel_in_mock`은 SimPy를 거치지 않고 알고리즘을
+빠르게 검증할 수 있다.
+
+### 기본 사용법
+
+```python
+from kernbench.ccl.testing import run_kernel_in_mock
+from kernbench.ccl.algorithms.my_algo import kernel
+import numpy as np
+
+
+def test_my_algo():
+    n_elem = 16
+    inputs = [np.arange(n_elem, dtype=np.float16) + r for r in range(4)]
+    expected = sum(inputs)
+
+    outputs = run_kernel_in_mock(
+        kernel_fn=kernel,
+        world_size=4,
+        topology="ring_1d",
+        inputs=inputs,
+        kernel_args=(n_elem, 4),  # kernel의 (t_ptr 이후) 추가 positional 인자
+    )
+
+    for r in range(4):
+        assert np.allclose(outputs[r], expected, rtol=1e-3)
+```
+
+### 동작
+
+- `world_size`개(위 예제에서는 4개) rank의 kernel을 greenlet으로 동시 실행
+- `tl.send/recv`를 in-memory FIFO로 즉시 처리 (DMA, latency 무시)
+- 각 rank가 마지막에 store한 데이터를 ndarray로 반환
+
+### 한계
+
+- latency / 성능 측정 불가 (시뮬레이션이 아님)
+- PE_DMA, fabric, BW 모델 안 함
+- 정합성 검증만 가능
+- 한 cube 안에서 동작하는 가정 — `program_id(axis=1)`은 항상 0
+
+---
+
+## 7. 디버깅
+
+### CCL trace
+
+```bash
+KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
+    --bench ccl_allreduce_tcm --verify-data
+```
+
+각 rank의 send/recv 시점이 stdout에 출력된다:
+
+```
+[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
+[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
+...
+```
+
+### Pointer dump
+
+`kernbench.ccl.diagnostics.pointer_dump(engine)`는 모든 PE_IPCQ의 ring buffer
+상태(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`)를 multi-line
+문자열로 반환한다. hang이 발생하면 어느 rank가 어떤 상태에서 막혔는지 한눈에
+보인다.
+
+### Deadlock detection
+
+매칭되지 않는 send/recv 등으로 SimPy 스케줄이 비면 engine이 `IpcqDeadlock`을
+던지며 pointer dump를 메시지에 포함시킨다 (ADR-0023 D14 F3). 별도 wait-for graph
+시각화는 미래 작업.
+
+---
+
+## 8. 흔한 실수
+
+### 1. install 안 된 direction 사용
+
+ccl.yaml의 `topology: ring_1d`는 E/W만 install한다. N/S 사용 시:
+
+```python
+tl.send(dir="N", ...)   # → IpcqInvalidDirection 예외
+```
+
+해결: `topology: mesh_2d`로 바꾸거나, `neighbors()` override로 N/S 추가.
+
+### 2. send만 호출하고 recv 없음
+
+```python
+def kernel(..., tl):
+    for _ in range(100):
+        tl.send(dir="E", ...)
+    # peer 측 recv 없음 → ring buffer 가득 차면 backpressure → deadlock
+```
+
+해결: 모든 send에 짝이 되는 recv가 있어야 한다. 안 그러면 `IpcqDeadlock`이
+발생한다.
+
+### 3. dtype/shape 불일치
+
+기본 모드에서는 dtype/shape mismatch를 검증하지 않는다. 작성자가 직접 보장하거나,
+PE_IPCQ 노드 attrs에 `strict_validation: true`를 설정해 D14 F2 strict 모드로
+mismatch를 즉시 잡을 수 있다.
+
+### 4. round-robin recv의 fairness 가정
+
+`tl.recv()` (방향 미지정)는 round-robin으로 가져오지만, 도착한 첫 슬롯을 반환한다.
+**도착 순서를 알 수 없으므로** 알고리즘이 도착 방향에 의존하면 안 된다.
+필요하면 `tl.recv(dir="N", ...)`처럼 명시.
+
+### 5. CCL 그룹 크기 가정
+
+`tl.num_programs(axis=0/1)`은 토폴로지 슬롯 개수이지 CCL group 크기가 아니다.
+참여하는 rank 수(`world_size`)는 호스트 bench가 알고 있고, kernel 인자로 명시
+전달해야 한다.
+
+### 6. 호스트가 send-source 메모리를 도착 전에 덮어씀
+
+PE_DMA가 송신 시점에 src 데이터를 토큰에 스냅샷하므로 in-flight 데이터의 의미는
+보존된다. 따라서 하나의 PE 안에서 같은 주소를 여러 step에 걸쳐 갱신하는 경우에도,
+direct send 후 다른 step에서 같은 주소에 store하는 것은 안전하다 (token snapshot 덕분).
+하지만 `tl.send`가 PE_DMA 큐에 enqueue되기 전에 주소를 덮어쓰면 잘못된 데이터가
+스냅샷된다 — `tl.send`를 먼저, 메모리 변경을 나중에 하는 게 권장.
+
+---
+
+## 9. 다음 단계
+
+- `mesh_2d` / `tree_binary` 같은 다른 topology 활용
+- recursive halving/doubling 등 더 빠른 알고리즘
+- `buffer_kind` (tcm/hbm/sram) / `backpressure` (poll/sleep) 모드별 latency 비교
+- `ccl_ring_allreduce_multicube.py`, `ccl_ring_allreduce_multisip.py`처럼 큰
+  scale의 ring 검증
+
+새 알고리즘이나 패턴을 추가했다면 PR로 기여해주세요.
+
+---
+
+## 참고
+
+- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective 설계
+- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1)
+- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution
+- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor
+
+기존 알고리즘 예제:
+
+- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — 가장 단순한 send/recv
+- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
+- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
+- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
@@ -1,156 +1,312 @@
-<svg xmlns="http://www.w3.org/2000/svg" width="556" height="472" viewBox="0 0 556 472">
+<svg xmlns="http://www.w3.org/2000/svg" width="970" height="900" viewBox="0 0 970 900">
  <title>cube</title>
-  <rect width="556" height="472" fill="#f8fafc"/>
-  <text x="278" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">CUBE VIEW</text>
-  <rect x="40.0" y="40.0" width="476.0" height="392.0" rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"/>
-  <rect x="152.0" y="166.0" width="252.0" height="140.0" rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" stroke-dasharray="6,3" opacity="0.5"/>
-  <text x="278.0" y="278.0" text-anchor="middle" font-family="monospace" font-size="11" fill="#047857" opacity="0.7">HBM</text>
-  <polyline points="82.0,82.0 82.0,95.0 82.0,95.0 82.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="82.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="82.0,82.0 82.0,144.0 334.0,144.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,144.0 82.0,144.0 82.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="166.0,82.0 166.0,95.0 166.0,95.0 166.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="166.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="166.0,82.0 166.0,154.0 334.0,154.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,144.0 166.0,144.0 166.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="390.0,82.0 390.0,95.0 390.0,95.0 390.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="390.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="390.0,82.0 390.0,164.0 334.0,164.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,144.0 390.0,144.0 390.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="474.0,82.0 474.0,95.0 474.0,95.0 474.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="474.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="474.0,82.0 474.0,174.0 334.0,174.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,144.0 474.0,144.0 474.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="82.0,390.0 82.0,347.0 82.0,347.0 82.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="82.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="82.0,390.0 82.0,338.0 334.0,338.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,298.0 82.0,298.0 82.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="166.0,390.0 166.0,347.0 166.0,347.0 166.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="166.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="166.0,390.0 166.0,348.0 334.0,348.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,298.0 166.0,298.0 166.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="390.0,390.0 390.0,347.0 390.0,347.0 390.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="390.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="390.0,390.0 390.0,358.0 334.0,358.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,298.0 390.0,298.0 390.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="474.0,390.0 474.0,347.0 474.0,347.0 474.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
-  <text x="474.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
-  <polyline points="474.0,390.0 474.0,368.0 334.0,368.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <polyline points="334.0,236.0 334.0,298.0 474.0,298.0 474.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="82.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="152.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="166.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="194.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="390.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="306.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="474.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="348.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="82.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="152.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="166.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="194.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="390.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="306.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <polyline points="474.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
-  <text x="348.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
-  <line x1="82.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="166.0" y1="138.0" x2="82.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="166.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
-  <line x1="390.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
-  <line x1="390.0" y1="138.0" x2="474.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="474.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="82.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="166.0" y1="334.0" x2="82.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="166.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
-  <line x1="390.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
-  <line x1="390.0" y1="334.0" x2="474.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <line x1="474.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
-  <text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
-  <polyline points="82.0,138.0 110.0,138.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="110.0,292.0 82.0,292.0 82.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="82.0,334.0 110.0,334.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="110.0,292.0 82.0,292.0 82.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="474.0,138.0 446.0,138.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="446.0,292.0 474.0,292.0 474.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="474.0,334.0 446.0,334.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="446.0,292.0 474.0,292.0 474.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
-  <text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
-  <polyline points="334.0,236.0 334.0,131.4 278.0,131.4 278.0,56.8" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
-  <polyline points="334.0,236.0 334.0,310.6 278.0,310.6 278.0,415.2" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
-  <polyline points="334.0,236.0 334.0,221.0 488.0,221.0 488.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
-  <polyline points="334.0,236.0 334.0,221.0 68.0,221.0 68.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
-  <polyline points="446.0,194.0 446.0,200.0 334.0,200.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="334.0,236.0 334.0,200.0 446.0,200.0 446.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
-  <polyline points="334.0,236.0 110.0,236.0 110.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
-  <polyline points="110.0,194.0 334.0,194.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
-  <rect x="250.0" y="40.0" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
-  <text x="278.0" y="60.8" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-N</text>
-  <rect x="250.0" y="398.4" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
-  <text x="278.0" y="419.2" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-S</text>
-  <rect x="460.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
-  <text x="488.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-E</text>
-  <rect x="40.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
-  <text x="68.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-W</text>
-  <rect x="306.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#a78bfa" stroke="#475569" stroke-width="1"/>
-  <text x="334.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">NOC</text>
-  <rect x="418.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
-  <text x="446.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">M CPU</text>
-  <rect x="194.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
-  <text x="222.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#ffffff">HBM CTRL</text>
-  <rect x="82.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
-  <text x="110.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SRAM</text>
-  <rect x="82.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="110.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge LEFT</text>
-  <rect x="418.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="446.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge RIGHT</text>
-  <rect x="56.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="82.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE0</text>
-  <rect x="54.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="82.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE0</text>
-  <rect x="140.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="166.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE1</text>
-  <rect x="138.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="166.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE1</text>
-  <rect x="364.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="390.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE2</text>
-  <rect x="362.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="390.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE2</text>
-  <rect x="448.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="474.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE3</text>
-  <rect x="446.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="474.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE3</text>
-  <rect x="56.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="82.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE4</text>
-  <rect x="54.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="82.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE4</text>
-  <rect x="140.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="166.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE5</text>
-  <rect x="138.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="166.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE5</text>
-  <rect x="364.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="390.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE6</text>
-  <rect x="362.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="390.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE6</text>
-  <rect x="448.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
-  <text x="474.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE7</text>
-  <rect x="446.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
-  <text x="474.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE7</text>
+  <rect width="970" height="900" fill="#0f172a"/>
+  <text x="485" y="22" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#94a3b8">CUBE TOPOLOGY — 17.0×14.0mm | 6×6 Router Mesh | n_to_one mode | 64 pseudo-ch</text>
+  <text x="485" y="40" text-anchor="middle" font-family="monospace" font-size="10" fill="#64748b">Per-PE: 8 ch × 32.0 GB/s = 256.0 GB/s | Cube total: 64 × 32.0 = 2048.0 GB/s</text>
+  <rect x="60" y="60" width="850.0" height="700.0" rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"/>
+  <rect x="260" y="285" width="450" height="250" rx="6" fill="#052e16" stroke="#047857" stroke-width="2" opacity="0.6"/>
+  <text x="485" y="395" text-anchor="middle" font-family="monospace" font-size="11" font-weight="bold" fill="#047857">HBM_CTRL | 64 pseudo channels</text>
+  <text x="485" y="412" text-anchor="middle" font-family="monospace" font-size="9" fill="#05966988">Total BW: 2048 GB/s</text>
+  <rect x="270.0" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="283.4" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="296.9" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="310.3" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="323.8" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="337.2" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="350.6" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="364.1" y="289" width="12.9" height="8" rx="1" fill="#3b82f6" opacity="0.8"/>
+  <rect x="377.5" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="390.9" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="404.4" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="417.8" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="431.2" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="444.7" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="458.1" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="471.6" y="289" width="12.9" height="8" rx="1" fill="#60a5fa" opacity="0.8"/>
+  <rect x="485.0" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="498.4" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="511.9" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="525.3" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="538.8" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="552.2" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="565.6" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="579.1" y="289" width="12.9" height="8" rx="1" fill="#8b5cf6" opacity="0.8"/>
+  <rect x="592.5" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="605.9" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="619.4" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="632.8" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="646.2" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="659.7" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="673.1" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <rect x="686.6" y="289" width="12.9" height="8" rx="1" fill="#a78bfa" opacity="0.8"/>
+  <text x="324" y="286" text-anchor="middle" font-family="monospace" font-size="6" fill="#3b82f6">PE0×8ch</text>
+  <text x="431" y="286" text-anchor="middle" font-family="monospace" font-size="6" fill="#60a5fa">PE1×8ch</text>
+  <text x="539" y="286" text-anchor="middle" font-family="monospace" font-size="6" fill="#8b5cf6">PE2×8ch</text>
+  <text x="646" y="286" text-anchor="middle" font-family="monospace" font-size="6" fill="#a78bfa">PE3×8ch</text>
+  <rect x="270.0" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="283.4" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="296.9" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="310.3" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="323.8" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="337.2" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="350.6" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="364.1" y="523" width="12.9" height="8" rx="1" fill="#f59e0b" opacity="0.8"/>
+  <rect x="377.5" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="390.9" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="404.4" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="417.8" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="431.2" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="444.7" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="458.1" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="471.6" y="523" width="12.9" height="8" rx="1" fill="#fbbf24" opacity="0.8"/>
+  <rect x="485.0" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="498.4" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="511.9" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="525.3" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="538.8" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="552.2" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="565.6" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="579.1" y="523" width="12.9" height="8" rx="1" fill="#ef4444" opacity="0.8"/>
+  <rect x="592.5" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="605.9" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="619.4" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="632.8" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="646.2" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="659.7" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="673.1" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <rect x="686.6" y="523" width="12.9" height="8" rx="1" fill="#f87171" opacity="0.8"/>
+  <text x="324" y="539" text-anchor="middle" font-family="monospace" font-size="6" fill="#f59e0b">PE4×8ch</text>
+  <text x="431" y="539" text-anchor="middle" font-family="monospace" font-size="6" fill="#fbbf24">PE5×8ch</text>
+  <text x="539" y="539" text-anchor="middle" font-family="monospace" font-size="6" fill="#ef4444">PE6×8ch</text>
+  <text x="646" y="539" text-anchor="middle" font-family="monospace" font-size="6" fill="#f87171">PE7×8ch</text>
+  <line x1="135" y1="135" x2="285" y2="135" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="135" x2="135" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="135" x2="435" y2="135" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="135" x2="285" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="135" x2="585" y2="135" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="135" x2="435" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="135" x2="685" y2="135" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="135" x2="585" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="135" x2="835" y2="135" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="135" x2="685" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="835" y1="135" x2="835" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="260" x2="285" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="260" x2="135" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="260" x2="435" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="260" x2="285" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="260" x2="585" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="260" x2="435" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="260" x2="685" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="260" x2="585" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="260" x2="835" y2="260" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="260" x2="685" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="835" y1="260" x2="835" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="335" x2="285" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="335" x2="135" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="335" x2="685" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="335" x2="285" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="335" x2="835" y2="335" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="335" x2="685" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="835" y1="335" x2="835" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="485" x2="285" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="485" x2="135" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="485" x2="685" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="485" x2="285" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="485" x2="835" y2="485" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="485" x2="685" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="835" y1="485" x2="835" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="560" x2="285" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="560" x2="135" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="560" x2="435" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="560" x2="285" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="560" x2="585" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="560" x2="435" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="560" x2="685" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="560" x2="585" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="560" x2="835" y2="560" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="560" x2="685" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="835" y1="560" x2="835" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="135" y1="685" x2="285" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="285" y1="685" x2="435" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="435" y1="685" x2="585" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="585" y1="685" x2="685" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <line x1="685" y1="685" x2="835" y2="685" stroke="#475569" stroke-width="1" opacity="0.4"/>
+  <circle cx="135" cy="135" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="135" y="138" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r0c0</text>
+  <rect x="119" y="81" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="135" y="92" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE0</text>
+  <line x1="135" y1="127" x2="149" y2="97" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="285" cy="135" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="285" y="138" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r0c1</text>
+  <rect x="269" y="81" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="285" y="92" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE1</text>
+  <line x1="285" y1="127" x2="299" y2="97" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="435" cy="135" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="435" y="138" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r0c2</text>
+  <circle cx="585" cy="135" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="585" y="138" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r0c3</text>
+  <circle cx="685" cy="135" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="685" y="138" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r0c4</text>
+  <circle cx="835" cy="135" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="835" y="138" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r0c5</text>
+  <circle cx="135" cy="260" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="135" y="263" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r1c0</text>
+  <circle cx="285" cy="260" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="285" y="263" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r1c1</text>
+  <circle cx="435" cy="260" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="435" y="263" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r1c2</text>
+  <rect x="419" y="206" width="32" height="16" rx="3" fill="#451a03" stroke="#f59e0b" stroke-width="1"/>
+  <text x="435" y="217" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#f59e0b">M_CPU</text>
+  <line x1="435" y1="252" x2="449" y2="222" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <circle cx="585" cy="260" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="585" y="263" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r1c3</text>
+  <circle cx="685" cy="260" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="685" y="263" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r1c4</text>
+  <rect x="669" y="206" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="685" y="217" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE2</text>
+  <line x1="685" y1="252" x2="699" y2="222" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="835" cy="260" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="835" y="263" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r1c5</text>
+  <rect x="819" y="206" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="835" y="217" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE3</text>
+  <line x1="835" y1="252" x2="849" y2="222" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="135" cy="335" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="135" y="338" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r2c0</text>
+  <circle cx="285" cy="335" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="285" y="338" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r2c1</text>
+  <circle cx="685" cy="335" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="685" y="338" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r2c4</text>
+  <circle cx="835" cy="335" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="835" y="338" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r2c5</text>
+  <circle cx="135" cy="485" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="135" y="488" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r3c0</text>
+  <rect x="119" y="523" width="32" height="16" rx="3" fill="#1c1917" stroke="#d97706" stroke-width="1"/>
+  <text x="135" y="534" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#d97706">SRAM</text>
+  <line x1="135" y1="493" x2="149" y2="523" stroke="#d97706" stroke-width="1" opacity="0.6"/>
+  <circle cx="285" cy="485" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="285" y="488" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r3c1</text>
+  <circle cx="685" cy="485" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="685" y="488" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r3c4</text>
+  <circle cx="835" cy="485" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="835" y="488" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r3c5</text>
+  <circle cx="135" cy="560" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="135" y="563" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r4c0</text>
+  <rect x="119" y="598" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="135" y="609" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE4</text>
+  <line x1="135" y1="568" x2="149" y2="598" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="285" cy="560" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="285" y="563" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r4c1</text>
+  <rect x="269" y="598" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="285" y="609" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE5</text>
+  <line x1="285" y1="568" x2="299" y2="598" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="435" cy="560" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="435" y="563" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r4c2</text>
+  <circle cx="585" cy="560" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="585" y="563" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r4c3</text>
+  <circle cx="685" cy="560" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="685" y="563" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r4c4</text>
+  <circle cx="835" cy="560" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="835" y="563" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r4c5</text>
+  <circle cx="135" cy="685" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="135" y="688" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r5c0</text>
+  <circle cx="285" cy="685" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="285" y="688" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r5c1</text>
+  <circle cx="435" cy="685" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="435" y="688" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r5c2</text>
+  <circle cx="585" cy="685" r="8" fill="#334155" stroke="#475569" stroke-width="1"/>
+  <text x="585" y="688" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r5c3</text>
+  <circle cx="685" cy="685" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="685" y="688" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r5c4</text>
+  <rect x="669" y="723" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="685" y="734" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE6</text>
+  <line x1="685" y1="693" x2="699" y2="723" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <circle cx="835" cy="685" r="8" fill="#475569" stroke="#64748b" stroke-width="1"/>
+  <text x="835" y="688" text-anchor="middle" font-family="monospace" font-size="6" fill="white">r5c5</text>
+  <rect x="819" y="723" width="32" height="16" rx="3" fill="#2d1f3d" stroke="#a855f7" stroke-width="1"/>
+  <text x="835" y="734" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#a855f7">PE7</text>
+  <line x1="835" y1="693" x2="849" y2="723" stroke="#a855f7" stroke-width="1" opacity="0.6"/>
+  <polyline points="135,143 208,216 251,216 324,289" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="239" y="216" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="285,143 358,216 358,216 431,289" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="368" y="216" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="685,268 674,278 549,278 539,289" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="622" y="278" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="835,268 824,278 657,278 646,289" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="751" y="278" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="135,552 146,542 313,542 324,531" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="239" y="542" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="285,552 296,542 421,542 431,531" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="368" y="542" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="685,677 612,604 612,604 539,531" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="622" y="604" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <polyline points="835,677 762,604 719,604 646,531" fill="none" stroke="#10b981" stroke-width="1.5" opacity="0.6" stroke-dasharray="4,3"/>
+  <text x="751" y="604" font-family="monospace" font-size="6" fill="#10b98188">256GB/s</text>
+  <rect x="65" y="360" width="50" height="100" rx="3" fill="#1e1b4b" stroke="#8b5cf6" stroke-width="1.5" opacity="0.9"/>
+  <text x="90" y="357" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#8b5cf6">UCIe-W</text>
+  <rect x="67" y="362" width="46" height="23" rx="2" fill="#818cf8" opacity="0.7"/>
+  <text x="90" y="376" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c0</text>
+  <polyline points="127,135 120,142 120,366 113,374" fill="none" stroke="#818cf8" stroke-width="1" opacity="0.5"/>
+  <rect x="67" y="386" width="46" height="23" rx="2" fill="#a78bfa" opacity="0.7"/>
+  <text x="90" y="400" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c1</text>
+  <polyline points="127,260 120,267 120,390 113,398" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.5"/>
+  <rect x="67" y="410" width="46" height="23" rx="2" fill="#c084fc" opacity="0.7"/>
+  <text x="90" y="424" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c2</text>
+  <polyline points="127,560 120,553 120,428 113,422" fill="none" stroke="#c084fc" stroke-width="1" opacity="0.5"/>
+  <rect x="67" y="434" width="46" height="23" rx="2" fill="#e879f9" opacity="0.7"/>
+  <text x="90" y="448" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c3</text>
+  <polyline points="127,685 120,678 120,452 113,446" fill="none" stroke="#e879f9" stroke-width="1" opacity="0.5"/>
+  <rect x="435" y="65" width="100" height="50" rx="3" fill="#1e1b4b" stroke="#8b5cf6" stroke-width="1.5" opacity="0.9"/>
+  <text x="485" y="62" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#8b5cf6">UCIe-N</text>
+  <rect x="437" y="67" width="23" height="46" rx="2" fill="#818cf8" opacity="0.7"/>
+  <text x="448" y="93" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c0</text>
+  <polyline points="135,127 142,120 442,120 448,113" fill="none" stroke="#818cf8" stroke-width="1" opacity="0.5"/>
+  <rect x="461" y="67" width="23" height="46" rx="2" fill="#a78bfa" opacity="0.7"/>
+  <text x="472" y="93" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c1</text>
+  <polyline points="285,127 292,120 466,120 472,113" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.5"/>
+  <rect x="485" y="67" width="23" height="46" rx="2" fill="#c084fc" opacity="0.7"/>
+  <text x="496" y="93" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c2</text>
+  <polyline points="685,127 678,120 504,120 496,113" fill="none" stroke="#c084fc" stroke-width="1" opacity="0.5"/>
+  <rect x="509" y="67" width="23" height="46" rx="2" fill="#e879f9" opacity="0.7"/>
+  <text x="520" y="93" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c3</text>
+  <polyline points="835,127 828,120 528,120 520,113" fill="none" stroke="#e879f9" stroke-width="1" opacity="0.5"/>
+  <rect x="855" y="360" width="50" height="100" rx="3" fill="#1e1b4b" stroke="#8b5cf6" stroke-width="1.5" opacity="0.9"/>
+  <text x="880" y="357" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#8b5cf6">UCIe-E</text>
+  <rect x="857" y="362" width="46" height="23" rx="2" fill="#818cf8" opacity="0.7"/>
+  <text x="880" y="376" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c0</text>
+  <polyline points="843,135 850,142 850,367 857,374" fill="none" stroke="#818cf8" stroke-width="1" opacity="0.5"/>
+  <rect x="857" y="386" width="46" height="23" rx="2" fill="#a78bfa" opacity="0.7"/>
+  <text x="880" y="400" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c1</text>
+  <polyline points="843,260 850,267 850,391 857,398" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.5"/>
+  <rect x="857" y="410" width="46" height="23" rx="2" fill="#c084fc" opacity="0.7"/>
+  <text x="880" y="424" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c2</text>
+  <polyline points="843,560 850,553 850,428 857,422" fill="none" stroke="#c084fc" stroke-width="1" opacity="0.5"/>
+  <rect x="857" y="434" width="46" height="23" rx="2" fill="#e879f9" opacity="0.7"/>
+  <text x="880" y="448" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c3</text>
+  <polyline points="843,685 850,678 850,452 857,446" fill="none" stroke="#e879f9" stroke-width="1" opacity="0.5"/>
+  <rect x="435" y="705" width="100" height="50" rx="3" fill="#1e1b4b" stroke="#8b5cf6" stroke-width="1.5" opacity="0.9"/>
+  <text x="485" y="702" text-anchor="middle" font-family="monospace" font-size="7" font-weight="bold" fill="#8b5cf6">UCIe-S</text>
+  <rect x="437" y="707" width="23" height="46" rx="2" fill="#818cf8" opacity="0.7"/>
+  <text x="448" y="733" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c0</text>
+  <polyline points="135,693 142,700 442,700 448,707" fill="none" stroke="#818cf8" stroke-width="1" opacity="0.5"/>
+  <rect x="461" y="707" width="23" height="46" rx="2" fill="#a78bfa" opacity="0.7"/>
+  <text x="472" y="733" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c1</text>
+  <polyline points="285,693 292,700 466,700 472,707" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.5"/>
+  <rect x="485" y="707" width="23" height="46" rx="2" fill="#c084fc" opacity="0.7"/>
+  <text x="496" y="733" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c2</text>
+  <polyline points="685,693 678,700 504,700 496,707" fill="none" stroke="#c084fc" stroke-width="1" opacity="0.5"/>
+  <rect x="509" y="707" width="23" height="46" rx="2" fill="#e879f9" opacity="0.7"/>
+  <text x="520" y="733" text-anchor="middle" font-family="monospace" font-size="5" fill="white">c3</text>
+  <polyline points="835,693 828,700 528,700 520,707" fill="none" stroke="#e879f9" stroke-width="1" opacity="0.5"/>
+  <rect x="60" y="865" width="10" height="10" rx="2" fill="#3b82f6" stroke="#475569" stroke-width="0.5"/>
+  <text x="74" y="874" font-family="monospace" font-size="8" fill="#94a3b8">PE Router</text>
+  <rect x="147" y="865" width="10" height="10" rx="2" fill="#f59e0b" stroke="#475569" stroke-width="0.5"/>
+  <text x="161" y="874" font-family="monospace" font-size="8" fill="#94a3b8">M_CPU / SRAM</text>
+  <rect x="255" y="865" width="10" height="10" rx="2" fill="#8b5cf6" stroke="#475569" stroke-width="0.5"/>
+  <text x="269" y="874" font-family="monospace" font-size="8" fill="#94a3b8">UCIe</text>
+  <rect x="307" y="865" width="10" height="10" rx="2" fill="#334155" stroke="#475569" stroke-width="0.5"/>
+  <text x="321" y="874" font-family="monospace" font-size="8" fill="#94a3b8">Relay</text>
+  <rect x="366" y="865" width="10" height="10" rx="2" fill="#10b981" stroke="#475569" stroke-width="0.5"/>
+  <text x="380" y="874" font-family="monospace" font-size="8" fill="#94a3b8">HBM Link</text>
+  <rect x="446" y="865" width="10" height="10" rx="2" fill="#475569" stroke="#475569" stroke-width="0.5"/>
+  <text x="460" y="874" font-family="monospace" font-size="8" fill="#94a3b8">Mesh Link</text>
 </svg>
@@ -26,6 +26,8 @@
  <text x="285.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE GEMM</text>
  <rect x="241.2" y="243.0" width="87.5" height="49.0" rx="4" fill="#ec4899" stroke="#475569" stroke-width="1"/>
  <text x="285.0" y="271.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE MATH</text>
+  <rect x="136.2" y="68.0" width="87.5" height="49.0" rx="4" fill="#e2e8f0" stroke="#475569" stroke-width="1"/>
+  <text x="180.0" y="96.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE MMU</text>
  <rect x="346.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
  <text x="390.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE TCM</text>
 </svg>
@@ -51,13 +51,13 @@
  <line x1="396.0" y1="504.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="468.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <polyline points="324.0,56.0 108.0,56.0 108.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
-  <text x="216.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <text x="216.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 512GB/s</text>
  <polyline points="324.0,56.0 252.0,56.0 252.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
-  <text x="288.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <text x="288.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 512GB/s</text>
  <polyline points="324.0,56.0 396.0,56.0 396.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
-  <text x="360.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <text x="360.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 512GB/s</text>
  <polyline points="324.0,56.0 540.0,56.0 540.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
-  <text x="432.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <text x="432.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 512GB/s</text>
  <rect x="84.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="108.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,0)</text>
  <rect x="228.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
@@ -3,9 +3,9 @@
  <rect width="768" height="396" fill="#f8fafc"/>
  <text x="384" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SYSTEM VIEW</text>
  <polyline points="384.0,60.0 182.0,60.0 182.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
-  <text x="283.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
+  <text x="283.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 768GB/s</text>
  <polyline points="384.0,60.0 586.0,60.0 586.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
-  <text x="485.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
+  <text x="485.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 768GB/s</text>
  <rect x="374.0" y="57.0" width="20.0" height="6.0" rx="4" fill="#6366f1" stroke="#475569" stroke-width="1"/>
  <text x="384.0" y="64.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">Fabric Switch</text>
  <rect x="62.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "kernbench"
 version = "0.1.0"
 requires-python = ">=3.10"
-dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12"]
+dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12", "numpy>=1.24", "greenlet>=3.0"]

 [project.scripts]
 kernbench = "kernbench.cli.main:main"
@@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"]
 [project.optional-dependencies]
 dev = [
  "pytest>=7",
+  "pytest-xdist>=3.0",
  "ruff>=0.4.0",
 ]

@@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"]
 ignore = ["F401"]

 [tool.pytest.ini_options]
-addopts = ["--disable-warnings"]
+addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"]
+markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]
@@ -0,0 +1,9 @@
+"""CCL (Collective Communication Library) framework for kernbench (ADR-0023).
+
+This package provides:
+    - topologies: builtin neighbor topology generators (ring/mesh/tree)
+    - helpers:    utilities for algorithm authors (chunked, ring_step, ...)
+    - testing:    mock CCL runtime for fast unit tests of algorithm kernels
+
+See docs/adr/ADR-0023-ipcq-pe-collective.md and docs/ccl-author-guide.md.
+"""
@@ -0,0 +1,189 @@
+"""Intercube all-reduce kernel (pe0-only, same-lane across cubes).
+
+Reduces across the 4×4 cube mesh within each SIP, then exchanges
+between SIPs using the configured SIP topology, and broadcasts back.
+
+Supported SIP topologies (selected via ``sip_topo_kind``):
+  0 — ring_1d:         global_E/global_W ring, n_sips-1 rounds
+  1 — torus_2d:        row ring (global_E/W) + col ring (global_S/N); the name ``mesh_2d`` also maps here
+  2 — mesh_2d_no_wrap: row chain reduce+broadcast + col chain reduce+broadcast
+
+IPCQ wiring is handled by ``configure_sfr_intercube_multisip``.
+"""
+from __future__ import annotations
+
+SIP_TOPO_RING = 0
+SIP_TOPO_TORUS = 1
+SIP_TOPO_MESH = 2
+
+TOPO_NAME_TO_KIND = {
+    "ring_1d": SIP_TOPO_RING,
+    "torus_2d": SIP_TOPO_TORUS,
+    "mesh_2d": SIP_TOPO_TORUS,
+    "mesh_2d_no_wrap": SIP_TOPO_MESH,
+}
+
+
+def kernel_args(world_size: int, n_elem: int) -> tuple:
+    cube_w = 4
+    cube_h = 4
+    return (n_elem, cube_w, cube_h, world_size)
+
+
+def _inter_sip_ring(acc, n_sips, n_elem, tl):
+    current = acc
+    for _ in range(n_sips - 1):
+        tl.send(dir="global_E", src=current)
+        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        current = recv
+    return acc
+
+
+def _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
+    # Row ring (global_E / global_W)
+    current = acc
+    for _ in range(sip_topo_w - 1):
+        tl.send(dir="global_E", src=current)
+        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        current = recv
+    # Col ring (global_S / global_N)
+    current = acc
+    for _ in range(sip_topo_h - 1):
+        tl.send(dir="global_S", src=current)
+        recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        current = recv
+    return acc
+
+
+def _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
+    sip_row = sip_rank // sip_topo_w
+    sip_col = sip_rank % sip_topo_w
+
+    # Row reduce W → E
+    if sip_col == 0:
+        tl.send(dir="global_E", src=acc)
+    elif sip_col < sip_topo_w - 1:
+        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        tl.send(dir="global_E", src=acc)
+    else:
+        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+
+    # Row broadcast E → W
+    if sip_col == sip_topo_w - 1:
+        tl.send(dir="global_W", src=acc)
+    elif sip_col > 0:
+        acc = tl.recv(dir="global_E", shape=(n_elem,), dtype="f16")
+        tl.send(dir="global_W", src=acc)
+    else:
+        acc = tl.recv(dir="global_E", shape=(n_elem,), dtype="f16")
+
+    # Col reduce N → S
+    if sip_row == 0:
+        tl.send(dir="global_S", src=acc)
+    elif sip_row < sip_topo_h - 1:
+        recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        tl.send(dir="global_S", src=acc)
+    else:
+        recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+
+    # Col broadcast S → N
+    if sip_row == sip_topo_h - 1:
+        tl.send(dir="global_N", src=acc)
+    elif sip_row > 0:
+        acc = tl.recv(dir="global_S", shape=(n_elem,), dtype="f16")
+        tl.send(dir="global_N", src=acc)
+    else:
+        acc = tl.recv(dir="global_S", shape=(n_elem,), dtype="f16")
+
+    return acc
+
+
+def allreduce_intercube_multidevice(
+    t_ptr, n_elem, cube_w, cube_h, n_sips, sip_rank,
+    sip_topo_kind, sip_topo_w, sip_topo_h, tl,
+):
+    """Intercube all-reduce (pe0-only) with configurable SIP topology.
+
+    Args:
+        t_ptr: VA base of the row-wise-sharded tensor on this SIP.
+        n_elem: f16 elements per cube tile.
+        cube_w: cube mesh width (columns).
+        cube_h: cube mesh height (rows).
+        n_sips: number of SIPs.
+        sip_rank: this SIP's rank (0-based).
+        sip_topo_kind: 0=ring_1d, 1=torus_2d, 2=mesh_2d_no_wrap.
+        sip_topo_w: SIP mesh width (for 2D topologies, 0 for ring).
+        sip_topo_h: SIP mesh height (for 2D topologies, 0 for ring).
+        tl: TLContext (auto-injected).
+    """
+    cube_id = tl.program_id(axis=1)
+    row = cube_id // cube_w
+    col = cube_id % cube_w
+    nbytes = n_elem * 2
+
+    pe_addr = t_ptr + cube_id * nbytes
+    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
+
+    # ── Phase 1: row reduce W → E ──
+    if col == 0:
+        tl.send(dir="E", src=acc)
+    elif col < cube_w - 1:
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+        tl.send(dir="E", src=acc)
+    else:
+        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
+        acc = acc + recv
+
+    # ── Phase 2: col reduce N → S on rightmost column ──
+    if col == cube_w - 1:
+        if row == 0:
+            tl.send(dir="S", src=acc)
+        elif row < cube_h - 1:
+            recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
+            acc = acc + recv
+            tl.send(dir="S", src=acc)
+        else:
+            recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
+            acc = acc + recv
+
+    # ── Phase 3: inter-SIP exchange on root cube ──
+    root_cube = (cube_h - 1) * cube_w + (cube_w - 1)
+    if cube_id == root_cube and n_sips > 1:
+        if sip_topo_kind == SIP_TOPO_RING:
+            acc = _inter_sip_ring(acc, n_sips, n_elem, tl)
+        elif sip_topo_kind == SIP_TOPO_TORUS:
+            acc = _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl)
+        elif sip_topo_kind == SIP_TOPO_MESH:
+            acc = _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl)
+
+    # ── Phase 4: col broadcast S → N on rightmost column ──
+    if col == cube_w - 1:
+        if row == cube_h - 1:
+            tl.send(dir="N", src=acc)
+        elif row > 0:
+            acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
+            tl.send(dir="N", src=acc)
+        else:
+            acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
+
+    # ── Phase 5: row broadcast E → W ──
+    if col == cube_w - 1:
+        tl.send(dir="W", src=acc)
+    elif col > 0:
+        acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16")
+        tl.send(dir="W", src=acc)
+    else:
+        acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16")
+
+    tl.store(pe_addr, acc)
+
+
+kernel = allreduce_intercube_multidevice
@@ -0,0 +1,127 @@
+"""CCL diagnostics: trace + pointer dump + deadlock (ADR-0023 D14).
+
+Trace
+-----
+Set ``KERNBENCH_CCL_TRACE`` to ``1``/``true``/``yes``/``on`` (case-insensitive)
+to log every CCL send/recv/credit-return event to stdout. Off by default.
+
+Pointer dump
+------------
+``pointer_dump(engine)`` returns a multi-line string showing every PE_IPCQ's
+ring buffer state (my_head, my_tail, peer_head_cache, peer_tail_cache).
+Useful for diagnosing hangs.
+
+Deadlock
+--------
+``IpcqDeadlock`` is raised by the engine when SimPy's schedule empties
+while a request is still pending — typical of unmatched send/recv pairs.
+The exception message includes the pointer dump.
+"""
+from __future__ import annotations
+
+import os
+from typing import Any
+
+
+class IpcqDeadlock(RuntimeError):
+    """Raised when the simulation cannot make further progress while a
+    CCL request is still pending (D14 F3)."""
+
+
+# ── Trace toggle ─────────────────────────────────────────────────────
+
+
+_TRACE_ENABLED: bool = False
+
+
+def reload_trace_setting() -> None:
+    """Re-read the ``KERNBENCH_CCL_TRACE`` env var."""
+    global _TRACE_ENABLED
+    val = os.environ.get("KERNBENCH_CCL_TRACE", "")
+    _TRACE_ENABLED = val.strip().lower() in {"1", "true", "yes", "on"}
+
+
+def trace_enabled() -> bool:
+    return _TRACE_ENABLED
+
+
+# Initialise once at import time
+reload_trace_setting()
+
+
+# ── Trace event functions ────────────────────────────────────────────
+
+
+def log_send(
+    t_ns: float,
+    sender: str,
+    direction: str,
+    nbytes: int,
+    sender_seq: int,
+) -> None:
+    if not _TRACE_ENABLED:
+        return
+    print(
+        f"[ccl t={t_ns:.1f} send] {sender} dir={direction} nbytes={nbytes} seq={sender_seq}",
+        flush=True,
+    )
+
+
+def log_recv(
+    t_ns: float,
+    receiver: str,
+    direction: str,
+    nbytes: int,
+) -> None:
+    if not _TRACE_ENABLED:
+        return
+    print(
+        f"[ccl t={t_ns:.1f} recv] {receiver} dir={direction} nbytes={nbytes}",
+        flush=True,
+    )
+
+
+def log_credit_return(
+    t_ns: float,
+    sender: str,
+    direction: str,
+    consumer_seq: int,
+) -> None:
+    if not _TRACE_ENABLED:
+        return
+    print(
+        f"[ccl t={t_ns:.1f} credit] {sender} dir={direction} seq={consumer_seq}",
+        flush=True,
+    )
+
+
+# ── Pointer dump ─────────────────────────────────────────────────────
+
+
+def pointer_dump(engine: Any) -> str:
+    """Return a multi-line string of every PE_IPCQ's pointer state."""
+    lines: list[str] = []
+    components = getattr(engine, "_components", {})
+    for node_id in sorted(components):
+        if not node_id.endswith(".pe_ipcq"):
+            continue
+        comp = components[node_id]
+        qps = getattr(comp, "queue_pairs", {})
+        if not qps:
+            continue
+        lines.append(node_id)
+        for d in sorted(qps):
+            qp = qps[d]
+            peer = qp["peer"]
+            lines.append(
+                f"  {d}: peer=sip{peer.sip}.cube{peer.cube}.pe{peer.pe}  "
+                f"my_head={qp['my_head']} my_tail={qp['my_tail']}  "
+                f"peer_head_cache={qp['peer_head_cache']} "
+                f"peer_tail_cache={qp['peer_tail_cache']}"
+            )
+    return "\n".join(lines)
+
+
+def print_pointer_dump(engine: Any) -> None:
+    """Convenience: print pointer_dump(engine) to stdout."""
+    print(pointer_dump(engine), flush=True)
@@ -0,0 +1,118 @@
+"""Helpers for CCL algorithm authors (ADR-0023 D15).
+
+These are pure utility functions usable from any kernel module:
+
+    from kernbench.ccl.helpers import chunked, ring_step, tree_step
+
+They keep algorithm code short and free of off-by-one bugs.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+
+_DTYPE_BYTES = {
+    "f16": 2, "fp16": 2, "float16": 2, "bf16": 2,
+    "f32": 4, "fp32": 4, "float32": 4,
+    "i8": 1, "int8": 1,
+    "i16": 2, "int16": 2,
+    "i32": 4, "int32": 4,
+}
+
+
+def _itemsize(dtype: str) -> int:
+    if dtype not in _DTYPE_BYTES:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+    return _DTYPE_BYTES[dtype]
+
+
+# ── chunked ──────────────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class Chunk:
+    """One chunk of a tensor used by collective algorithms."""
+
+    addr: int
+    n_elem: int
+    nbytes: int
+
+
+def chunked(
+    base_addr: int,
+    n_chunks: int,
+    n_elem: int,
+    dtype: str = "f16",
+) -> list[Chunk]:
+    """Slice a 1D buffer into ``n_chunks`` equal Chunks.
+
+    Args:
+        base_addr: starting address of the buffer.
+        n_chunks: number of equal chunks to produce (must be >= 1).
+        n_elem: total number of elements (must be divisible by n_chunks).
+        dtype: element type for byte-size calculation.
+
+    Returns:
+        List of ``Chunk`` objects whose addresses are consecutive.
+
+    Raises:
+        ValueError: if n_chunks is less than 1, if n_elem is not divisible
+            by n_chunks, or if dtype is not a supported element type.
+    """
+    # Reject zero/negative counts up front: 0 would surface as a bare
+    # ZeroDivisionError below, and a negative count could silently return [].
+    if n_chunks < 1:
+        raise ValueError(f"chunked: n_chunks must be >= 1, got {n_chunks}")
+    if n_elem % n_chunks != 0:
+        raise ValueError(
+            f"chunked: n_elem ({n_elem}) not divisible by n_chunks ({n_chunks})"
+        )
+    per_chunk = n_elem // n_chunks
+    stride = per_chunk * _itemsize(dtype)  # bytes per chunk == address step
+    return [
+        Chunk(addr=base_addr + i * stride, n_elem=per_chunk, nbytes=stride)
+        for i in range(n_chunks)
+    ]
+
+
+# ── ring_step ────────────────────────────────────────────────────────
+
+
+def ring_step(rank: int, step: int, world_size: int) -> tuple[int, int]:
+    """Return ``(send_chunk_idx, recv_chunk_idx)`` for a ring algorithm step.
+
+    Standard reduce-scatter / all-gather ring schedule:
+        at step s, rank r sends chunk (r - s) and receives chunk (r - s - 1)
+        modulo world_size.
+
+    Used by ring all-reduce kernels:
+
+        for step in range(world_size - 1):
+            send_idx, recv_idx = ring_step(rank, step, world_size)
+            tl.send(dir="E", src=chunks[send_idx])
+            chunks[recv_idx] += tl.recv(dir="W").data
+    """
+    send_idx = (rank - step) % world_size
+    recv_idx = (rank - step - 1) % world_size
+    return send_idx, recv_idx
+
+
+# ── tree_step ────────────────────────────────────────────────────────
+
+
+def tree_step(rank: int, world_size: int) -> dict[str, Any]:
+    """Return parent/children for binary tree rooted at rank 0.
+
+    Returns:
+        ``{"parent": int|None, "children": list[int]}``
+    """
+    parent = (rank - 1) // 2 if rank > 0 else None
+    children: list[int] = []
+    left = 2 * rank + 1
+    right = 2 * rank + 2
+    if left < world_size:
+        children.append(left)
+    if right < world_size:
+        children.append(right)
+    return {"parent": parent, "children": children}
@@ -0,0 +1,285 @@
+"""IPCQ install plan for AhbmCCLBackend (ADR-0023 D10/D11/D12).
+
+Given a ccl.yaml config, the topology, and the engine, this module:
+
+1. Loads ccl.yaml and resolves the chosen algorithm.
+2. Maps each rank to a (sip, cube, pe) PE address using a linear scheme.
+3. Allocates per-rank IPCQ ring buffer base addresses (synthetic but
+   unique-per-PE; see notes below).
+4. Builds neighbor tables via the algorithm's ``topology`` field plus the
+   optional ``neighbors()`` override hook from the algorithm module.
+5. Wires bidirectional credit-return SimPy Stores between every (PE, peer)
+   pair.
+6. Installs each PE_IPCQ component's neighbor table directly via its
+   ``_install_neighbors`` sideband call (equivalent to fan-out IpcqInitMsg
+   without going through fabric).
+
+Address scheme
+--------------
+For the first implementation we use a synthetic address scheme that
+guarantees uniqueness per (sip, cube, pe, direction) without going
+through ``PEMemAllocator``. The address is encoded as:
+
+    base = IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
+    rx_base[direction_idx] = base + direction_idx * (n_slots * slot_size)
+
+The ``buffer_kind`` (tcm/hbm/sram) selects the *MemoryStore space* into
+which data is written. Within a space, addresses are unique per PE so
+the existing MemoryStore (``{space: {addr: ndarray}}``) handles them
+naturally.
+
+This bypasses the topology's address resolver / PhysAddr encoding and
+treats IPCQ buffers as a separate, parallel address namespace. Real PA
+encoding can be plugged in later without changing the rest of the design.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import simpy
+import yaml
+
+from kernbench.ccl.topologies import resolve_topology
+from kernbench.common.ipcq_types import (
+    IpcqEndpoint,
+    IpcqInitEntry,
+)
+from kernbench.runtime_api.kernel import IpcqInitMsg
+
+
+# IPCQ synthetic address space top bit
+_IPCQ_BASE = 1 << 60
+
+
+def _ipcq_base_for_pe(sip: int, cube: int, pe: int) -> int:
+    return _IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
+
+
+# ── ccl.yaml loading ─────────────────────────────────────────────────
+
+
+def load_ccl_config(path: str | Path | None = None) -> dict:
+    """Load ccl.yaml and check that its required sections are present.
+
+    With ``path=None``, tries cwd first, then ``parents[3]`` of this file.
+    Raises FileNotFoundError if no file is found, and ValueError if the
+    document is not a mapping containing 'defaults' and 'algorithms'.
+    """
+    if path is None:
+        roots = (Path.cwd(), Path(__file__).resolve().parents[3])
+        found = (r / "ccl.yaml" for r in roots if (r / "ccl.yaml").exists())
+        path = next(found, None)
+    if path is None:
+        raise FileNotFoundError("ccl.yaml not found. Place it at project root or cwd.")
+    with open(path, encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+    if not isinstance(cfg, dict):  # an empty file loads as None, not {}
+        raise ValueError("ccl.yaml must be a mapping with 'defaults' and 'algorithms'")
+    if "defaults" not in cfg:
+        raise ValueError("ccl.yaml missing 'defaults' section")
+    if "algorithms" not in cfg:
+        raise ValueError("ccl.yaml missing 'algorithms' section")
+    return cfg
+
+
+def resolve_algorithm_config(cfg: dict, name: str | None = None) -> dict:
+    """Merge ``defaults`` with the chosen algorithm's overrides.
+
+    ``name`` falls back to ``defaults.algorithm``. Returns a new flat dict
+    (algorithm keys win) with ``"algorithm"`` set to the resolved name.
+    Raises ValueError if no algorithm is selected or it is not defined.
+    """
+    # YAML sections/entries written with no body load as None; treat as empty.
+    defaults = dict(cfg.get("defaults") or {})
+    algo_name = name or defaults.get("algorithm")
+    if algo_name is None:
+        raise ValueError("ccl.yaml: defaults.algorithm not set")
+    algos = cfg.get("algorithms") or {}
+    if algo_name not in algos:
+        raise ValueError(f"ccl.yaml: algorithm '{algo_name}' not in algorithms section")
+    merged = defaults.copy()
+    merged.update(algos[algo_name] or {})
+    merged["algorithm"] = algo_name
+    return merged
+
+
+# ── rank → PE mapping ────────────────────────────────────────────────
+
+
+def linear_rank_to_pe(rank: int, spec: dict) -> tuple[int, int, int]:
+    """Map a rank to (sip, cube, pe) using linear topology order.
+
+    Raises ValueError if rank is negative or >= the total PE count.
+    """
+    cube_mesh = spec["sip"]["cube_mesh"]
+    pe_layout = spec["cube"]["pe_layout"]
+    pes_per_cube = pe_layout["pe_per_corner"] * len(pe_layout["corners"])
+    pes_per_sip = cube_mesh["w"] * cube_mesh["h"] * pes_per_cube
+    total = spec["system"]["sips"]["count"] * pes_per_sip
+    if rank < 0:  # would otherwise floor-divide into negative indices
+        raise ValueError(f"rank must be non-negative, got {rank}")
+    if rank >= total:
+        raise ValueError(f"rank {rank} exceeds total PE count {total}")
+    sip, rem = divmod(rank, pes_per_sip)
+    cube, pe = divmod(rem, pes_per_cube)
+    return sip, cube, pe
+
+
+# ── Install plan ─────────────────────────────────────────────────────
+
+
+def install_ipcq(
+    engine: Any,
+    spec: dict,
+    cfg: dict,
+    algo_module: Any | None = None,
+    rank_to_pe: list[tuple[int, int, int]] | None = None,
+) -> dict[str, Any]:
+    """Build neighbor tables and install them in every participating PE_IPCQ.
+
+    Args:
+        engine: GraphEngine with ``_components`` dict
+        spec: topology spec dict
+        cfg: merged algorithm config (from ``resolve_algorithm_config``)
+        algo_module: optional algorithm Python module (for neighbors override)
+        rank_to_pe: optional explicit rank → (sip, cube, pe) mapping. If
+                    None, the default linear mapping is used.
+
+    Returns:
+        A diagnostics dict with the install plan (rank → PE map, neighbor table).
+    """
+    if "world_size" in cfg:
+        world_size = int(cfg["world_size"])
+    else:
+        # Topology-derived fallback (mirrors AhbmCCLBackend / RuntimeContext).
+        sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+        cm = spec.get("sip", {}).get("cube_mesh", {})
+        cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1))
+        pl = spec.get("cube", {}).get("pe_layout", {})
+        corners = pl.get("corners", [])
+        pe_per_corner = int(pl.get("pe_per_corner", 1))
+        pes_per_cube = pe_per_corner * max(len(corners), 1)
+        world_size = sips * cubes_per_sip * pes_per_cube
+    buffer_kind = cfg["buffer_kind"]
+    n_slots = int(cfg["n_slots"])
+    slot_size = int(cfg["slot_size"])
+    backpressure = cfg["backpressure"]
+    credit_size_bytes = int(cfg.get("ipcq_credit_size_bytes", 16))
+
+    # Step 1: rank → (sip, cube, pe)
+    if rank_to_pe is not None:
+        if len(rank_to_pe) != world_size:
+            raise ValueError(
+                f"rank_to_pe has {len(rank_to_pe)} entries but world_size={world_size}"
+            )
+        rank_pe = list(rank_to_pe)
+    else:
+        rank_pe: list[tuple[int, int, int]] = [
+            linear_rank_to_pe(r, spec) for r in range(world_size)
+        ]
+    pe_to_rank = {(s, c, p): r for r, (s, c, p) in enumerate(rank_pe)}
+
+    # Step 2: resolve topology fn (with optional override)
+    topo_fn = resolve_topology(cfg["topology"], algo_module=algo_module)
+
+    # Build per-rank neighbor map
+    neighbor_table: dict[int, dict[str, int]] = {}
+    for r in range(world_size):
+        neighbor_table[r] = topo_fn(r, world_size)
+
+    # Step 3: pull the live engine reference for each PE_IPCQ
+    components = engine._components
+    pe_ipcq_id = lambda s, c, p: f"sip{s}.cube{c}.pe{p}.pe_ipcq"
+
+    # Step 4: per-PE rx_base address and per-PE credit_inbox
+    direction_keys = sorted({d for nt in neighbor_table.values() for d in nt})
+    direction_idx = {d: i for i, d in enumerate(direction_keys)}
+    bytes_per_direction = n_slots * slot_size
+
+    def rx_base(s: int, c: int, p: int, d: str) -> int:
+        return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
+
+    # Wire bidirectional credit stores: backend creates the SimPy Stores
+    # by reading each rank's PE_IPCQ.credit_inbox property.
+    rank_to_credit_inbox: dict[int, simpy.Store] = {}
+    for r, (s, c, p) in enumerate(rank_pe):
+        comp = components[pe_ipcq_id(s, c, p)]
+        # Trigger lazy creation of credit_inbox if not yet started.
+        # PE_IPCQ.start() creates it; we ensure it exists.
+        if comp._credit_inbox is None:
+            comp._credit_inbox = simpy.Store(engine._env)
+        rank_to_credit_inbox[r] = comp.credit_inbox
+
+    # Step 5: build IpcqInitMsg per rank and call _install_neighbors directly
+    plan: dict[str, Any] = {
+        "world_size": world_size,
+        "rank_to_pe": rank_pe,
+        "buffer_kind": buffer_kind,
+        "neighbor_table": neighbor_table,
+    }
+
+    _OPPOSITE_DIR = {
+        "E": "W", "W": "E", "N": "S", "S": "N",
+        "global_E": "global_W", "global_W": "global_E",
+        "global_N": "global_S", "global_S": "global_N",
+    }
+
+    def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
+        """Find peer's direction that reciprocates my_dir→peer_rank.
+
+        Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
+        pointing back to us (ADR-0025 D1). This matters in 2-rank
+        bidirectional rings where both E and W on one side point to the
+        same peer — without the preference, dict-order first-match would
+        route data into the wrong rx slot. Falls back to any direction
+        pointing back for topologies without an opposite convention
+        (e.g. tree_binary's parent/child).
+        """
+        nt = neighbor_table[peer_rank]
+        opp = _OPPOSITE_DIR.get(my_dir)
+        if opp is not None and nt.get(opp) == my_rank:
+            return opp
+        for d, target in nt.items():
+            if target == my_rank:
+                return d
+        return None
+
+    for r, (s, c, p) in enumerate(rank_pe):
+        my_pe_ipcq = components[pe_ipcq_id(s, c, p)]
+        nbrs = neighbor_table[r]
+        entries: list[IpcqInitEntry] = []
+        for d, peer_rank in nbrs.items():
+            if peer_rank is None:
+                continue
+            peer_s, peer_c, peer_p = rank_pe[peer_rank]
+            peer_dir = reverse_direction(r, peer_rank, d)
+            if peer_dir is None:
+                # Peer doesn't have a reverse entry — skip (asymmetric topology)
+                continue
+            peer_endpoint = IpcqEndpoint(
+                sip=peer_s, cube=peer_c, pe=peer_p,
+                buffer_kind=buffer_kind,
+                rx_base_pa=rx_base(peer_s, peer_c, peer_p, peer_dir),
+                rx_base_va=0,
+                n_slots=n_slots, slot_size=slot_size,
+            )
+            entries.append(IpcqInitEntry(
+                direction=d,
+                peer=peer_endpoint,
+                my_rx_base_pa=rx_base(s, c, p, d),
+                my_rx_base_va=0,
+                n_slots=n_slots, slot_size=slot_size,
+                peer_credit_store=rank_to_credit_inbox[peer_rank],
+            ))
+        msg = IpcqInitMsg(
+            correlation_id="ccl_init", request_id=f"init_r{r}",
+            target_sips=(s,), target_cubes=(c,), target_pe=p,
+            entries=tuple(entries),
+            backpressure_mode=backpressure,
+            buffer_kind=buffer_kind,
+            credit_size_bytes=credit_size_bytes,
+        )
+        my_pe_ipcq._install_neighbors(msg)
+
+    return plan
@@ -0,0 +1,104 @@
+"""SFR configuration for intercube + inter-SIP IPCQ wiring.
+
+Provides ``configure_sfr_intercube_multisip`` which programs PE_IPCQ
+neighbor tables for:
+
+  1. Intercube within each SIP — pe0 of every cube connects to pe0 of
+     its N/S/E/W mesh neighbors (no wrap-around).
+  2. Inter-SIP on ALL cubes — pe0 of cube_c on sip_A connects to pe0 of
+     cube_c on each peer SIP, using ``global_E``/``global_W`` (ring) or
+     ``global_N``/``global_S``/``global_E``/``global_W`` (mesh/torus)
+     direction labels.  Wiring all cubes allows the kernel to
+     dynamically elect the root cube at runtime.
+
+SIP-level topology is read from ``topology.yaml`` →
+``system.sips.topology`` (e.g. ``ring_1d``, ``mesh_2d``).
+Intercube mesh dimensions come from ``sip.cube_mesh.w/h``.
+
+Internally delegates to ``install_ipcq`` with a computed ``rank_to_pe``
+(pe0-only) and a closure-captured ``neighbors()`` function.
+"""
+from __future__ import annotations
+
+import types
+from typing import Any
+
+from kernbench.ccl.install import install_ipcq
+from kernbench.ccl.topologies import _BUILTIN as _TOPO_BUILTINS
+
+
+def configure_sfr_intercube_multisip(
+    engine: Any,
+    spec: dict,
+    cfg: dict,
+) -> dict[str, Any]:
+    """Wire IPCQ for intercube (pe0, mesh) + inter-SIP (pe0, all cubes).
+
+    Args:
+        engine: GraphEngine with ``_components``.
+        spec: topology spec dict (from topology.yaml).
+        cfg: merged algorithm config (from ``resolve_algorithm_config``).
+
+    Returns:
+        The install plan dict from ``install_ipcq``.
+    """
+    cm = spec["sip"]["cube_mesh"]
+    mesh_w = int(cm["w"])
+    mesh_h = int(cm["h"])
+    n_cubes = mesh_w * mesh_h
+    n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
+    sip_topology = str(
+        spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
+    )
+
+    if sip_topology not in _TOPO_BUILTINS:
+        raise ValueError(
+            f"Unknown sip topology '{sip_topology}'. "
+            f"Available: {list(_TOPO_BUILTINS)}"
+        )
+    sip_topo_fn = _TOPO_BUILTINS[sip_topology]
+
+    world_size = n_sips * n_cubes
+    pe_idx_to_pe: list[tuple[int, int, int]] = [
+        (sip, cube, 0)
+        for sip in range(n_sips)
+        for cube in range(n_cubes)
+    ]
+
+    def _neighbors(pe_idx: int, ws: int, _base: dict) -> dict[str, int]:
+        sip = pe_idx // n_cubes
+        cube = pe_idx % n_cubes
+        row = cube // mesh_w
+        col = cube % mesh_w
+
+        nbrs: dict[str, int] = {}
+
+        # Intercube within SIP (mesh, no wrap-around)
+        if col < mesh_w - 1:
+            nbrs["E"] = sip * n_cubes + (row * mesh_w + col + 1)
+        if col > 0:
+            nbrs["W"] = sip * n_cubes + (row * mesh_w + col - 1)
+        if row < mesh_h - 1:
+            nbrs["S"] = sip * n_cubes + ((row + 1) * mesh_w + col)
+        if row > 0:
+            nbrs["N"] = sip * n_cubes + ((row - 1) * mesh_w + col)
+
+        # Inter-SIP on ALL cubes
+        if n_sips > 1:
+            sip_nbrs = sip_topo_fn(sip, n_sips)
+            for d, peer_sip in sip_nbrs.items():
+                nbrs[f"global_{d}"] = peer_sip * n_cubes + cube
+
+        return nbrs
+
+    mock_module = types.SimpleNamespace(neighbors=_neighbors)
+
+    cfg_copy = dict(cfg)
+    cfg_copy["world_size"] = world_size
+    cfg_copy["topology"] = "none"
+
+    return install_ipcq(
+        engine, spec, cfg_copy,
+        algo_module=mock_module,
+        rank_to_pe=pe_idx_to_pe,
+    )
@@ -0,0 +1,163 @@
+"""Builtin neighbor topology generators for CCL backend (ADR-0023 D11).
+
+Each generator takes ``(rank, world_size)`` and returns a
+``dict[direction, peer_rank]`` for that rank. ``direction`` is one of
+``"N" | "S" | "E" | "W"`` for ring/mesh, or
+``"parent" | "child_left" | "child_right"`` for tree topologies.
+
+Algorithm modules may override the generated map by defining a
+``neighbors(rank, world_size, neighbor_map) -> dict | None`` function in
+the same module (see D11 / D15). ``resolve_topology`` wires these together.
+"""
+from __future__ import annotations
+
+from typing import Any, Callable
+
+NeighborMap = dict[str, int]
+TopologyFn = Callable[[int, int], NeighborMap]
+
+
+# ── Builtin generators ───────────────────────────────────────────────
+
+
+def ring_1d(rank: int, world_size: int) -> NeighborMap:
+    """1D bidirectional ring (E/W)."""
+    return {
+        "E": (rank + 1) % world_size,
+        "W": (rank - 1) % world_size,
+    }
+
+
+def ring_1d_unidir(rank: int, world_size: int) -> NeighborMap:
+    """1D unidirectional ring (E only)."""
+    return {"E": (rank + 1) % world_size}
+
+
+def mesh_2d(rank: int, world_size: int) -> NeighborMap:
+    """Square 2D mesh (N/S/E/W).
+
+    Layout: rank = row * side + col, with side = sqrt(world_size).
+    Wrap-around (torus) on all four edges.
+    """
+    side = int(round(world_size ** 0.5))
+    if side * side != world_size:
+        raise ValueError(
+            f"mesh_2d requires square world_size, got {world_size}"
+        )
+    r, c = divmod(rank, side)
+    return {
+        "N": ((r - 1) % side) * side + c,
+        "S": ((r + 1) % side) * side + c,
+        "W": r * side + (c - 1) % side,
+        "E": r * side + (c + 1) % side,
+    }
+
+
+def tree_binary(rank: int, world_size: int) -> NeighborMap:
+    """Binary tree rooted at rank 0.
+
+    Children of rank r are 2r+1 and 2r+2 (if within world_size).
+    Parent of rank r > 0 is (r-1)//2.
+    Returned keys (only those that exist):
+        "parent", "child_left", "child_right"
+    """
+    n: NeighborMap = {}
+    if rank > 0:
+        n["parent"] = (rank - 1) // 2
+    left = 2 * rank + 1
+    right = 2 * rank + 2
+    if left < world_size:
+        n["child_left"] = left
+    if right < world_size:
+        n["child_right"] = right
+    return n
+
+
+def torus_2d(rank: int, world_size: int) -> NeighborMap:
+    """Square 2D torus (N/S/E/W) with wrap-around on all edges.
+
+    Alias for mesh_2d (which already wraps). Explicit name for clarity
+    when used as a SIP-level topology.
+    """
+    return mesh_2d(rank, world_size)
+
+
+def mesh_2d_no_wrap(rank: int, world_size: int) -> NeighborMap:
+    """Square 2D mesh (N/S/E/W) WITHOUT wrap-around.
+
+    Edge nodes have fewer neighbors (no wrapping). Used for SIP-level
+    topologies where physical links don't wrap.
+    """
+    side = int(round(world_size ** 0.5))
+    if side * side != world_size:
+        raise ValueError(
+            f"mesh_2d_no_wrap requires square world_size, got {world_size}"
+        )
+    r, c = divmod(rank, side)
+    n: NeighborMap = {}
+    if r > 0:
+        n["N"] = (r - 1) * side + c
+    if r < side - 1:
+        n["S"] = (r + 1) * side + c
+    if c > 0:
+        n["W"] = r * side + (c - 1)
+    if c < side - 1:
+        n["E"] = r * side + (c + 1)
+    return n
+
+
+def none(rank: int, world_size: int) -> NeighborMap:
+    """Empty map — algorithm's neighbors() must build from scratch."""
+    return {}
+
+
+_BUILTIN: dict[str, TopologyFn] = {
+    "ring_1d": ring_1d,
+    "ring_1d_unidir": ring_1d_unidir,
+    "mesh_2d": mesh_2d,
+    "torus_2d": torus_2d,
+    "mesh_2d_no_wrap": mesh_2d_no_wrap,
+    "tree_binary": tree_binary,
+    "none": none,
+}
+
+
+# ── Resolution ───────────────────────────────────────────────────────
+
+
+def resolve_topology(
+    name: str, algo_module: Any | None = None,
+) -> TopologyFn:
+    """Return a callable ``(rank, world_size) -> NeighborMap``.
+
+    Args:
+        name: builtin topology name from ccl.yaml. Must be one of
+              ``ring_1d``, ``ring_1d_unidir``, ``mesh_2d``, ``torus_2d``,
+              ``mesh_2d_no_wrap``, ``tree_binary``, or ``none``.
+        algo_module: optional algorithm module. If it defines
+              ``neighbors(rank, world_size, neighbor_map)``, that hook is
+              invoked after the builtin to override the result.
+              Returning None from neighbors() leaves the builtin map
+              unchanged; returning a dict replaces it.
+
+    Raises:
+        ValueError: if ``name`` is not a known builtin.
+    """
+    if name not in _BUILTIN:
+        raise ValueError(
+            f"Unknown topology '{name}'. "
+            f"Available builtins: {list(_BUILTIN)}"
+        )
+    builtin_fn = _BUILTIN[name]
+    override_fn = getattr(algo_module, "neighbors", None) if algo_module else None
+    if override_fn is None or not callable(override_fn):
+        return builtin_fn
+
+    def _wrapped(rank: int, world_size: int) -> NeighborMap:
+        base = builtin_fn(rank, world_size)
+        result = override_fn(rank, world_size, base)
+        if result is None:
+            return base
+        return result
+
+    return _wrapped
@@ -21,6 +21,10 @@ def build_parser() -> argparse.ArgumentParser:
    runp.add_argument(
        "--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
    )
+    runp.add_argument(
+        "--verify-data", action="store_true", default=False,
+        help="Enable Phase 2 data verification (ADR-0020)",
+    )
    runp.set_defaults(_handler=cmd_run)

    probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
@@ -36,9 +40,11 @@ def build_parser() -> argparse.ArgumentParser:
    return p


-def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
+def engine_factory(
+    topology: object, device: DeviceSelector, *, enable_data: bool = False,
+) -> SimEngine:
    topo_obj = getattr(topology, "topology_obj", topology)
-    return GraphEngine(topo_obj)
+    return GraphEngine(topo_obj, enable_data=enable_data)


 def cmd_web(args) -> int:
@@ -53,8 +59,12 @@ def cmd_run(args) -> int:
    topo = resolve_topology(args.topology)
    bench = resolve_bench(args.bench)
    device = resolve_device(args.device)
+    verify_data = getattr(args, "verify_data", False)

-    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=engine_factory)
+    def _factory(topology, device):
+        return engine_factory(topology, device, enable_data=verify_data)
+
+    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory)

    topo_obj = getattr(topo, "topology_obj", topo)
    spec = getattr(topo_obj, "spec", None)
@@ -62,6 +72,18 @@ def cmd_run(args) -> int:
        print(format_report(result.traces, title=args.bench, spec=spec))
    print(result.summary_text())

+    # Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
+    # already runs inside engine.wait() → _flush_data_phase(). We only
+    # print the summary here; no redundant re-execution.
+    if verify_data and result.engine is not None:
+        op_log = result.engine.op_log
+        if op_log:
+            n_gemm = sum(1 for r in op_log if r.op_kind == "gemm")
+            n_math = sum(1 for r in op_log if r.op_kind == "math")
+            print(f"[data] Phase 2 complete: {len(op_log)} ops ({n_gemm} gemm, {n_math} math)")
+        else:
+            print("[data] No op_log recorded — skipping Phase 2")
+
    return 0 if result.completion.ok else 1


@@ -116,7 +116,7 @@ def _fmt_util(eff: float, bn: float | None) -> str:


 def _short_name(node_id: str) -> str:
-    """Shorten node id: keep last 2 segments to avoid ambiguity (xbar.pe0 vs pe0)."""
+    """Shorten node id: keep last 2 segments to avoid ambiguity (router.pe0 vs pe0)."""
    parts = node_id.split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else node_id

@@ -366,7 +366,7 @@ def run_probe(topology_path: str, case_filter: str | None = None) -> int:

    # --- PE DMA Summary Table ---
    print()
-    print(f"=== PE DMA Latency (pe_dma -> xbar -> HBM, data={nbytes}B) ===")
+    print(f"=== PE DMA Latency (pe_dma -> router -> HBM, data={nbytes}B) ===")
    print(f"  {'Case':<26} {'Target':<28} {'Actual':>8}"
          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
@@ -137,7 +137,7 @@ def _extract_peaks(spec: dict | None) -> tuple[float, float]:
    gemm_attrs = comps.get("pe_gemm", {}).get("attrs", {})
    peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
    cube_links = cube.get("links", {})
-    hbm_bw = float(cube_links.get("xbar_to_hbm_bw_gbs", 0.0))
+    hbm_bw = float(cube_links.get("hbm_to_router_bw_gbs", 0.0))
    return peak_tflops, hbm_bw


@@ -0,0 +1,249 @@
+"""IPCQ schemas and exceptions (ADR-0023 D2.5, D12, D14 F1).
+
+This module contains the data structures and exceptions used by the
+PE-level IPCQ collective communication infrastructure. The host-facing
+sideband fan-out message ``IpcqInitMsg`` lives in
+``kernbench.runtime_api.kernel`` (alongside other fabric messages),
+while all internal token / metadata / command schemas are kept here.
+
+Layering:
+    PE_CPU       --IpcqRequest(IpcqSendCmd|IpcqRecvCmd)--> PE_IPCQ
+    PE_IPCQ      --IpcqDmaToken-->                         PE_DMA (vc_comm)
+    PE_DMA       --IpcqMetaArrival-->                      PE_IPCQ (atomic, D9)
+    PE_IPCQ      --IpcqCreditMetadata-->                   peer PE_IPCQ (fast path, D9)
+
+See ADR-0023 for the full design.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Union
+
+if TYPE_CHECKING:
+    import simpy
+
+
+# ── D14 F1: invalid direction exception ──────────────────────────────
+
+
+class IpcqInvalidDirection(ValueError):
+    """Raised when a kernel calls tl.send/recv with a direction that
+    has no neighbor installed for this PE."""
+
+
+# ── D2.5: IpcqEndpoint ───────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class IpcqEndpoint:
+    """All information the sender needs to compute the peer's rx_buffer address (D2.5).
+
+    Sender PE_IPCQ uses this to compute the destination PA for its DMA
+    write into the peer's rx ring buffer slot:
+
+        slot_idx = sender.my_head % peer.n_slots
+        dst_pa   = peer.rx_base_pa + slot_idx * peer.slot_size
+    """
+
+    sip: int                     # destination SIP
+    cube: int                    # destination cube
+    pe: int                      # destination PE (cube-local index)
+    buffer_kind: str             # "tcm" | "hbm" | "sram"
+    rx_base_pa: int              # peer rx_buffer base PA (PhysAddr.encode())
+    rx_base_va: int              # peer rx_buffer base VA (optional, MMU)
+    n_slots: int                 # peer ring depth (wrap-around modulo)
+    slot_size: int               # peer slot size (offset multiplier)
+
+
+# ── D12: IpcqInitEntry (used by IpcqInitMsg in kernel.py) ────────────
+
+
+@dataclass(frozen=True)
+class IpcqInitEntry:
+    """One direction's neighbor entry that backend installs into a PE_IPCQ
+    via IpcqInitMsg (kernbench.runtime_api.kernel.IpcqInitMsg, D12).
+    """
+
+    direction: str               # "N" | "S" | "E" | "W"
+    peer: IpcqEndpoint           # see D2.5
+    my_rx_base_pa: int           # this PE's own rx_buffer base
+    my_rx_base_va: int           # this PE's own rx_buffer base VA (optional)
+    n_slots: int                 # this PE's ring depth
+    slot_size: int               # this PE's slot size
+    # Credit fast path channel (D9).
+    # Contract: must be a simpy.Store instance dedicated to receiving
+    # IpcqCreditMetadata objects only. Backend wires it once at init time
+    # and the receiving PE_IPCQ owns its consumer side; the sender (peer's
+    # PE_IPCQ) puts IpcqCreditMetadata directly into this store via
+    # _delayed_credit_send. Do not put any other object type.
+    peer_credit_store: "simpy.Store"
+
+
+# ── D12: IpcqSendCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
+
+
+@dataclass(frozen=True)
+class IpcqSendCmd:
+    """tl.send command issued by the kernel to PE_IPCQ."""
+
+    direction: str               # "N" | "S" | "E" | "W"
+    src_addr: int                # source data address (TCM/HBM/SRAM)
+    src_space: str               # "tcm" | "hbm" | "sram"
+    nbytes: int
+    shape: tuple[int, ...]       # data shape (op_log + MemoryStore use)
+    dtype: str
+    handle_id: str               # completion tracking
+    # In-flight data snapshot captured at tl.send() time from the
+    # TensorHandle.data field. Carries the actual numpy array that was
+    # visible at recv-time (when handle.data was populated), avoiding a
+    # Phase 1 race where a later IPCQ inbound overwrites the sender's
+    # slot between recv and send. If None, PE_DMA outbound falls back to
+    # reading MemoryStore[src_addr] (correct for sources that are never
+    # overwritten, such as HBM tiles).
+    data: Any = None
+    data_op: bool = True         # ADR-0020 op_log recording flag
+
+
+# ── D12: IpcqRecvCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
+
+
+@dataclass(frozen=True)
+class IpcqRecvCmd:
+    """tl.recv command issued by the kernel to PE_IPCQ.
+
+    Two modes (recv_mode):
+        "return_slot" — return slot address as-is (default, zero-copy).
+                        Kernel uses the slot memory directly.
+        "copy_to_dst" — copy slot data to dst_addr, then return.
+    """
+
+    direction: str | None        # None → round-robin (weak fairness, D4)
+    shape: tuple[int, ...]
+    dtype: str
+    handle_id: str
+    recv_mode: str = "return_slot"
+    dst_addr: int = 0            # used only when recv_mode == "copy_to_dst"
+    dst_space: str = ""          # used only when recv_mode == "copy_to_dst"
+    blocking: bool = True
+    data_op: bool = True
+
+
+# ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ───────────────────
+
+
+@dataclass
+class IpcqDmaToken:
+    """Token sent from PE_IPCQ to PE_DMA (vc_comm channel) carrying both
+    the data move request and the piggyback metadata (ADR-0023 D9).
+
+    Receiving PE_DMA processes this atomically (I6 MUST):
+        1. MemoryStore.write(dst_endpoint.buffer_kind, dst_addr, data)
+        2. Forward IpcqMetaArrival(token=self) to peer PE_IPCQ
+    No yield is allowed between the two steps.
+
+    The ``data`` field is a snapshot taken by the sender's PE_DMA at the
+    moment the send is issued. This preserves "in-flight data" semantics:
+    if the sender mutates its source memory after issuing the send but
+    before arrival, the receiver still gets the snapshot. The snapshot is
+    None for control-only tokens (e.g. credit-only updates).
+    """
+
+    # ── Data movement (single-hop DMA write) ──
+    src_addr: int
+    src_space: str
+    dst_addr: int                # already-computed peer rx slot PA
+    dst_endpoint: IpcqEndpoint   # routing target (sip/cube/pe) + buffer_kind
+    nbytes: int
+    handle_id: str               # completion notify back to sender PE_IPCQ
+    # Optional shape/dtype carried for op_log + MemoryStore convenience.
+    shape: tuple[int, ...] = ()
+    dtype: str = "f16"
+    # In-flight data snapshot (sender PE_DMA captures this at send time).
+    data: Any = None
+
+    # ── Piggyback metadata (D9) ──
+    sender_seq: int = 0          # monotonic; receiver updates peer_head_cache
+    src_sip: int = 0
+    src_cube: int = 0
+    src_pe: int = 0
+    src_direction: str = "E"     # sender-side direction; receiver maps to its own
+
+    data_op: bool = True
+
+
+# ── D12: IpcqMetaArrival (PE_DMA → PE_IPCQ, intra-PE wire) ──────────
+
+
+@dataclass
+class IpcqMetaArrival:
+    """Posted by receiving PE_DMA into the destination PE's PE_IPCQ inbox
+    in the same SimPy step as the MemoryStore.write (D9, I6 MUST).
+
+    The receiver PE_IPCQ uses ``token.sender_seq`` to update its
+    peer_head_cache for the corresponding direction.
+    """
+
+    token: IpcqDmaToken
+
+
+# ── D12: IpcqCreditMetadata (PE_IPCQ → peer PE_IPCQ, fast path) ─────
+
+
+@dataclass(frozen=True)
+class IpcqCreditMetadata:
+    """Credit return — recv-side → send-side fast path (D9).
+
+    Sent by ``PeIpcqComponent._delayed_credit_send`` after a
+    bottleneck-BW based latency, putting the metadata directly into
+    the peer's pre-wired credit store (no fabric routing).
+
+    ``dst_rx_base_pa`` is the receiver's ``my_rx_base_pa`` for the direction
+    whose slot was consumed. The original sender matches this against
+    ``qp.peer.rx_base_pa`` to find the correct direction (ADR-0025 D3) —
+    unambiguous even when multiple directions share the same peer (e.g.
+    2-rank bidirectional ring).
+    """
+
+    consumer_seq: int            # my_tail at recv side (new tail value)
+    dst_rx_base_pa: int          # receiver-side my_rx_base_pa (ADR-0025 D3)
+    src_sip: int                 # which peer is sending the credit (diag)
+    src_cube: int
+    src_pe: int
+    src_direction: str           # sender-side direction (peer maps to its own)
+
+
+# ── Request wrapper (PE_CPU → PE_IPCQ) ───────────────────────────────
+
+
+@dataclass
+class IpcqRequest:
+    """Wrapper carrying an IpcqSendCmd or IpcqRecvCmd plus a SimPy completion
+    event. Posted by PE_CPU into PE_IPCQ's inbox; PE_IPCQ calls
+    ``done.succeed()`` when the request is fully processed.
+
+    For recv requests, the result (slot address, direction, dtype, shape)
+    is written into ``result_data`` so the caller can read it after wait.
+    """
+
+    command: "IpcqSendCmd | IpcqRecvCmd"
+    done: "simpy.Event"
+    result_data: dict[str, Any] = field(default_factory=dict)
+
+
+# ── RecvFuture (kernel ↔ runner handshake for tl.recv_async / tl.wait) ─
+
+
+@dataclass
+class RecvFuture:
+    """Opaque future returned by ``tl.recv_async``.
+
+    The KernelRunner attaches a SimPy event and the IpcqRequest in the
+    background; ``tl.wait(future)`` switches back to the runner which
+    yields on the event and resolves the result into a TensorHandle.
+    """
+
+    cmd: "IpcqRecvCmd"
+    request: Any = None         # IpcqRequest (set by runner)
+    event: Any = None           # simpy.Event (set by runner)
+    resolved: bool = False
+    result: Any = None          # cached TensorHandle after wait()
@@ -33,6 +33,7 @@ class TensorHandle:
    dtype: str
    nbytes: int                      # total byte size
    data: object = None              # reserved for validate mode
+    space: str = "tcm"               # MemoryStore space ("tcm" | "hbm" | "sram")


@dataclass(frozen=True)
@@ -55,6 +56,7 @@ class DmaReadCmd:
    handle: TensorHandle
    src_addr: int
    nbytes: int
+    data_op: bool = True


@dataclass(frozen=True)
@@ -64,6 +66,7 @@ class DmaWriteCmd:
    handle: TensorHandle
    dst_addr: int
    nbytes: int
+    data_op: bool = True


@dataclass(frozen=True)
@@ -79,6 +82,7 @@ class GemmCmd:
    m: int
    k: int
    n: int
+    data_op: bool = True


@dataclass(frozen=True)
@@ -94,6 +98,7 @@ class MathCmd:
    inputs: tuple[TensorHandle, ...]
    out: TensorHandle
    axis: int | None = None          # for reductions
+    data_op: bool = True


@dataclass(frozen=True)
@@ -111,6 +116,7 @@ class CompositeCmd:
    out_addr: int
    out_nbytes: int
    math_op: str | None = None       # for op="math": which math operation
+    data_op: bool = True


@dataclass(frozen=True)
@@ -33,6 +33,7 @@ class ComponentBase(ABC):
        self.ctx = ctx
        self.in_ports: dict[str, simpy.Store] = {}
        self.out_ports: dict[str, simpy.Store] = {}
+        self._op_logger: Any | None = None  # OpLogger, set by GraphEngine if enabled

    def start(self, env: simpy.Environment) -> None:
        """Called once after all ports are wired.
@@ -64,9 +65,21 @@ class ComponentBase(ABC):
            txn: Any = yield self._inbox.get()
            env.process(self._forward_txn(env, txn))

+    def _on_process_start(self, env: simpy.Environment, msg: Any) -> None:
+        """Op log hook: record service start for data_op messages (ADR-0020 D2)."""
+        if self._op_logger and getattr(msg, "data_op", False):
+            self._op_logger.record_start(env.now, self.node.id, msg)
+
+    def _on_process_end(self, env: simpy.Environment, msg: Any) -> None:
+        """Op log hook: record service end for data_op messages (ADR-0020 D2)."""
+        if self._op_logger and getattr(msg, "data_op", False):
+            self._op_logger.record_end(env.now, self.node.id, msg)
+
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Apply run() latency, then forward to next hop or drain at terminal."""
+        self._on_process_start(env, txn)
        yield from self.run(env, txn.nbytes)
+        self._on_process_end(env, txn)
        next_hop = txn.next_hop          # duck-typed: Transaction.next_hop
        if next_hop:
            yield self.out_ports[next_hop].put(txn.advance())
@@ -120,10 +133,16 @@ class PeEngineBase(ComponentBase):
        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, PeInternalTxn):
-                env.process(self.handle_command(env, msg))
+                env.process(self._handle_with_hooks(env, msg))
            else:
                env.process(self._forward_txn(env, msg))

+    def _handle_with_hooks(self, env: simpy.Environment, pe_txn: Any) -> Generator:
+        """Wrap handle_command with op log hooks on the inner command."""
+        self._on_process_start(env, pe_txn.command)
+        yield from self.handle_command(env, pe_txn)
+        self._on_process_end(env, pe_txn.command)
+
    @abstractmethod
    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Process a PE-internal command (PeInternalTxn).
@@ -114,7 +114,7 @@ class HbmCtrlComponent(ComponentBase):

            parts = self.node.id.split(".")
            cube_id = int(parts[1].replace("cube", ""))
-            pe_id = int(parts[3].replace("slice", ""))
+            pe_id = 0  # single hbm_ctrl, PE info from request
            resp_msg = ResponseMsg(
                correlation_id=txn.request.correlation_id,
                request_id=txn.request.request_id,
@@ -238,14 +238,11 @@ class MCpuComponent(ComponentBase):
    def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
        """Return list of HBM destination node_ids for DMA fan-out.

-        Uses PA-based resolution to determine the actual target cube and slice,
-        enabling cross-cube DMA routing when the PA points to a remote cube.
+        With single hbm_ctrl per cube (ADR-0019), always returns one node.
+        PA-based resolution is still used for cross-cube routing.
        """
        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"

-        if isinstance(target_pe, int):
-            return [f"{cube_prefix}.hbm_ctrl.slice{target_pe}"]
-
        # PA-based resolution: extract actual target from physical address
        pa_val = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None)
        if pa_val is not None:
@@ -256,12 +253,8 @@ class MCpuComponent(ComponentBase):
            except Exception:
                pass

-        # "all" without PA (KernelLaunch): all slices in local cube
-        n_slices = 8
-        if self.ctx and self.ctx.spec:
-            mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
-            n_slices = mm.get("hbm_slices_per_cube", 8)
-        return [f"{cube_prefix}.hbm_ctrl.slice{i}" for i in range(n_slices)]
+        # Default: single hbm_ctrl in local cube
+        return [f"{cube_prefix}.hbm_ctrl"]

    def _mmu_msg_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out MmuMapMsg/MmuUnmapMsg to target PE_MMU(s) via NOC.
@@ -1,224 +0,0 @@
-from __future__ import annotations
-
-from collections.abc import Generator
-from typing import TYPE_CHECKING, Any
-
-import simpy
-
-from kernbench.components.base import ComponentBase
-
-if TYPE_CHECKING:
-    from kernbench.components.context import ComponentContext
-    from kernbench.topology.types import Node
-
-
-class TwoDMeshNocComponent(ComponentBase):
-    """2D mesh NOC modeled as a single smart node.
-
-    Latency model:
-      - Traversal latency = Manhattan distance between prev_hop and next_hop
-        node positions, split into XY segments, traversed with pipeline.
-      - overhead_ns (from node.attrs) is added once per traversal.
-
-    Contention model:
-      - Each directed XY segment is a simpy.Resource(capacity=1).
-      - Pipeline: next segment's resource is requested before the current
-        segment's timeout completes, so a free downstream segment is acquired
-        immediately (wormhole-style cut-through).
-      - Two transactions sharing a segment (same row or column band) contend.
-
-    Concurrency:
-      - _worker spawns an independent SimPy process per transaction, so the
-        NOC is never serialized at the node level — only at segment resources.
-    """
-
-    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
-        super().__init__(node, ctx)
-        self._env: simpy.Environment | None = None
-        self._links: dict[tuple, simpy.Resource] = {}
-        self._x_grid: list[float] = []
-        self._y_grid: list[float] = []
-
-    def start(self, env: simpy.Environment) -> None:
-        self._env = env
-        self._build_grid()
-        super().start(env)
-
-    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
-        yield env.timeout(0)
-
-    # ── Grid construction ────────────────────────────────────────────
-
-    def _build_grid(self) -> None:
-        if not self.ctx:
-            return
-        mesh = self.ctx.spec.get("_mesh") if self.ctx.spec else None
-        if mesh:
-            self._build_grid_from_mesh(mesh)
-        else:
-            self._build_grid_from_positions()
-
-    def _build_grid_from_mesh(self, mesh: dict) -> None:
-        """Build XY grid from cube_mesh.yaml router positions (authoritative)."""
-        origin_x, origin_y = self._cube_origin()
-        xs: set[float] = set()
-        ys: set[float] = set()
-        for key, router in mesh.get("routers", {}).items():
-            if router is not None:
-                xs.add(round(origin_x + router["pos_mm"][0], 2))
-                ys.add(round(origin_y + router["pos_mm"][1], 2))
-        self._x_grid = sorted(xs)
-        self._y_grid = sorted(ys)
-
-    def _build_grid_from_positions(self) -> None:
-        """Fallback: infer grid from all node positions in the cube."""
-        cube_prefix = self.node.id.rsplit(".", 1)[0]
-        xs: set[float] = set()
-        ys: set[float] = set()
-        for node_id, pos in self.ctx.positions.items():
-            if node_id.startswith(cube_prefix + ".") and pos is not None:
-                xs.add(round(pos[0], 2))
-                ys.add(round(pos[1], 2))
-        self._x_grid = sorted(xs)
-        self._y_grid = sorted(ys)
-
-    def _cube_origin(self) -> tuple[float, float]:
-        """Compute absolute origin (top-left) of this cube from cube_id."""
-        parts = self.node.id.split(".")
-        cube_str = [p for p in parts if p.startswith("cube")][0]
-        cube_id = int(cube_str[4:])
-        spec = self.ctx.spec
-        sip_spec = spec.get("sip", {})
-        cube_spec = spec.get("cube", {})
-        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
-        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
-        cube_h = cube_spec.get("geometry", {}).get("cube_mm", {}).get("h", 14.0)
-        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
-            "distance_mm_across_seam", 1.0)
-        col = cube_id % mesh_w
-        row = cube_id // mesh_w
-        return (col * (cube_w + seam), row * (cube_h + seam))
-
-    def _get_link(self, key: tuple) -> simpy.Resource:
-        if key not in self._links:
-            assert self._env is not None
-            self._links[key] = simpy.Resource(self._env, capacity=1)
-        return self._links[key]
-
-    # ── Worker ───────────────────────────────────────────────────────
-
-    def _worker(self, env: simpy.Environment) -> Generator:
-        while True:
-            txn: Any = yield self._inbox.get()
-            env.process(self._route(env, txn))
-
-    def _route(self, env: simpy.Environment, txn: Any) -> Generator:
-        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
-        next_hop = txn.next_hop
-        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
-
-        links: list[tuple[tuple, float]] = []
-        if prev_hop and next_hop and self.ctx:
-            src_pos = self.ctx.positions.get(prev_hop)
-            dst_pos = self.ctx.positions.get(next_hop)
-            if src_pos and dst_pos:
-                links = self._xy_links(src_pos, dst_pos)
-
-        if links:
-            yield from self._traverse(env, links, overhead_ns)
-        else:
-            yield env.timeout(overhead_ns)
-
-        if next_hop:
-            yield self.out_ports[next_hop].put(txn.advance())
-        else:
-            drain = getattr(txn, "drain_ns", 0.0)
-            if drain > 0:
-                yield env.timeout(drain)
-            txn.done.succeed()
-
-    # ── XY routing and pipelined link traversal ──────────────────────
-
-    def _traverse(
-        self,
-        env: simpy.Environment,
-        links: list[tuple[tuple, float]],
-        overhead_ns: float,
-    ) -> Generator:
-        """Pipeline: request next segment before current timeout finishes."""
-        ns_per_mm = self.ctx.ns_per_mm  # type: ignore[union-attr]
-
-        # Acquire first link
-        first_key, _ = links[0]
-        current_resource = self._get_link(first_key)
-        current_req = current_resource.request()
-        yield current_req
-
-        for i, (_, dist_mm) in enumerate(links):
-            # Request next link before current timeout (pipeline)
-            if i + 1 < len(links):
-                next_key, _ = links[i + 1]
-                next_resource = self._get_link(next_key)
-                next_req = next_resource.request()
-
-            yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if i == 0 else 0.0))
-            current_resource.release(current_req)
-
-            if i + 1 < len(links):
-                yield next_req  # usually already fulfilled (pipeline)
-                current_resource = next_resource
-                current_req = next_req
-
-    def _xy_links(
-        self,
-        src: tuple[float, float],
-        dst: tuple[float, float],
-    ) -> list[tuple[tuple, float]]:
-        """XY routing: horizontal segment first, then vertical.
-
-        Returns list of (link_key, dist_mm) pairs, where link_key uniquely
-        identifies a directed segment shared across concurrent transactions.
-        """
-        x0, y0 = src
-        x1, y1 = dst
-        links: list[tuple[tuple, float]] = []
-
-        # Horizontal segment at y≈y0
-        if abs(x0 - x1) > 1e-9:
-            y_band = self._snap(y0, self._y_grid)
-            for xa, xb in self._segments(x0, x1, self._x_grid):
-                d = abs(xb - xa)
-                if d > 1e-9:
-                    lo, hi = (xa, xb) if xa < xb else (xb, xa)
-                    dir_h = "E" if xb > xa else "W"
-                    links.append((("H", round(y_band, 2), round(lo, 2), round(hi, 2), dir_h), d))
-
-        # Vertical segment at x≈x1
-        if abs(y0 - y1) > 1e-9:
-            x_band = self._snap(x1, self._x_grid)
-            for ya, yb in self._segments(y0, y1, self._y_grid):
-                d = abs(yb - ya)
-                if d > 1e-9:
-                    lo, hi = (ya, yb) if ya < yb else (yb, ya)
-                    dir_v = "S" if yb > ya else "N"
-                    links.append((("V", round(x_band, 2), round(lo, 2), round(hi, 2), dir_v), d))
-
-        return links
-
-    @staticmethod
-    def _snap(val: float, grid: list[float]) -> float:
-        if not grid:
-            return val
-        return min(grid, key=lambda g: abs(g - val))
-
-    @staticmethod
-    def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]:
-        """Consecutive (p_i, p_{i+1}) pairs covering range [a, b] using grid waypoints."""
-        if abs(a - b) < 1e-9:
-            return []
-        lo, hi = (a, b) if a < b else (b, a)
-        pts = [lo] + [g for g in grid if lo + 1e-9 < g < hi - 1e-9] + [hi]
-        pairs = [(pts[i], pts[i + 1]) for i in range(len(pts) - 1)]
-        if a > b:
-            pairs = [(p2, p1) for p1, p2 in reversed(pairs)]
-        return pairs
@@ -42,6 +42,30 @@ class PeCpuComponent(ComponentBase):
            self._cube_idx = int(parts[1].replace("cube", ""))
        except (IndexError, ValueError):
            self._cube_idx = 0
+        # num_cubes from spec (for tl.program_id(axis=1) — ADR-0022)
+        spec = ctx.spec if ctx else {}
+        cube_mesh = spec.get("sip", {}).get("cube_mesh", {})
+        if cube_mesh:
+            self._num_cubes = int(cube_mesh.get("w", 1)) * int(cube_mesh.get("h", 1))
+        else:
+            self._num_cubes = (
+                spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 1)
+            )
+        # PE-local scratch for kernel math output handles (ADR-0020 D3
+        # extension; reserved portion of TCM addressed via a synthetic
+        # MemoryStore key, not the real PA encoder).
+        pe_template = spec.get("cube", {}).get("pe_template", {})
+        tcm_attrs = pe_template.get("components", {}).get("pe_tcm", {}).get("attrs", {})
+        scratch_mb = float(tcm_attrs.get("kernel_scratch_mb", 1))
+        self._tl_scratch_size = int(scratch_mb * (1 << 20))
+        # PE-unique base address — high bit pattern to avoid collision with
+        # IPCQ ring buffers (which use bit 60).
+        self._tl_scratch_base = (
+            (1 << 61)
+            | (self._sip_idx << 40)
+            | (self._cube_idx << 32)
+            | (self._pe_idx << 24)
+        )

    def _find_shard(self, shards: tuple) -> Any:
        """Find shard matching this PE's (sip, cube, pe). Fallback to positional index."""
@@ -65,24 +89,45 @@ class PeCpuComponent(ComponentBase):
                yield from self._forward_txn(env, txn)

    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
-        """Compile kernel function and replay command trace."""
-        from kernbench.common.pe_commands import (
-            CompositeCmd,
-            PeCpuOverheadCmd,
-            PeInternalTxn,
-            WaitCmd,
-        )
+        """Execute kernel: greenlet mode (ADR-0020) or legacy Phase 0 + replay."""
        from kernbench.triton_emu.registry import get_kernel
-        from kernbench.triton_emu.tl_context import TLContext, run_kernel

        request = txn.request
-
-        # Phase 1: Compile — apply PE_CPU setup overhead, then run kernel
        yield from self.run(env, 0)

        kernel_fn = get_kernel(request.kernel_ref.name)
+        num_programs = self._derive_num_programs(request)
+        kernel_args = self._unpack_kernel_args(request)

-        # Derive num_programs from the number of PE shards in this cube
+        pe_exec_start = env.now
+        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
+
+        # Choose execution mode: greenlet (ADR-0020) or legacy command-list
+        store = getattr(self.ctx, "memory_store", None) if self.ctx else None
+
+        if store is not None:
+            composite_results = yield from self._execute_greenlet(
+                env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
+            )
+        else:
+            composite_results = yield from self._execute_legacy(
+                env, kernel_fn, kernel_args, num_programs, scheduler_id,
+            )
+
+        # Record PE-internal execution time
+        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
+        total_dma_ns = 0.0
+        total_compute_ns = 0.0
+        for rd in composite_results:
+            total_dma_ns += rd.get("dma_ns", 0.0)
+            total_compute_ns += rd.get("compute_ns", 0.0)
+        txn.result_data["dma_ns"] = total_dma_ns
+        txn.result_data["compute_ns"] = total_compute_ns
+
+        # Send ResponseMsg on reverse path
+        yield from self._send_response(env, txn, request)
+
+    def _derive_num_programs(self, request: Any) -> int:
        num_programs = 1
        for arg in request.args:
            if arg.arg_kind == "tensor":
@@ -92,11 +137,9 @@ class PeCpuComponent(ComponentBase):
                )
                if cube_pe_count > num_programs:
                    num_programs = cube_pe_count
+        return num_programs

-        tl = TLContext(pe_id=self._pe_idx, num_programs=num_programs, dispatch_cycles=0)
-
-        # Unpack KernelLaunchMsg.args into positional args for kernel function
-        # TensorArg → va_base (already local, set by runtime) or PA fallback
+    def _unpack_kernel_args(self, request: Any) -> list:
        kernel_args: list = []
        for arg in request.args:
            if arg.arg_kind == "tensor":
@@ -107,15 +150,48 @@ class PeCpuComponent(ComponentBase):
                    kernel_args.append(shard.pa)
            elif arg.arg_kind == "scalar":
                kernel_args.append(arg.value)
+        return kernel_args

+    def _execute_greenlet(
+        self, env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
+    ) -> Generator:
+        """Greenlet-based execution (ADR-0020 D3): kernel ↔ SimPy interleaved."""
+        from kernbench.triton_emu.kernel_runner import KernelRunner
+
+        runner = KernelRunner(
+            pe_prefix=self._pe_prefix,
+            pe_idx=self._pe_idx,
+            sip_idx=self._sip_idx,
+            cube_idx=self._cube_idx,
+            num_cubes=self._num_cubes,
+            scheduler_id=scheduler_id,
+            out_ports=self.out_ports,
+            store=store,
+            scratch_base=self._tl_scratch_base,
+            scratch_size=self._tl_scratch_size,
+        )
+        yield from runner.run(env, kernel_fn, kernel_args, num_programs)
+        return getattr(runner, "_composite_results", [])
+
+    def _execute_legacy(
+        self, env, kernel_fn, kernel_args, num_programs, scheduler_id,
+    ) -> Generator:
+        """Legacy Phase 0 + replay: generate command list, then dispatch."""
+        from kernbench.common.pe_commands import (
+            CompositeCmd, PeCpuOverheadCmd, PeInternalTxn, WaitCmd,
+        )
+        from kernbench.triton_emu.tl_context import TLContext, run_kernel
+
+        tl = TLContext(
+            pe_id=self._pe_idx, num_programs=num_programs,
+            cube_id=self._cube_idx, num_cubes=self._num_cubes,
+            dispatch_cycles=0,
+        )
        run_kernel(kernel_fn, tl, *kernel_args)
        commands = tl.commands

-        # Phase 2: Replay — dispatch commands to PE_SCHEDULER
-        pe_exec_start = env.now
-        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
-        pending: dict[str, simpy.Event] = {}  # completion_id → done event
-        composite_results: list[dict] = []  # collect result_data from CompositeCmd txns
+        pending: dict[str, simpy.Event] = {}
+        composite_results: list[dict] = []

        for cmd in commands:
            if isinstance(cmd, PeCpuOverheadCmd):
@@ -126,47 +202,30 @@ class PeCpuComponent(ComponentBase):
                    if evt:
                        yield evt
                else:
-                    # Wait all pending completions
                    for evt in pending.values():
                        yield evt
                    pending.clear()
            elif isinstance(cmd, CompositeCmd):
-                # Non-blocking: dispatch to scheduler, track completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
-                    command=cmd, done=done_evt,
-                    pe_prefix=self._pe_prefix,
+                    command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
                )
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                pending[cmd.completion.id] = done_evt
            else:
-                # Blocking: dispatch and wait for completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
-                    command=cmd, done=done_evt,
-                    pe_prefix=self._pe_prefix,
+                    command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
                )
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield done_evt

-        # Wait for any remaining pending completions
        for evt in pending.values():
            yield evt
+        return composite_results

-        # Record PE-internal execution time
-        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
-
-        # Aggregate dma_ns / compute_ns from CompositeCmd results
-        total_dma_ns = 0.0
-        total_compute_ns = 0.0
-        for rd in composite_results:
-            total_dma_ns += rd.get("dma_ns", 0.0)
-            total_compute_ns += rd.get("compute_ns", 0.0)
-        txn.result_data["dma_ns"] = total_dma_ns
-        txn.result_data["compute_ns"] = total_compute_ns
-
-        # Send ResponseMsg on reverse path (PE_CPU → NOC → M_CPU)
+    def _send_response(self, env, txn, request) -> Generator:
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) >= 2:
            from kernbench.runtime_api.kernel import ResponseMsg
@@ -96,7 +96,7 @@ class PeDmaComponent(PeEngineBase):
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )
-            # Send to next hop (path[0] is pe_dma itself, path[1] is xbar)
+            # Send to next hop (path[0] is pe_dma itself, path[1] is router)
            if len(path) > 1:
                yield self.out_ports[path[1]].put(sub_txn.advance())
        # DMA channel released after issue
@@ -105,6 +105,203 @@ class PeDmaComponent(PeEngineBase):
        yield sub_done
        pe_txn.done.succeed()

+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Drain the inbox forever, spawning one SimPy process per message.
+
+        Dispatch order: a bare IpcqDmaToken is outbound IPCQ traffic coming
+        from the local PE_IPCQ, a TileToken is a pipeline stage, and a
+        PeInternalTxn is a legacy command. Anything else is treated as a
+        fabric Transaction: inbound IPCQ when its ``request`` attribute is an
+        IpcqDmaToken, plain forwarding otherwise.
+        """
+        from kernbench.common.ipcq_types import IpcqDmaToken
+        from kernbench.common.pe_commands import PeInternalTxn
+        from kernbench.components.builtin.pe_types import TileToken
+
+        while True:
+            item: Any = yield self._inbox.get()
+            if isinstance(item, IpcqDmaToken):
+                handler = self._handle_ipcq_outbound
+            elif isinstance(item, TileToken):
+                handler = self._pipeline_process
+            elif isinstance(item, PeInternalTxn):
+                handler = self._handle_with_hooks
+            elif isinstance(getattr(item, "request", None), IpcqDmaToken):
+                # Transaction that arrived over the fabric carrying an IPCQ token.
+                handler = self._handle_ipcq_inbound
+            else:
+                handler = self._forward_txn
+            env.process(handler(env, item))
+
+    # ── IPCQ outbound (PE_IPCQ → PE_DMA → fabric) ───────────────────
+
+    def _handle_ipcq_outbound(self, env: simpy.Environment, token: Any) -> Generator:
+        """Forward IpcqDmaToken from local PE_IPCQ through the fabric to peer
+        PE_DMA. ADR-0023 D8 (vc_comm channel).
+
+        Steps: snapshot the payload into ``token.data`` when the token does
+        not already carry one, look up a path to the peer's ``pe_dma`` node,
+        wrap the token in a Transaction and put it on the first-hop out port.
+
+        Best-effort: the method returns without sending anything (and without
+        signalling any event) when there is no context, when path lookup
+        raises, or when the first hop is not one of this component's out
+        ports.
+        """
+        if self.ctx is None:
+            return  # nothing to do
+        peer = token.dst_endpoint
+        peer_pe_dma = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}.pe_dma"
+
+        # Snapshot the source data at send time (D9 in-flight semantics).
+        # Without this, the receiver could read stale or future data if the
+        # sender mutates src_addr between send issue and DMA arrival.
+        store = getattr(self.ctx, "memory_store", None)
+        if store is not None and token.data is None:
+            try:
+                snap = store.read(
+                    token.src_space, token.src_addr,
+                    shape=token.shape, dtype=token.dtype,
+                )
+                # Copy so later mutations to src_addr don't affect the snapshot.
+                token.data = snap.copy() if hasattr(snap, "copy") else snap
+            except Exception:
+                # Snapshot failed: leave data unset. The inbound handler then
+                # falls back to reading src_addr itself when data is None.
+                token.data = None
+
+        # Note: ipcq_copy is recorded at INBOUND time (in _handle_ipcq_inbound),
+        # not here. Outbound time is too early — it precedes fabric propagation,
+        # so in Phase 2 a later round's copy can sort before the receiver's
+        # math for an earlier round, causing slot data corruption.
+        # The secondary sort in DataExecutor (memory ops before math at the
+        # same t_start) ensures the inbound copy runs before the local math
+        # that reads the slot.
+
+        try:
+            path = self.ctx.router.find_path(self._pe_prefix, peer_pe_dma)
+        except Exception:
+            # No route to the peer: the token is dropped silently.
+            return
+        drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)
+
+        sub_done = env.event()
+        sub_txn = Transaction(
+            request=token, path=path, step=0,
+            nbytes=token.nbytes, done=sub_done, drain_ns=drain_ns,
+        )
+        # path[0] is the starting node, path[1] the first hop. A path with a
+        # single entry results in nothing being sent.
+        if len(path) > 1:
+            next_hop = path[1]
+            if next_hop in self.out_ports:
+                yield self.out_ports[next_hop].put(sub_txn.advance())
+            else:
+                return
+        # Note: don't wait on sub_done here — fire-and-forget for vc_comm.
+        # IPCQ slot bookkeeping (peer_head) was already updated by PE_IPCQ;
+        # backpressure is via credit return, not via this DMA's completion.
+
+    # ── IPCQ inbound (fabric → PE_DMA → MemoryStore + PE_IPCQ) ──────
+
+    def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
+        """At destination PE_DMA: atomically write data and forward metadata.
+
+        I6 (MUST): no SimPy yield between MemoryStore.write and the
+        IpcqMetaArrival put into PE_IPCQ.
+
+        ``txn`` is the fabric Transaction whose ``request`` is the
+        IpcqDmaToken. After the write, the op-log record and the metadata
+        forward, ``txn.done`` is triggered if it has not been already.
+        Failures of the memory write and of the op-log record are swallowed
+        (best-effort); the metadata is forwarded regardless.
+        """
+        from kernbench.common.ipcq_types import IpcqMetaArrival
+
+        token = txn.request
+
+        # ── ATOMIC: do not introduce yield between these two operations ──
+        # 1. Move data via MemoryStore (single-hop DMA write).
+        # Prefer the in-flight snapshot stashed by the sender PE_DMA;
+        # fall back to a fresh read of src_addr if no snapshot is present
+        # (e.g. control-only token).
+        store = getattr(self.ctx, "memory_store", None) if self.ctx else None
+        if store is not None:
+            try:
+                data = token.data
+                if data is None:
+                    data = store.read(
+                        token.src_space, token.src_addr,
+                        shape=token.shape, dtype=token.dtype,
+                    )
+                store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)
+            except Exception:
+                pass
+
+        # Record the IPCQ copy at INBOUND time with embedded data snapshot.
+        # The snapshot (token.data) was captured by the sender's outbound
+        # PE_DMA at send time. Phase 2 writes the snapshot directly to
+        # dst — it does NOT re-read from MemoryStore[src_addr], which may
+        # have been mutated by a different PE's Phase 2 ops by that point.
+        # DataExecutor's secondary sort (memory before math at same
+        # t_start) ensures the write completes before the local math
+        # that reads the slot.
+        # NOTE(review): when the fallback read above supplied the data,
+        # token.data is still None, so the record carries snapshot=None —
+        # confirm the replay side handles that case.
+        if self._op_logger is not None:
+            try:
+                self._op_logger.record_copy(
+                    t_start=float(env.now), t_end=float(env.now),
+                    component_id=self.node.id,
+                    src_space=token.src_space, src_addr=token.src_addr,
+                    dst_space=token.dst_endpoint.buffer_kind,
+                    dst_addr=token.dst_addr,
+                    shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
+                    snapshot=token.data,
+                )
+            except Exception:
+                pass
+
+        # 2. Forward IpcqMetaArrival to local PE_IPCQ
+        # The put() call is evaluated before this generator suspends on the
+        # resulting event, so this is the first point at which control can
+        # pass to another process.
+        ipcq_id = f"{self._pe_prefix}.pe_ipcq"
+        if ipcq_id in self.out_ports:
+            yield self.out_ports[ipcq_id].put(IpcqMetaArrival(token=token))
+        # ─────────────────────────────────────────────────────────────────
+
+        # Guarded so that an already-triggered event is not triggered twice.
+        if not txn.done.triggered:
+            txn.done.succeed()
+
+    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
+        """Run this component's DMA stage(s) for a tile, then route the token.
+
+        Each stage is bracketed by the process start/end hooks. Consecutive
+        stages that name this same component are executed back to back
+        without leaving the component. When the token has no further stage
+        the tile is marked complete; otherwise the token is put on the out
+        port of the next stage's component.
+        """
+        while True:
+            self._on_process_start(env, token)
+            yield from self._do_pipeline_dma(env, token)
+            self._on_process_end(env, token)
+
+            upcoming = token.advance()
+            if upcoming is None:
+                # Last stage of the pipeline for this tile.
+                token.pipeline_ctx.complete_tile()
+                return
+            if upcoming.component != self.node.id:
+                break
+
+        yield self.out_ports[upcoming.component].put(token)
+
+    def _do_pipeline_dma(self, env, token):
+        """Core DMA logic for pipeline mode.
+
+        Reads the transfer from ``token.params``: ``nbytes`` plus ``dst_addr``
+        for a DMA_WRITE stage or ``src_addr`` for any other stage type. When
+        ``nbytes`` is positive and a context is present, the address is
+        decoded and resolved to a destination node, a path is looked up, and
+        a Transaction is issued on the first hop while holding the matching
+        DMA resource. The generator then waits for the transaction's done
+        event. With ``nbytes == 0`` or no context it completes immediately.
+        """
+        from kernbench.policy.address.phyaddr import PhysAddr
+        from kernbench.runtime_api.kernel import PeDmaMsg
+
+        params = token.params
+        from kernbench.components.builtin.pe_types import StageType
+        is_write = token.current_stage.stage_type == StageType.DMA_WRITE
+        addr = params.get("dst_addr" if is_write else "src_addr", 0)
+        nbytes = params.get("nbytes", 0)
+
+        if nbytes > 0 and self.ctx:
+            # Separate resources are used for the write and read directions.
+            dma_res = self._dma_write if is_write else self._dma_read
+            assert dma_res is not None
+
+            pa = PhysAddr.decode(addr)
+            dst_node = self.ctx.resolver.resolve(pa)
+            path = self.ctx.router.find_path(self._pe_prefix, dst_node)
+            drain_ns = self.ctx.compute_drain_ns(path, nbytes)
+
+            # The resource is held only while the transaction is issued; it
+            # is released before waiting for completion below.
+            with dma_res.request() as req:
+                yield req
+                sub_done = env.event()
+                # NOTE(review): src_sip/src_cube/src_pe are hard-coded to 0
+                # rather than derived from this PE — confirm nothing
+                # downstream relies on them. dst_pa carries `addr` for reads
+                # as well, where `addr` is the source address.
+                sub_request = PeDmaMsg(
+                    correlation_id="pipeline",
+                    request_id=f"tile_{token.tile_id}",
+                    src_sip=0, src_cube=0, src_pe=0,
+                    dst_pa=addr, nbytes=nbytes,
+                    is_write=is_write,
+                )
+                sub_txn = Transaction(
+                    request=sub_request, path=path, step=0,
+                    nbytes=nbytes, done=sub_done, drain_ns=drain_ns,
+                )
+                # NOTE(review): if the path has a single entry nothing is
+                # sent, yet sub_done is still awaited below and is never
+                # triggered here — verify such a path cannot occur.
+                if len(path) > 1:
+                    yield self.out_ports[path[1]].put(sub_txn.advance())
+
+            yield sub_done
+
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
        # Response transactions bypass DMA channel (no outbound resource needed)
@@ -0,0 +1,77 @@
+"""PE_FETCH_STORE: TCM ↔ Register File transfer unit (ADR-0021 D5).
+
+Handles both fetch (TCM → register) and store (register → TCM).
+BW serialization is delegated to PE_TCM via port communication.
+"""
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeFetchStoreComponent(PeEngineBase):
+    """Transfer unit between TCM and the register file (ADR-0021 D5).
+
+    Pipeline TileTokens are served by sending a TcmRequest to the PE's
+    ``pe_tcm`` port and waiting for its completion event; the token is then
+    routed to its next stage. Legacy PeInternalTxn commands only pay the
+    node's fixed ``overhead_ns``.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Out-port key of the TCM that belongs to the same PE.
+        self._tcm_id = f"{self._pe_prefix}.pe_tcm"
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Wait for the node's ``overhead_ns`` attribute (default 0.0).
+
+        ``nbytes`` is accepted but not used.
+        """
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Inbox loop: one SimPy process per message.
+
+        TileToken goes to the pipeline path, PeInternalTxn to the legacy
+        command path, and anything else is forwarded.
+        """
+        from kernbench.common.pe_commands import PeInternalTxn
+        from kernbench.components.builtin.pe_types import TileToken
+
+        while True:
+            item: Any = yield self._inbox.get()
+            if isinstance(item, TileToken):
+                handler = self._pipeline_process
+            elif isinstance(item, PeInternalTxn):
+                handler = self.handle_command
+            else:
+                handler = self._forward_txn
+            env.process(handler(env, item))
+
+    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
+        """Serve one pipeline stage (fetch or store) and route the token on.
+
+        The stage's ``direction`` (default ``"read"``) and ``nbytes``
+        (default 0) come from ``token.params``. A TCM request is issued only
+        for a positive size and only when the TCM port is wired.
+        """
+        from kernbench.components.builtin.pe_tcm import TcmRequest
+
+        self._on_process_start(env, token)
+
+        direction = token.params.get("direction", "read")
+        nbytes = token.params.get("nbytes", 0)
+
+        if nbytes > 0 and self._tcm_id in self.out_ports:
+            completed = env.event()
+            request = TcmRequest(direction=direction, nbytes=nbytes, done=completed)
+            yield self.out_ports[self._tcm_id].put(request)
+            # The request carries `completed` as its done event.
+            yield completed
+
+        self._on_process_end(env, token)
+
+        # Self-routing: hand the token to the next stage, or finish the tile.
+        following = token.advance()
+        if following is None:
+            token.pipeline_ctx.complete_tile()
+            return
+        yield self.out_ports[following.component].put(token)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
+        """Legacy path: pay the fixed overhead, then trigger ``pe_txn.done``."""
+        yield from self.run(env, 0)
+        pe_txn.done.succeed()
@@ -1,6 +1,18 @@
+"""PE_GEMM: matrix multiplication engine (ADR-0021 D6).
+
+Handles both legacy PeInternalTxn (GemmCmd) and pipeline TileToken.
+In pipeline mode, receives token after fetch stage, computes MAC, chains to next.
+
+MAC latency model (from pe_accel):
+  cycles = ceil(Tm/mac_m) * ceil(Tk/mac_k) * ceil(Tn/mac_n)
+  latency_ns = cycles / clock_freq_ghz
+
+Falls back to TFLOPS model when mac dimensions not configured.
+"""
 from __future__ import annotations

 from collections.abc import Generator
+from math import ceil
 from typing import TYPE_CHECKING, Any

 import simpy
@@ -12,33 +24,29 @@ if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node

-
-# dtype → bit width (for TFLOPS scaling)
 _DTYPE_BITS: dict[str, int] = {
    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
    "f32": 32, "fp32": 32, "float32": 32,
-    "i8": 8, "int8": 8,
-    "i16": 16, "int16": 16,
-    "i32": 32, "int32": 32,
+    "i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
 }


 class PeGemmComponent(PeEngineBase):
-    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
+    """PE_GEMM: MAC array (ADR-0021 D6).

-    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
-    exclusive with PE_MATH within the same PE.
-
-    Compute latency model:
-      FLOPs = 2 * M * K * N
-      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
-      compute_ns = FLOPs / (effective_tflops * 1e3)
+    In pipeline mode: pure compute — register data already fetched.
+    In legacy mode: handles PeInternalTxn(GemmCmd) with shared accel_slot.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))
+        # Cycle-accurate MAC dimensions (from pe_accel)
+        self._mac_m: int = int(node.attrs.get("mac_m", 0))
+        self._mac_k: int = int(node.attrs.get("mac_k", 0))
+        self._mac_n: int = int(node.attrs.get("mac_n", 0))
+        self._clock_freq: float = float(node.attrs.get("clock_freq_ghz", 1.0))

    def init_resources(self, env: simpy.Environment) -> None:
        resource_name = self.node.attrs.get("shared_resource")
@@ -47,8 +55,15 @@ class PeGemmComponent(PeEngineBase):
                env, f"{self._pe_prefix}.{resource_name}"
            )

-    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
-        """Compute GEMM latency in nanoseconds."""
+    def _compute_ns_mac(self, m: int, k: int, n: int) -> float:
+        """Return the cycle-accurate MAC latency in ns (pe_accel model).
+
+        cycles = ceil(m / mac_m) * ceil(k / mac_k) * ceil(n / mac_n), and the
+        latency is cycles divided by the clock frequency in GHz.
+
+        Returns 0.0 when the model cannot be applied: any MAC dimension is
+        not positive, or the configured clock frequency is not positive.
+        ``_compute_ns`` treats a non-positive result as "fall back to the
+        TFLOPS model", so a zero clock no longer raises ZeroDivisionError and
+        a negative clock no longer produces a negative delay.
+        """
+        if min(self._mac_m, self._mac_k, self._mac_n) <= 0 or self._clock_freq <= 0:
+            return 0.0
+        cycles = ceil(m / self._mac_m) * ceil(k / self._mac_k) * ceil(n / self._mac_n)
+        return cycles / self._clock_freq
+
+    def _compute_ns_tflops(self, m: int, k: int, n: int, dtype: str = "f16") -> float:
+        """TFLOPS-based latency (legacy model)."""
        if self._peak_tflops_f16 <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        dtype_bits = _DTYPE_BITS.get(dtype, 16)
@@ -56,11 +71,58 @@ class PeGemmComponent(PeEngineBase):
        flops = 2.0 * m * k * n
        return flops / (effective_tflops * 1e3)

+    def _compute_ns(self, m: int, k: int, n: int, dtype: str = "f16") -> float:
+        """Return GEMM latency in ns from the best available model.
+
+        The MAC cycle model wins whenever it yields a positive time;
+        otherwise the TFLOPS model is used (``dtype`` only affects the latter).
+        """
+        cycle_model_ns = self._compute_ns_mac(m, k, n)
+        if cycle_model_ns <= 0:
+            return self._compute_ns_tflops(m, k, n, dtype)
+        return cycle_model_ns
+
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Inbox loop: spawn one SimPy process per received message.
+
+        TileToken takes the pipeline path, PeInternalTxn the legacy command
+        path (with hooks); every other message is forwarded.
+        """
+        from kernbench.common.pe_commands import PeInternalTxn
+        from kernbench.components.builtin.pe_types import TileToken
+
+        while True:
+            item: Any = yield self._inbox.get()
+            if isinstance(item, TileToken):
+                handler = self._pipeline_process
+            elif isinstance(item, PeInternalTxn):
+                handler = self._handle_with_hooks
+            else:
+                handler = self._forward_txn
+            env.process(handler(env, item))
+
+    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
+        """Pipeline mode: model the MAC compute time, then route the token.
+
+        The tile dimensions ``m``, ``k`` and ``n`` are read from
+        ``token.params`` (each defaults to 0). When a shared accel resource
+        is configured it is held for the duration of the compute delay, and
+        the latency is evaluated only after the resource has been granted.
+        """
+        self._on_process_start(env, token)
+
+        dims = tuple(token.params.get(axis, 0) for axis in ("m", "k", "n"))
+
+        if not self._accel:
+            yield env.timeout(self._compute_ns(*dims))
+        else:
+            with self._accel.request() as grant:
+                yield grant
+                yield env.timeout(self._compute_ns(*dims))
+
+        self._on_process_end(env, token)
+
+        # Self-routing: next stage's component, or finish the tile.
+        following = token.advance()
+        if following is None:
+            token.pipeline_ctx.complete_tile()
+            return
+        yield self.out_ports[following.component].put(token)
+
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Legacy PeInternalTxn handling."""
        from kernbench.common.pe_commands import GemmCmd

        cmd = pe_txn.command
@@ -81,7 +143,6 @@ class PeGemmComponent(PeEngineBase):
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
-        """Transaction forwarding with accel_slot acquisition."""
        if self._accel:
            with self._accel.request() as req:
                yield req
@@ -0,0 +1,479 @@
+"""PE_IPCQ component (ADR-0023): per-PE IPCQ control plane.
+
+Responsibilities:
+    - Hold per-direction queue pair state (my_head, my_tail,
+      peer_head_cache, peer_tail_cache, ring buffer addresses)
+    - Process IpcqInitMsg from backend to install neighbor table
+    - Handle IpcqRequest(IpcqSendCmd) from PE_CPU:
+        compute peer slot address, check backpressure, forward
+        IpcqDmaToken to PE_DMA (vc_comm)
+    - Handle IpcqRequest(IpcqRecvCmd) from PE_CPU:
+        wait for data arrival, return slot address (or copy to dst),
+        send fast-path credit return
+    - Handle IpcqMetaArrival from PE_DMA: update peer_head_cache, wake recv
+    - Handle IpcqCreditMetadata via own credit_inbox: update peer_tail_cache,
+      wake send
+
+PE_IPCQ does NOT move data — it forwards IpcqDmaToken to PE_DMA which
+performs the actual fabric DMA.
+
+Credit return uses a fast path: PE_IPCQ creates a SimPy process with a
+bottleneck-BW based latency, then puts IpcqCreditMetadata directly into
+the peer's pre-wired credit_store.
+"""
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.common.ipcq_types import (
+    IpcqCreditMetadata,
+    IpcqDmaToken,
+    IpcqInvalidDirection,
+    IpcqMetaArrival,
+    IpcqRecvCmd,
+    IpcqRequest,
+    IpcqSendCmd,
+)
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.runtime_api.kernel import IpcqInitMsg
+    from kernbench.topology.types import Node
+
+
+_DIR_ORDER: tuple[str, ...] = ("N", "S", "E", "W", "parent", "child_left", "child_right")
+
+
+class PeIpcqComponent(ComponentBase):
+    """PE_IPCQ: ring buffer pointer + neighbor management for CCL.
+
+    Owned by one PE; talks to PE_DMA via out_ports[<pe_dma_id>] and
+    receives credit return metadata via the public ``credit_inbox``
+    SimPy Store (wired by backend at IpcqInitMsg installation time).
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        """Set up empty IPCQ state; neighbors are installed later by message.
+
+        The PE coordinates are parsed from the node id (for example
+        ``"sip0.cube0.pe0.pe_ipcq"``); any coordinate that is missing or not
+        numeric falls back to 0.
+        """
+        super().__init__(node, ctx)
+        # Strict shape/dtype validation (D14 F2). Off by default.
+        self._strict: bool = bool(node.attrs.get("strict_validation", False))
+        # direction → received tokens, consumed front-first by recv
+        self._arrived_tokens: dict[str, list] = {}
+
+        # Everything before the last dot of the node id is the PE prefix.
+        self._pe_prefix: str = node.id.rsplit(".", 1)[0]
+        segments = self._pe_prefix.split(".")
+
+        def _coordinate(position: int, tag: str) -> int:
+            # Strip the tag ("sip"/"cube"/"pe") and parse what remains.
+            try:
+                return int(segments[position].replace(tag, ""))
+            except (IndexError, ValueError):
+                return 0
+
+        self._self_sip = _coordinate(0, "sip")
+        self._self_cube = _coordinate(1, "cube")
+        self._self_pe = _coordinate(2, "pe")
+
+        self._dma_node_id = f"{self._pe_prefix}.pe_dma"
+        # direction → state dict (see _install_neighbors for shape)
+        self._queue_pairs: dict[str, dict[str, Any]] = {}
+        self._installed = False
+        self._buffer_kind: str = "tcm"
+        self._backpressure_mode: str = "sleep"
+        self._credit_size_bytes: int = 16
+        # Events of receivers blocked on one direction / on any direction.
+        self._recv_waiters: dict[str, list[simpy.Event]] = {}
+        self._any_recv_waiters: list[simpy.Event] = []
+        # Events of senders blocked by a full ring, per direction.
+        self._send_waiters: dict[str, list[simpy.Event]] = {}
+        # Round-robin scan order over installed directions and its cursor.
+        self._rr_dirs: list[str] = []
+        self._rr_cursor: int = 0
+        # Created in start(), once an environment is available.
+        self._credit_inbox: simpy.Store | None = None
+
+    # ── Public ──
+
+    @property
+    def credit_inbox(self) -> simpy.Store:
+        """Store into which remote senders' credit-return metadata is put.
+
+        The backend wires it as ``peer_credit_store`` on every remote sender
+        targeting this PE (D9 fast path). Only available after ``start()``;
+        accessing it earlier trips the assertion.
+        """
+        inbox = self._credit_inbox
+        assert inbox is not None, "PE_IPCQ not started yet"
+        return inbox
+
+    @property
+    def queue_pairs(self) -> dict[str, dict[str, Any]]:
+        """Test/debug accessor for the per-direction queue-pair state.
+
+        Returns the internal dict itself (keyed by direction), not a copy, so
+        mutations made by the caller affect this component.
+        """
+        return self._queue_pairs
+
+    # ── Lifecycle ──
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        # Zero-delay timeout: this component adds no latency here, and
+        # `nbytes` is not used.
+        yield env.timeout(0)
+
+    def start(self, env: simpy.Environment) -> None:
+        """Create the stores this component needs, then start its processes.
+
+        In addition to whatever the base class ``start`` does, a dedicated
+        credit worker process is launched.
+        """
+        # Create credit_inbox even if there are no in_ports yet
+        # (guarded so an existing store is never replaced).
+        if self._credit_inbox is None:
+            self._credit_inbox = simpy.Store(env)
+        # If no in_ports were wired (e.g. unit test), still spin up workers:
+        # give the component an inbox of its own before the base start runs.
+        if not self.in_ports:
+            self._inbox = simpy.Store(env)
+        super().start(env)
+        env.process(self._credit_worker(env))
+
+    # ── Worker (override of ComponentBase._worker) ──
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Inbox loop for PE_IPCQ.
+
+        Neighbor installation and metadata arrival are handled inline;
+        send/recv requests and unrecognized messages each get their own
+        SimPy process.
+        """
+        from kernbench.runtime_api.kernel import IpcqInitMsg
+
+        while True:
+            item: Any = yield self._inbox.get()
+
+            # An IpcqInitMsg may come wrapped in a transaction (as its
+            # .request) or bare; the wrapped form is checked first.
+            wrapped = getattr(item, "request", None)
+            if isinstance(wrapped, IpcqInitMsg):
+                self._install_neighbors(wrapped)
+                ack = getattr(item, "done", None)
+                if ack is not None and not ack.triggered:
+                    ack.succeed()
+            elif isinstance(item, IpcqInitMsg):
+                self._install_neighbors(item)
+            elif isinstance(item, IpcqMetaArrival):
+                self._handle_meta_arrival(item)
+            elif isinstance(item, IpcqRequest):
+                env.process(self._handle_request(env, item))
+            else:
+                # Unrecognized message: hand it to the forwarding path.
+                env.process(self._forward_txn(env, item))
+
+    # ── Init ──
+
+    def _install_neighbors(self, msg: IpcqInitMsg) -> None:
+        """Install (or overwrite) one queue pair per entry of ``msg``.
+
+        Global settings (buffer kind, backpressure mode, credit size) are
+        taken from the message. Every installed direction starts with all
+        four ring pointers at zero and gets recv/send waiter lists if it has
+        none yet. The round-robin order is rebuilt afterwards.
+        """
+        self._installed = True
+        self._buffer_kind = msg.buffer_kind
+        self._backpressure_mode = msg.backpressure_mode
+        self._credit_size_bytes = msg.credit_size_bytes
+
+        for entry in msg.entries:
+            direction = entry.direction
+            self._queue_pairs[direction] = dict(
+                peer=entry.peer,
+                my_rx_base_pa=entry.my_rx_base_pa,
+                my_rx_base_va=entry.my_rx_base_va,
+                n_slots=entry.n_slots,
+                slot_size=entry.slot_size,
+                peer_credit_store=entry.peer_credit_store,
+                my_head=0,
+                my_tail=0,
+                peer_head_cache=0,
+                peer_tail_cache=0,
+            )
+            for waiters in (self._recv_waiters, self._send_waiters):
+                waiters.setdefault(direction, [])
+
+        # Canonical scan order: installed directions in _DIR_ORDER sequence.
+        installed = self._queue_pairs
+        self._rr_dirs = [name for name in _DIR_ORDER if name in installed]
+        self._rr_cursor = 0
+
+    # ── Send ──
+
+    def _handle_request(self, env: simpy.Environment, req: IpcqRequest) -> Generator:
+        """Route an IpcqRequest to the send or recv handler by command type.
+
+        A request whose command is neither IpcqSendCmd nor IpcqRecvCmd is
+        ignored: nothing runs and ``req.done`` is left untouched.
+        """
+        command = req.command
+        if isinstance(command, IpcqSendCmd):
+            handler = self._handle_send
+        elif isinstance(command, IpcqRecvCmd):
+            handler = self._handle_recv
+        else:
+            return
+        yield from handler(env, req, command)
+
+    def _handle_send(
+        self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqSendCmd,
+    ) -> Generator:
+        """Send one message toward the peer in ``cmd.direction``.
+
+        Waits while the peer ring is full, computes the destination slot
+        address from the sender sequence number, builds an IpcqDmaToken and
+        hands it to the local PE_DMA. ``req.done`` is triggered once the
+        token has been handed over.
+
+        Raises:
+            IpcqInvalidDirection: ``cmd.direction`` has no installed queue pair.
+        """
+        if cmd.direction not in self._queue_pairs:
+            raise IpcqInvalidDirection(
+                f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
+            )
+        qp = self._queue_pairs[cmd.direction]
+        peer = qp["peer"]
+
+        # Backpressure: wait while ring full
+        while (qp["my_head"] - qp["peer_tail_cache"]) >= peer.n_slots:
+            wait_event = env.event()
+            self._send_waiters[cmd.direction].append(wait_event)
+            yield wait_event
+
+        # This send's sequence number; it selects the peer slot.
+        seq = qp["my_head"]
+        slot_idx = seq % peer.n_slots
+        dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
+
+        token = IpcqDmaToken(
+            src_addr=cmd.src_addr,
+            src_space=cmd.src_space,
+            dst_addr=dst_pa,
+            dst_endpoint=peer,
+            nbytes=cmd.nbytes,
+            handle_id=cmd.handle_id,
+            shape=cmd.shape,
+            dtype=cmd.dtype,
+            # Carry the handle's recv-time data snapshot so the outbound
+            # PE_DMA doesn't need to re-read from MemoryStore (which may
+            # have been overwritten by a later inbound in the meantime).
+            data=getattr(cmd, "data", None),
+            sender_seq=seq,
+            src_sip=self._self_sip,
+            src_cube=self._self_cube,
+            src_pe=self._self_pe,
+            src_direction=cmd.direction,
+        )
+
+        # Forward to PE_DMA (vc_comm).
+        # Claim the slot BEFORE suspending: requests run as separate SimPy
+        # processes, so another send on this direction may run while this
+        # one is parked on the put event. Advancing my_head only after the
+        # yield would let both pick the same slot and sender_seq, and both
+        # pass the ring-full check above.
+        put_event = self.out_ports[self._dma_node_id].put(token)
+        qp["my_head"] = seq + 1
+        yield put_event
+
+        # Diagnostics trace (D14)
+        from kernbench.ccl import diagnostics
+        if diagnostics.trace_enabled():
+            diagnostics.log_send(
+                t_ns=float(env.now), sender=self._pe_prefix,
+                direction=cmd.direction, nbytes=cmd.nbytes,
+                sender_seq=seq,
+            )
+        if not req.done.triggered:
+            req.done.succeed()
+
+    # ── Recv ──
+
+    def _handle_recv(
+        self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqRecvCmd,
+    ) -> Generator:
+        """Receive one message, from ``cmd.direction`` or from any direction.
+
+        Blocks until data is available, fills ``req.result_data`` with the
+        location and metadata of the received payload, advances the local
+        tail pointer, starts the delayed credit return and finally triggers
+        ``req.done``. No SimPy yield occurs between the availability check
+        and the tail increment.
+
+        Raises:
+            IpcqInvalidDirection: an explicit direction is not installed (or,
+                for any-direction recv, no neighbors are installed at all).
+            ValueError: strict validation is on and the arrived token's
+                dtype, shape or nbytes disagrees with the recv command.
+        """
+        if cmd.direction is None:
+            # Any-direction recv: round-robin over installed directions.
+            direction = yield from self._wait_any_direction(env)
+        else:
+            if cmd.direction not in self._queue_pairs:
+                raise IpcqInvalidDirection(
+                    f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
+                )
+            direction = cmd.direction
+            qp = self._queue_pairs[direction]
+            # Nothing unread while the peer's head has not passed our tail.
+            while qp["peer_head_cache"] <= qp["my_tail"]:
+                wait_event = env.event()
+                self._recv_waiters[direction].append(wait_event)
+                yield wait_event
+
+        qp = self._queue_pairs[direction]
+        slot_idx = qp["my_tail"] % qp["n_slots"]
+        slot_addr = qp["my_rx_base_pa"] + slot_idx * qp["slot_size"]
+
+        # Strict validation (D14 F2): peek the next-arrived token's metadata
+        # against the recv command's expected shape/dtype/nbytes.
+        # The front token is consumed whether or not strict mode is on.
+        arrived = self._arrived_tokens.get(direction, [])
+        if arrived:
+            front = arrived.pop(0)
+            if self._strict:
+                expected_nbytes = self._nbytes_for(cmd.shape, cmd.dtype)
+                if front.dtype != cmd.dtype:
+                    raise ValueError(
+                        f"PE_IPCQ {self._pe_prefix} recv strict: dtype mismatch — "
+                        f"sender={front.dtype} recv={cmd.dtype}"
+                    )
+                if front.shape != cmd.shape:
+                    raise ValueError(
+                        f"PE_IPCQ {self._pe_prefix} recv strict: shape mismatch — "
+                        f"sender={front.shape} recv={cmd.shape}"
+                    )
+                if front.nbytes != expected_nbytes:
+                    raise ValueError(
+                        f"PE_IPCQ {self._pe_prefix} recv strict: nbytes mismatch — "
+                        f"sender={front.nbytes} recv={expected_nbytes}"
+                    )
+
+        # Default result: the payload is handed out in place, in the rx slot.
+        req.result_data["src_space"] = self._buffer_kind
+        req.result_data["src_addr"] = slot_addr
+        req.result_data["direction"] = direction
+        req.result_data["dtype"] = cmd.dtype
+        req.result_data["shape"] = cmd.shape
+        req.result_data["nbytes"] = self._nbytes_for(cmd.shape, cmd.dtype)
+
+        # copy_to_dst mode: rebind the result handle to (dst_space, dst_addr).
+        # When op_log is disabled, we also do the actual data move now;
+        # when op_log is enabled, Phase 2 replays the slot→dst copy from
+        # the op_log entry below so we don't pollute the slot in Phase 1.
+        if cmd.recv_mode == "copy_to_dst" and self.ctx is not None:
+            req.result_data["src_space"] = cmd.dst_space
+            req.result_data["src_addr"] = cmd.dst_addr
+            store = getattr(self.ctx, "memory_store", None)
+            if store is not None and self._op_logger is None:
+                try:
+                    data = store.read(self._buffer_kind, slot_addr, shape=cmd.shape, dtype=cmd.dtype)
+                    store.write(cmd.dst_space, cmd.dst_addr, data)
+                except Exception:
+                    # Best-effort: a failed copy is ignored.
+                    pass
+            if self._op_logger is not None:
+                # Record slot → dst copy for Phase 2 replay (ADR-0023 D9.5).
+                try:
+                    self._op_logger.record_copy(
+                        t_start=float(env.now), t_end=float(env.now),
+                        component_id=self.node.id,
+                        src_space=self._buffer_kind, src_addr=slot_addr,
+                        dst_space=cmd.dst_space, dst_addr=cmd.dst_addr,
+                        shape=cmd.shape, dtype=cmd.dtype,
+                        nbytes=self._nbytes_for(cmd.shape, cmd.dtype),
+                    )
+                except Exception:
+                    # Best-effort: a failed log record is ignored.
+                    pass
+
+        # Consume the slot.
+        qp["my_tail"] += 1
+
+        # Diagnostics trace (D14)
+        from kernbench.ccl import diagnostics
+        if diagnostics.trace_enabled():
+            diagnostics.log_recv(
+                t_ns=float(env.now), receiver=self._pe_prefix,
+                direction=direction,
+                nbytes=req.result_data.get("nbytes", 0),
+            )
+
+        # Fast path credit return — bottleneck BW based latency.
+        # Runs as its own process so the recv completes without waiting
+        # for it; the already-incremented tail is what gets reported.
+        env.process(
+            self._delayed_credit_send(env, direction, qp["peer_credit_store"], qp["my_tail"])
+        )
+
+        if not req.done.triggered:
+            req.done.succeed()
+
+    def _wait_any_direction(self, env: simpy.Environment) -> Generator:
+        """Block until some installed direction has unread data, then pick it.
+
+        Directions in ``self._rr_dirs`` are scanned round-robin, starting at
+        ``self._rr_cursor``. A direction counts as ready when its
+        ``peer_head_cache`` is ahead of its ``my_tail``. When none is ready,
+        the generator parks on a fresh event appended to
+        ``self._any_recv_waiters`` and rescans once that event fires.
+
+        Args:
+            env: SimPy environment used to create the wait event.
+
+        Returns:
+            The chosen direction (str), delivered as the generator's return
+            value (i.e. to a ``yield from`` caller).
+
+        Raises:
+            IpcqInvalidDirection: if ``self._rr_dirs`` is empty.
+        """
+        if not self._rr_dirs:
+            raise IpcqInvalidDirection(
+                f"PE {self._pe_prefix}: no neighbors installed"
+            )
+        while True:
+            n = len(self._rr_dirs)
+            for i in range(n):
+                idx = (self._rr_cursor + i) % n
+                d = self._rr_dirs[idx]
+                qp = self._queue_pairs[d]
+                if qp["peer_head_cache"] > qp["my_tail"]:
+                    # The next scan starts just after the direction served
+                    # now, which is what makes the scan round-robin.
+                    self._rr_cursor = (idx + 1) % n
+                    return d
+            # Nothing available — wait until any arrival
+            wait_event = env.event()
+            self._any_recv_waiters.append(wait_event)
+            yield wait_event
+
+    # ── Metadata arrival from PE_DMA (D9) ──
+
+    def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
+        """Match arrival to the correct direction by dst_addr range (ADR-0025 D2).
+
+        Each direction has a unique rx buffer address range
+        ([my_rx_base_pa, my_rx_base_pa + n_slots * slot_size)). The token's
+        dst_addr (set by the sender's IPCQ when computing the peer slot
+        address) falls within exactly one such range. Address-based matching
+        is unambiguous even when multiple directions share the same peer
+        (2-rank bidirectional ring).
+
+        On a match this advances the direction's ``peer_head_cache`` (never
+        backwards), appends the token to ``self._arrived_tokens[d]`` and
+        wakes both the per-direction and the any-direction receive waiters.
+        Only the first matching direction is updated. A token whose
+        ``dst_addr`` lies in no range is dropped without any effect.
+
+        Args:
+            msg: Arrival message; only ``msg.token`` is read here
+                (its ``dst_addr`` and ``sender_seq``).
+        """
+        token = msg.token
+        dst_addr = token.dst_addr
+        for d, qp in self._queue_pairs.items():
+            base = qp["my_rx_base_pa"]
+            size = qp["n_slots"] * qp["slot_size"]
+            # Half-open range check: base is inside, base + size is not.
+            if base <= dst_addr < base + size:
+                # max() keeps the cached head from moving backwards.
+                qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
+                # Track arrived token for strict-mode peek
+                self._arrived_tokens.setdefault(d, []).append(token)
+                # Wake any blocked recv on this direction
+                # (the list is replaced first, then the detached events fire)
+                waiters = self._recv_waiters.get(d, [])
+                self._recv_waiters[d] = []
+                for ev in waiters:
+                    if not ev.triggered:
+                        ev.succeed()
+                # Wake any-direction waiters
+                any_waiters = self._any_recv_waiters
+                self._any_recv_waiters = []
+                for ev in any_waiters:
+                    if not ev.triggered:
+                        ev.succeed()
+                return
+        # Unknown dst_addr — silently drop (could log)
+
+    # ── Credit return (fast path) ──
+
+    def _credit_worker(self, env: simpy.Environment) -> Generator:
+        """Process IpcqCreditMetadata from credit_inbox.
+
+        Matches credit to the correct direction by `credit.dst_rx_base_pa ==
+        qp.peer.rx_base_pa` (ADR-0025 D3). This is unambiguous even when
+        multiple directions share the same peer (2-rank bidirectional ring).
+
+        Runs forever: each credit advances the matched direction's
+        ``peer_tail_cache`` (never backwards) and wakes every sender blocked
+        on that direction. A credit that matches no queue pair is discarded
+        without any effect.
+
+        Args:
+            env: SimPy environment (not referenced in the body).
+        """
+        assert self._credit_inbox is not None
+        while True:
+            credit: IpcqCreditMetadata = yield self._credit_inbox.get()
+            for d, qp in self._queue_pairs.items():
+                if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
+                    # max() means a credit carrying an older sequence cannot
+                    # pull the cached tail backwards.
+                    qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq)
+                    # Wake any blocked send on this direction
+                    waiters = self._send_waiters.get(d, [])
+                    self._send_waiters[d] = []
+                    for ev in waiters:
+                        if not ev.triggered:
+                            ev.succeed()
+                    # Only the first matching queue pair is credited.
+                    break
+
+    def _delayed_credit_send(
+        self,
+        env: simpy.Environment,
+        direction: str,
+        peer_credit_store: simpy.Store,
+        new_tail: int,
+    ) -> Generator:
+        """Wait bottleneck-BW latency, then put IpcqCreditMetadata into peer
+        credit store (D9 fast path).
+
+        Carries ``dst_rx_base_pa`` = this PE's my_rx_base_pa for the
+        consumed direction. The peer (original sender) matches this against
+        qp.peer.rx_base_pa to identify the correct qp (ADR-0025 D3).
+
+        Args:
+            env: SimPy environment used for the latency timeout.
+            direction: Direction whose slot was consumed; selects the queue
+                pair and is echoed back as ``src_direction``.
+            peer_credit_store: Store the credit message is put into.
+            new_tail: Consumer sequence reported as ``consumer_seq``.
+        """
+        latency_ns = self._credit_latency_ns(direction)
+        # No timeout is scheduled at all when the modelled latency is zero.
+        if latency_ns > 0:
+            yield env.timeout(latency_ns)
+        qp = self._queue_pairs[direction]
+        meta = IpcqCreditMetadata(
+            consumer_seq=new_tail,
+            dst_rx_base_pa=qp["my_rx_base_pa"],
+            src_sip=self._self_sip,
+            src_cube=self._self_cube,
+            src_pe=self._self_pe,
+            src_direction=direction,
+        )
+        yield peer_credit_store.put(meta)
+
+    def _credit_latency_ns(self, direction: str) -> float:
+        """Compute credit fast path latency = credit_size / bottleneck_bw.
+
+        Falls back to 0 when ctx/router is unavailable (unit-test mode).
+
+        Args:
+            direction: Direction whose queue pair names the peer PE.
+
+        Returns:
+            The value of ``ctx.compute_drain_ns`` for the path from this PE
+            to the peer PE and ``self._credit_size_bytes``, or ``0.0`` when
+            ``self.ctx`` is None or the path/drain computation raises.
+        """
+        if self.ctx is None:
+            return 0.0
+        qp = self._queue_pairs[direction]
+        peer = qp["peer"]
+        # Build the peer's PE id in the "sipX.cubeY.peZ" form.
+        peer_pe_prefix = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}"
+        try:
+            path = self.ctx.router.find_path(self._pe_prefix, peer_pe_prefix)
+            return self.ctx.compute_drain_ns(path, self._credit_size_bytes)
+        except Exception:
+            # Best-effort: any failure here degrades to a zero-latency credit.
+            return 0.0
+
+    # ── Helpers ──
+
+    @staticmethod
+    def _nbytes_for(shape: tuple[int, ...], dtype: str) -> int:
+        """Return the byte size of a tensor with the given shape and dtype.
+
+        Args:
+            shape: Tensor dimensions; an empty shape yields 0.
+            dtype: One of "f16", "bf16", "f32", "i8", "i16", "i32". Any
+                other name is treated as a 16-bit type.
+
+        Returns:
+            ``prod(shape)`` times the element size in bytes, or 0 for an
+            empty shape.
+        """
+        from math import prod
+        bits = {"f16": 16, "bf16": 16, "f32": 32, "i8": 8, "i16": 16, "i32": 32}.get(dtype, 16)
+        return prod(shape) * (bits // 8) if shape else 0
@@ -1,6 +1,16 @@
+"""PE_MATH: element-wise / reduction computation engine (ADR-0021 D6).
+
+Handles both legacy PeInternalTxn (MathCmd) and pipeline TileToken.
+In pipeline mode, receives token after fetch stage, computes SIMD, chains to next.
+
+SIMD latency model (from pe_accel):
+  cycles = ceil(num_elements / vector_width)
+  latency_ns = cycles / clock_freq_ghz
+"""
 from __future__ import annotations

 from collections.abc import Generator
+from math import ceil
 from typing import TYPE_CHECKING, Any

 import simpy
@@ -14,15 +24,17 @@ if TYPE_CHECKING:


 class PeMathComponent(PeEngineBase):
-    """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).
+    """PE_MATH: SIMD/Vector unit (ADR-0021 D6).

-    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
-    exclusive with PE_GEMM within the same PE.
+    In pipeline mode: pure compute — register data already fetched.
+    In legacy mode: handles PeInternalTxn(MathCmd) with shared accel_slot.
    """

     def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        # _accel stays None here; init_resources() is where it may be bound.
         super().__init__(node, ctx)
         self._accel: simpy.Resource | None = None
+        # SIMD model parameters read from node attrs (defaults: 256 and 1.0).
+        self._vector_width: int = int(node.attrs.get("vector_width", 256))
+        self._clock_freq: float = float(node.attrs.get("clock_freq_ghz", 1.0))

    def init_resources(self, env: simpy.Environment) -> None:
        resource_name = self.node.attrs.get("shared_resource")
@@ -31,21 +43,83 @@ class PeMathComponent(PeEngineBase):
                env, f"{self._pe_prefix}.{resource_name}"
            )

+    def _compute_ns(self, num_elements: int) -> float:
+        """SIMD latency (pe_accel model).
+
+        Args:
+            num_elements: Number of elements to process.
+
+        Returns:
+            ``ceil(num_elements / vector_width) / clock_freq`` when the
+            vector width, the clock frequency and ``num_elements`` are all
+            positive; otherwise the node's ``overhead_ns`` attribute
+            (0.0 when unset).
+        """
+        if self._vector_width > 0 and self._clock_freq > 0 and num_elements > 0:
+            cycles = ceil(num_elements / self._vector_width)
+            return cycles / self._clock_freq
+        return float(self.node.attrs.get("overhead_ns", 0.0))
+
     def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        # Waits the node's fixed overhead_ns (0.0 when unset); nbytes is unused.
         overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
         yield env.timeout(overhead_ns)

-    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Drain the inbox forever and dispatch each message by type.
+
+        - ``TileToken`` goes to ``_pipeline_process``.
+        - ``PeInternalTxn`` goes to ``_handle_with_hooks``.
+        - Anything else goes to ``_forward_txn``.
+
+        Every message is started as its own SimPy process, so this loop
+        returns to the inbox without waiting for the handler to finish.
+
+        Args:
+            env: SimPy environment used to spawn the handler processes.
+        """
+        # Imported inside the method rather than at module level.
+        from kernbench.common.pe_commands import PeInternalTxn
+        from kernbench.components.builtin.pe_types import TileToken
+
+        while True:
+            msg: Any = yield self._inbox.get()
+            if isinstance(msg, TileToken):
+                env.process(self._pipeline_process(env, msg))
+            elif isinstance(msg, PeInternalTxn):
+                env.process(self._handle_with_hooks(env, msg))
+            else:
+                env.process(self._forward_txn(env, msg))
+
+    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
+        """Pipeline mode: pure SIMD compute, then self-route."""
+        self._on_process_start(env, token)
+
+        num_elements = token.params.get("num_elements", 0)
+
        if self._accel:
            with self._accel.request() as req:
                yield req
-                yield from self.run(env, 0)
+                ns = self._compute_ns(num_elements)
+                yield env.timeout(ns)
        else:
-            yield from self.run(env, 0)
+            ns = self._compute_ns(num_elements)
+            yield env.timeout(ns)
+
+        self._on_process_end(env, token)
+
+        # Self-routing
+        next_stage = token.advance()
+        if next_stage is not None:
+            yield self.out_ports[next_stage.component].put(token)
+        else:
+            token.pipeline_ctx.complete_tile()
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """PeInternalTxn handling for standalone MathCmd (CCL kernels).
+
+        Latency = max(overhead_ns, _compute_ns(num_elements)):
+        - overhead_ns: fixed per-invocation setup cost (from node attrs).
+        - _compute_ns: SIMD cycle-based model (from vector_width + clock_freq).
+        The larger of the two dominates (setup-bound vs compute-bound).
+
+        Args:
+            env: SimPy environment used for the latency timeout.
+            pe_txn: Transaction to serve; ``pe_txn.command`` is inspected and
+                ``pe_txn.done`` is succeeded once the latency has elapsed.
+        """
+        from kernbench.common.pe_commands import MathCmd
+        import math as _math
+
+        cmd = pe_txn.command
+        # Element count comes from the output shape; it stays 0 for a
+        # non-MathCmd command or an empty output shape.
+        num_elements = 0
+        if isinstance(cmd, MathCmd) and cmd.out.shape:
+            num_elements = _math.prod(cmd.out.shape)
+
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        compute_ns = self._compute_ns(num_elements)
+        ns = max(overhead_ns, compute_ns)
+
+        # With a shared accel resource the delay is served while holding it;
+        # without one the delay is applied directly.
+        if self._accel:
+            with self._accel.request() as req:
+                yield req
+                yield env.timeout(ns)
+        else:
+            yield env.timeout(ns)
         pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
-        """Transaction forwarding with accel_slot acquisition."""
        if self._accel:
            with self._accel.request() as req:
                yield req
@@ -1,3 +1,13 @@
+"""PE_SCHEDULER: plan generation + tile dispatch (ADR-0021 D2).
+
+Receives PeInternalTxn from PE_CPU, routes to engines:
+  - Simple commands (DmaReadCmd, GemmCmd, etc.) → direct dispatch to engine
+  - CompositeCmd → generate TilePlan, feed tiles via _feed_loop
+
+Composite pipeline uses token self-routing (ADR-0021 D4):
+  Scheduler only does initial dispatch + completion tracking.
+  Tiles chain through components based on their plan's stage sequence.
+"""
 from __future__ import annotations

 from collections.abc import Generator
@@ -14,29 +24,18 @@ if TYPE_CHECKING:


 class PeSchedulerComponent(ComponentBase):
-    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
+    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1, ADR-0021 D2).

-    Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
-      - DmaReadCmd / DmaWriteCmd → PE_DMA
-      - GemmCmd                  → PE_GEMM
-      - MathCmd                  → PE_MATH
-      - CompositeCmd             → tiled pipeline (Stage 3: ADR-0014 D3.2)
+    Simple commands are forwarded to the appropriate engine.
+    CompositeCmd creates a TilePlan and feeds tiles into the pipeline.

-    Composite GEMM pipeline (32x64x32 tiles):
-      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
-      with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
-
-    Applies scheduler overhead_ns before dispatching each command.
-    Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
+    Single _feed_loop process per scheduler ensures FIFO command ordering.
    """

-    # Scheduler tile dimensions (ADR-0014 D3.2)
    TILE_M = 32
    TILE_K = 64
    TILE_N = 32

-    # Command → engine suffix dispatch table.
-    # New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
    _CMD_DISPATCH: dict[type, str] = {}

    @classmethod
@@ -44,7 +43,6 @@ class PeSchedulerComponent(ComponentBase):
        if cls._CMD_DISPATCH:
            return
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
-
        cls._CMD_DISPATCH = {
            DmaReadCmd: "pe_dma",
            DmaWriteCmd: "pe_dma",
@@ -56,6 +54,13 @@ class PeSchedulerComponent(ComponentBase):
        super().__init__(node, ctx)
        self._pe_prefix = node.id.rsplit(".", 1)[0]
        self._ensure_dispatch_table()
+        self._pending_feeds: simpy.Store | None = None
+        self._pipeline_counter = 0
+
+    def start(self, env: simpy.Environment) -> None:
+        """Create the feed queue, start the base component, launch the feeder.
+
+        Args:
+            env: SimPy environment that owns the store and the feeder process.
+        """
+        # The store is created before super().start(env) — presumably so it
+        # exists before any worker can enqueue a plan; confirm against the base.
+        self._pending_feeds = simpy.Store(env)
+        super().start(env)
+        # One _feed_loop process is started per call to start().
+        env.process(self._feed_loop(env))

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
@@ -72,174 +77,103 @@ class PeSchedulerComponent(ComponentBase):
                yield from self._forward_txn(env, msg)

    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
-        """Route a PeInternalTxn to the correct engine via dispatch table."""
-        from kernbench.common.pe_commands import CompositeCmd
+        from kernbench.common.pe_commands import CompositeCmd, PeCpuOverheadCmd

-        # Scheduler overhead
-        yield from self.run(env, 0)
+        yield from self.run(env, 0)  # scheduler overhead

        cmd = pe_txn.command

-        # Check dispatch table first
+        # Simple command dispatch
        engine_suffix = self._CMD_DISPATCH.get(type(cmd))
        if engine_suffix is not None:
            yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
            return

-        # CompositeCmd: tiled pipeline (not a simple forward)
+        # CompositeCmd: generate plan and feed
        if isinstance(cmd, CompositeCmd):
-            yield from self._dispatch_composite(env, pe_txn)
+            yield from self._dispatch_composite(env, pe_txn, cmd)
+            return
+
+        if isinstance(cmd, PeCpuOverheadCmd):
+            yield env.timeout(cmd.cycles)
+            pe_txn.done.succeed()
            return

-        # Unknown command — signal done immediately
        pe_txn.done.succeed()

-    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
-        """Composite tiled pipeline (ADR-0014 D3.2).
+    def _dispatch_composite(
+        self, env: simpy.Environment, pe_txn: Any, cmd: Any,
+    ) -> Generator:
+        """Generate plan and enqueue to feeder. Non-blocking (ADR-0021 D4)."""
+        from kernbench.components.builtin.pe_types import PipelineContext

-        GEMM: 3-stage pipeline with b-tile streaming from HBM.
-        MATH: sequential compute + DMA_WRITE (no tiling).
+        plan = self._generate_plan(cmd)
+
+        self._pipeline_counter += 1
+        ctx = PipelineContext(
+            id=f"p{self._pipeline_counter}",
+            total_tiles=len(plan.tiles),
+            done_event=pe_txn.done,
+        )
+
+        # Enqueue to feeder — scheduler worker returns immediately
+        assert self._pending_feeds is not None
+        yield self._pending_feeds.put((plan, ctx))
+
+    def _feed_loop(self, env: simpy.Environment) -> Generator:
+        """Single feeder process: FIFO command ordering (ADR-0021 D2).
+
+        No tile feed interleaving between commands.
+        Queue full → only this process blocks.
        """
-        from kernbench.common.pe_commands import CompositeCmd
+        from kernbench.components.builtin.pe_types import TileToken
+
+        assert self._pending_feeds is not None
+        while True:
+            plan, ctx = yield self._pending_feeds.get()
+            for tile in plan.tiles:
+                first_stage = tile.stages[0]
+                token = TileToken(
+                    tile_id=tile.tile_id,
+                    pipeline_ctx=ctx,
+                    plan=tile,
+                    stage_idx=0,
+                    params=first_stage.params,
+                )
+                yield self.out_ports[first_stage.component].put(token)
+
+    def _generate_plan(self, cmd: Any) -> Any:
+        """Generate a PipelinePlan from CompositeCmd."""
+        from kernbench.components.builtin.tiling import (
+            generate_gemm_plan,
+            generate_math_plan,
+        )
+
+        pp = self._pe_prefix
+        bpe = 2  # default bytes per element (f16)

-        cmd = pe_txn.command
-        assert isinstance(cmd, CompositeCmd)
        if cmd.op == "gemm" and cmd.b is not None:
-            yield from self._pipeline_gemm(env, pe_txn, cmd)
+            a = cmd.a
+            b = cmd.b
+            M, K = a.shape[-2], a.shape[-1]
+            N = b.shape[-1]
+            return generate_gemm_plan(
+                M=M, K=K, N=N,
+                tile_m=self.TILE_M, tile_k=self.TILE_K, tile_n=self.TILE_N,
+                bytes_per_element=bpe,
+                A_addr=a.addr, B_addr=b.addr, C_addr=cmd.out_addr,
+                pe_prefix=pp,
+            )
        else:
-            yield from self._pipeline_math(env, pe_txn, cmd)
-
-    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
-        """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.
-
-        Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
-        Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)
-        Overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
-        """
-        from kernbench.common.pe_commands import (
-            DmaReadCmd,
-            DmaWriteCmd,
-            GemmCmd,
-            PeInternalTxn as PeTxn,
-            TensorHandle,
-        )
-
-        pp = self._pe_prefix
-        a = cmd.a   # already in TCM
-        b = cmd.b   # HBM reference (via tl.ref)
-
-        M, K_a = a.shape[-2], a.shape[-1]
-        K_b, N = b.shape[-2], b.shape[-1]
-        dtype = a.dtype
-        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2
-
-        # Tile counts
-        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
-        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
-        n_tiles = n_tiles_k * n_tiles_n
-
-        prev_compute_done = None
-        prev_write_done = None
-        total_dma_ns = 0.0
-        total_compute_ns = 0.0
-
-        for tile_idx in range(n_tiles):
-            tk = tile_idx // n_tiles_n
-            tn = tile_idx % n_tiles_n
-
-            k_start = tk * self.TILE_K
-            n_start = tn * self.TILE_N
-            tile_k = min(self.TILE_K, K_a - k_start)
-            tile_n = min(self.TILE_N, N - n_start)
-            tile_nbytes = tile_k * tile_n * dtype_bytes
-
-            # --- Stage 1: DMA_READ b_tile from HBM ---
-            read_done = env.event()
-            b_tile_addr = b.addr + (k_start * N + n_start) * dtype_bytes
-            b_tile_handle = TensorHandle(
-                id=f"b_tile_{tile_idx}", addr=b_tile_addr,
-                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
+            # Math composite
+            a = cmd.a
+            M = a.shape[-2] if len(a.shape) >= 2 else a.shape[0]
+            N = a.shape[-1] if len(a.shape) >= 2 else 1
+            return generate_math_plan(
+                M=M, N=N,
+                tile_m=self.TILE_M, tile_n=self.TILE_N,
+                bytes_per_element=bpe,
+                math_op=cmd.math_op or "identity",
+                src_addr=a.addr, dst_addr=cmd.out_addr,
+                pe_prefix=pp,
            )
-            read_cmd = DmaReadCmd(handle=b_tile_handle, src_addr=b_tile_addr, nbytes=tile_nbytes)
-            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
-            t0 = env.now
-            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)
-
-            # Wait for previous compute before starting this tile's compute
-            if prev_compute_done is not None:
-                yield prev_compute_done
-
-            # Wait for this tile's DMA_READ
-            yield read_done
-            total_dma_ns += env.now - t0
-
-            # --- Stage 2: COMPUTE (GEMM) ---
-            compute_done = env.event()
-            out_handle = TensorHandle(
-                id=f"out_tile_{tile_idx}", addr=0,
-                shape=(M, tile_n), dtype=dtype,
-                nbytes=M * tile_n * dtype_bytes,
-            )
-            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
-                                  m=M, k=tile_k, n=tile_n)
-            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
-            t0 = env.now
-            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)
-
-            # Wait for previous write (DMA_WRITE serialization)
-            if prev_write_done is not None:
-                yield prev_write_done
-
-            # Wait for compute of THIS tile
-            yield compute_done
-            total_compute_ns += env.now - t0
-            prev_compute_done = compute_done
-
-            # --- Stage 3: DMA_WRITE out_tile to HBM ---
-            write_done = env.event()
-            out_tile_pa = cmd.out_addr + n_start * dtype_bytes
-            write_nbytes = M * tile_n * dtype_bytes
-            write_cmd = DmaWriteCmd(handle=out_handle, dst_addr=out_tile_pa, nbytes=write_nbytes)
-            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
-            t0 = env.now
-            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
-            prev_write_done = write_done
-
-        # Wait for final write
-        if prev_write_done is not None:
-            t0 = env.now
-            yield prev_write_done
-            total_dma_ns += env.now - t0
-
-        pe_txn.result_data["dma_ns"] = total_dma_ns
-        pe_txn.result_data["compute_ns"] = total_compute_ns
-        pe_txn.done.succeed()
-
-    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
-        """Non-GEMM composite: sequential compute + DMA_WRITE (no tiling)."""
-        from kernbench.common.pe_commands import (
-            DmaWriteCmd,
-            MathCmd,
-            PeInternalTxn as PeTxn,
-        )
-
-        pp = self._pe_prefix
-
-        # Step 1: Compute (MATH)
-        compute_done = env.event()
-        compute_cmd = MathCmd(
-            op=cmd.math_op or "identity",
-            inputs=(cmd.a,), out=cmd.a,
-        )
-        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
-        yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
-        yield compute_done
-
-        # Step 2: DMA_WRITE result to HBM
-        write_done = env.event()
-        write_cmd = DmaWriteCmd(handle=cmd.a, dst_addr=cmd.out_addr, nbytes=cmd.out_nbytes)
-        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
-        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
-        yield write_done
-
-        pe_txn.done.succeed()
@@ -1,7 +1,18 @@
+"""PE_TCM: tightly-coupled memory with BW-based access serialization (ADR-0021).
+
+Models scratchpad memory inside the PE. Handles both legacy Transaction forwarding
+and TcmRequest from PE_FETCH_STORE for BW-serialized read/write access.
+
+Two channels (read/write) with independent serialization.
+Ported from pe_accel TcmBlock timing model.
+"""
 from __future__ import annotations

 from collections.abc import Generator
-from typing import TYPE_CHECKING
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import simpy

 from kernbench.components.base import ComponentBase

@@ -10,16 +21,62 @@ if TYPE_CHECKING:
    from kernbench.topology.types import Node


-class PeTcmComponent(ComponentBase):
-    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.
+@dataclass
+class TcmRequest:
+    """Request to read from or write to TCM (used by PE_FETCH_STORE)."""

-    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
-    Phase 0: applies overhead_ns and drain_ns at terminal.
+    direction: str      # "read" or "write"
+    nbytes: int
+    done: simpy.Event
+    tag: str = ""
+
+
+class PeTcmComponent(ComponentBase):
+    """PE_TCM: BW-serialized scratchpad memory (ADR-0021 D1).
+
+    Dual-channel: read and write can proceed in parallel,
+    but concurrent reads serialize, concurrent writes serialize.
+    BW from topology attrs or pe_template links.
    """

     def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
         super().__init__(node, ctx)
+        # Per-channel bandwidths from node attrs (default 512.0 each).
+        self._read_bw: float = float(node.attrs.get("read_bw_gbs", 512.0))
+        self._write_bw: float = float(node.attrs.get("write_bw_gbs", 512.0))
+        # Channel locks stay None until start() creates them.
+        self._read_res: simpy.Resource | None = None
+        self._write_res: simpy.Resource | None = None

-    def run(self, env, nbytes: int) -> Generator:
+    def start(self, env: simpy.Environment) -> None:
+        """Create the read and write channel resources, then start the base.
+
+        Args:
+            env: SimPy environment that owns the two resources.
+        """
+        # capacity=1 on each: one read and one write in flight at a time.
+        self._read_res = simpy.Resource(env, capacity=1)
+        self._write_res = simpy.Resource(env, capacity=1)
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        # Waits the node's fixed overhead_ns (0.0 when unset); nbytes is unused.
         overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
         yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Dispatch TcmRequest (from fetch_store) and Transaction (fabric).
+
+        Runs forever. A ``TcmRequest`` goes to ``_handle_tcm_request``; any
+        other message goes to ``_forward_txn``. Each message is started as
+        its own SimPy process, so the loop does not wait for the handler.
+
+        Args:
+            env: SimPy environment used to spawn the handler processes.
+        """
+        while True:
+            msg: Any = yield self._inbox.get()
+            if isinstance(msg, TcmRequest):
+                env.process(self._handle_tcm_request(env, msg))
+            else:
+                env.process(self._forward_txn(env, msg))
+
+    def _handle_tcm_request(self, env: simpy.Environment, req: TcmRequest) -> Generator:
+        """BW-serialized access: acquire channel, apply delay, signal done.
+
+        Args:
+            env: SimPy environment used for the transfer timeout.
+            req: Request to serve. ``req.direction == "write"`` selects the
+                write channel; every other value selects the read channel.
+                ``req.done`` is succeeded while the channel is still held.
+        """
+        if req.direction == "write":
+            res = self._write_res
+            bw = self._write_bw
+        else:
+            res = self._read_res
+            bw = self._read_bw
+
+        assert res is not None
+        with res.request() as lock:
+            yield lock
+            # Delay is nbytes / bw; with bw taken from the *_bw_gbs attrs this
+            # is in ns assuming 1 GB/s == 1 byte/ns — TODO confirm units.
+            # No delay is applied for a non-positive bandwidth or size.
+            if bw > 0 and req.nbytes > 0:
+                delay_ns = req.nbytes / bw
+                yield env.timeout(delay_ns)
+            req.done.succeed()
@@ -0,0 +1,115 @@
+"""PE pipeline types for ADR-0021: TileToken, TilePlan, Stage, PipelineContext.
+
+These types are used by the PE_SCHEDULER and all PE engine components
+for tile-based pipeline execution with self-routing.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    import simpy
+
+
+# ── Stage types ──────────────────────────────────────────────────────
+
+
+class StageType(Enum):
+    """Kind of work a pipeline ``Stage`` performs.
+
+    Values are assigned by ``auto()``. The GEMM plan generator documents the
+    per-tile order DMA_READ, FETCH, GEMM, STORE, with DMA_WRITE appended on
+    the last K-tile.
+    """
+
+    DMA_READ = auto()
+    FETCH = auto()
+    GEMM = auto()
+    MATH = auto()
+    STORE = auto()
+    DMA_WRITE = auto()
+
+
+@dataclass
+class Stage:
+    """One stage in a tile's execution plan."""
+
+    # Kind of work this stage performs.
+    stage_type: StageType
+    # The scheduler feeder and the engines use this string as the key into
+    # their out_ports mapping when they hand the token on.
+    component: str      # topology node ID (e.g. "sip0.cube0.pe0.pe_dma")
+    # Per-stage parameters; TileToken stores this same dict object (no copy)
+    # as its ``params`` while the token is at this stage.
+    params: dict = field(default_factory=dict)
+
+
+# ── Plan ─────────────────────────────────────────────────────────────
+
+
+@dataclass
+class TilePlan:
+    """Execution plan for a single tile (immutable stage sequence).
+
+    NOTE(review): ``stages`` is a tuple, but the dataclass is not frozen, so
+    the attribute itself can still be rebound.
+    """
+
+    # Identifier copied onto each TileToken built from this plan.
+    tile_id: int
+    # Stages in execution order; a token walks them by index.
+    stages: tuple[Stage, ...]
+
+
+@dataclass
+class PipelinePlan:
+    """Full pipeline plan for one CompositeCmd."""
+
+    # One entry per tile; the scheduler feeds them in list order and uses
+    # len(tiles) as the pipeline's total tile count.
+    tiles: list[TilePlan]
+    # Metadata for metrics
+    # (tile counts along M, K and N; all default to 0)
+    m_tiles: int = 0
+    k_tiles: int = 0
+    n_tiles: int = 0
+
+
+# ── Pipeline Context ─────────────────────────────────────────────────
+
+
+@dataclass
+class PipelineContext:
+    """Tracks completion of a pipeline (exactly-once contract).
+
+    Each tile's last stage calls complete_tile() exactly once.
+    When all tiles complete, done_event.succeed() is called.
+    """
+
+    # Pipeline identifier (the scheduler builds it as "p<counter>").
+    id: str
+    # Number of complete_tile() calls that finishes the pipeline.
+    total_tiles: int
+    # Running count of complete_tile() calls.
+    completed_tiles: int = 0
+    # Event fired on completion; None disables the notification.
+    done_event: Any = None  # simpy.Event
+
+    def complete_tile(self) -> None:
+        """Count one finished tile and fire ``done_event`` on the last one.
+
+        The check is an equality test, so the event fires only on the call
+        that brings the count to exactly ``total_tiles``; later calls only
+        increment the counter. With ``total_tiles == 0`` the event is never
+        fired from here.
+        """
+        self.completed_tiles += 1
+        if self.completed_tiles == self.total_tiles:
+            if self.done_event is not None:
+                self.done_event.succeed()
+
+
+# ── TileToken ────────────────────────────────────────────────────────
+
+
+@dataclass
+class TileToken:
+    """Self-routing tile token passed between PE components (ADR-0021 D9).
+
+    Single-owner: only one component holds this token at any time.
+    params is a cache of plan.stages[stage_idx].params (canonical source).
+    """
+
+    # Identifier of the tile this token represents.
+    tile_id: int
+    # Shared completion tracker for the pipeline this tile belongs to.
+    pipeline_ctx: PipelineContext
+    # Stage sequence this token walks.
+    plan: TilePlan
+    # Index into plan.stages of the stage currently being executed.
+    stage_idx: int
+    # Parameters of the current stage; advance() rebinds this to the next
+    # stage's dict (same object, not a copy).
+    params: dict = field(default_factory=dict)
+    data_op: bool = True     # op_log recording target (ADR-0020)
+
+    @property
+    def current_stage(self) -> Stage:
+        """Stage at ``stage_idx``.
+
+        Raises IndexError once advance() has moved past the last stage.
+        """
+        return self.plan.stages[self.stage_idx]
+
+    @property
+    def has_next_stage(self) -> bool:
+        """True when at least one stage follows the current one."""
+        return self.stage_idx + 1 < len(self.plan.stages)
+
+    def advance(self) -> Stage | None:
+        """Advance to next stage. Returns next Stage or None if last.
+
+        ``stage_idx`` is incremented unconditionally, so after the final
+        stage it points one past the end and ``params`` keeps the last
+        stage's dict.
+        """
+        self.stage_idx += 1
+        if self.stage_idx < len(self.plan.stages):
+            next_stage = self.plan.stages[self.stage_idx]
+            self.params = next_stage.params
+            return next_stage
+        return None
@@ -0,0 +1,176 @@
+"""Tile plan generators for PE pipeline (ADR-0021).
+
+Generates TilePlan with stage sequences for GEMM and Math operations.
+Ported from pe_accel tiling.py with stage-based plan structure.
+"""
+from __future__ import annotations
+
+from math import ceil
+
+from kernbench.components.builtin.pe_types import (
+    PipelinePlan,
+    Stage,
+    StageType,
+    TilePlan,
+)
+
+
+def generate_gemm_plan(
+    M: int, K: int, N: int,
+    tile_m: int, tile_k: int, tile_n: int,
+    bytes_per_element: int,
+    A_addr: int, B_addr: int, C_addr: int,
+    pe_prefix: str,
+) -> PipelinePlan:
+    """Build the GEMM pipeline plan, walking tiles in M→N→K order.
+
+    Stage sequence of every tile:
+      DMA_READ(A) → DMA_READ(B) → FETCH → GEMM → STORE
+      plus a trailing DMA_WRITE on the last K-tile of each (m, n) pair.
+
+    Byte counts always use the full tile dimensions; addresses are computed
+    as row-major element offsets scaled by bytes_per_element.
+
+    Args:
+        pe_prefix: e.g. "sip0.cube0.pe0" — used to build component IDs.
+
+    Returns:
+        PipelinePlan holding one TilePlan per (m, n, k) tile plus tile counts.
+    """
+    elem = bytes_per_element
+    count_m = max(1, ceil(M / tile_m))
+    count_k = max(1, ceil(K / tile_k))
+    count_n = max(1, ceil(N / tile_n))
+
+    # Component IDs of the PE units this plan routes through (pe_math unused).
+    dma = f"{pe_prefix}.pe_dma"
+    fetch_store = f"{pe_prefix}.pe_fetch_store"
+    gemm = f"{pe_prefix}.pe_gemm"
+
+    # Tile dimensions are fixed, so every tile moves the same byte counts.
+    bytes_a = tile_m * tile_k * elem
+    bytes_b = tile_k * tile_n * elem
+    bytes_c = tile_m * tile_n * elem
+
+    plans: list[TilePlan] = []
+    for mi in range(count_m):
+        for ni in range(count_n):
+            addr_c = C_addr + (mi * tile_m * N + ni * tile_n) * elem
+            for ki in range(count_k):
+                final_k = ki == count_k - 1
+                addr_a = A_addr + (mi * tile_m * K + ki * tile_k) * elem
+                addr_b = B_addr + (ki * tile_k * N + ni * tile_n) * elem
+
+                seq = [
+                    # DMA READ: A tile, HBM → TCM
+                    Stage(
+                        stage_type=StageType.DMA_READ,
+                        component=dma,
+                        params={
+                            "src_addr": addr_a, "nbytes": bytes_a,
+                            "operand": "A", "tile_m": tile_m, "tile_k": tile_k,
+                        },
+                    ),
+                    # DMA READ: B tile, HBM → TCM
+                    Stage(
+                        stage_type=StageType.DMA_READ,
+                        component=dma,
+                        params={
+                            "src_addr": addr_b, "nbytes": bytes_b,
+                            "operand": "B", "tile_k": tile_k, "tile_n": tile_n,
+                        },
+                    ),
+                    # FETCH: TCM → Register File
+                    Stage(
+                        stage_type=StageType.FETCH,
+                        component=fetch_store,
+                        params={
+                            "direction": "read",
+                            "nbytes": bytes_a + bytes_b,
+                        },
+                    ),
+                    # GEMM: MAC compute
+                    Stage(
+                        stage_type=StageType.GEMM,
+                        component=gemm,
+                        params={
+                            "m": tile_m, "k": tile_k, "n": tile_n,
+                            "is_last_k": final_k,
+                        },
+                    ),
+                    # STORE: Register File → TCM
+                    Stage(
+                        stage_type=StageType.STORE,
+                        component=fetch_store,
+                        params={
+                            "direction": "write",
+                            "nbytes": bytes_c,
+                        },
+                    ),
+                ]
+                # DMA WRITE: TCM → HBM (only on last K-tile)
+                if final_k:
+                    seq.append(Stage(
+                        stage_type=StageType.DMA_WRITE,
+                        component=dma,
+                        params={"dst_addr": addr_c, "nbytes": bytes_c},
+                    ))
+
+                # tile_id is the running index in emission order.
+                plans.append(TilePlan(tile_id=len(plans), stages=tuple(seq)))
+
+    return PipelinePlan(
+        tiles=plans, m_tiles=count_m, k_tiles=count_k, n_tiles=count_n,
+    )
+
+
+def generate_math_plan(
+    M: int, N: int,
+    tile_m: int, tile_n: int,
+    bytes_per_element: int,
+    math_op: str,
+    src_addr: int, dst_addr: int,
+    pe_prefix: str,
+) -> PipelinePlan:
+    """Build the tile plan for an element-wise math op over an M×N operand.
+
+    Every tile runs: DMA_READ → FETCH → MATH → STORE → DMA_WRITE
+    Byte counts always use the full tile_m × tile_n tile.
+    """
+    count_m = max(1, ceil(M / tile_m))
+    count_n = max(1, ceil(N / tile_n))
+
+    dma = f"{pe_prefix}.pe_dma"
+    fetch_store = f"{pe_prefix}.pe_fetch_store"
+    math_unit = f"{pe_prefix}.pe_math"
+
+    # Tile dimensions are fixed, so these are identical for every tile.
+    elems_per_tile = tile_m * tile_n
+    tile_nbytes = elems_per_tile * bytes_per_element
+
+    plans: list[TilePlan] = []
+    for mi in range(count_m):
+        for ni in range(count_n):
+            # Byte offset of this tile, shared by the source and destination.
+            shift = (mi * tile_m * N + ni * tile_n) * bytes_per_element
+
+            read_in = Stage(StageType.DMA_READ, dma, {
+                "src_addr": src_addr + shift, "nbytes": tile_nbytes,
+            })
+            fetch = Stage(StageType.FETCH, fetch_store, {
+                "direction": "read", "nbytes": tile_nbytes,
+            })
+            compute = Stage(StageType.MATH, math_unit, {
+                "op": math_op, "num_elements": elems_per_tile,
+            })
+            store = Stage(StageType.STORE, fetch_store, {
+                "direction": "write", "nbytes": tile_nbytes,
+            })
+            write_out = Stage(StageType.DMA_WRITE, dma, {
+                "dst_addr": dst_addr + shift, "nbytes": tile_nbytes,
+            })
+
+            sequence = (read_in, fetch, compute, store, write_out)
+            plans.append(TilePlan(tile_id=len(plans), stages=sequence))
+
+    return PipelinePlan(tiles=plans, m_tiles=count_m, n_tiles=count_n)
@@ -1,168 +0,0 @@
-"""Position-aware XBAR component.
-
-Models crossbar latency as base_overhead_ns + internal_distance * ns_per_mm,
-where internal_distance is the Manhattan distance between the entry port
-(PE router attachment) and exit port (HBM slice logical position) within
-the crossbar matrix.
-
-PE router positions come from cube_mesh.yaml (via ctx.spec["_mesh"]).
-HBM slice positions are uniformly distributed across the HBM physical width.
-"""
-from __future__ import annotations
-
-from collections.abc import Generator
-from typing import TYPE_CHECKING, Any
-
-import simpy
-
-from kernbench.components.base import ComponentBase
-
-if TYPE_CHECKING:
-    from kernbench.components.context import ComponentContext
-    from kernbench.topology.types import Node
-
-
-class PositionAwareXbarComponent(ComponentBase):
-    """XBAR with position-dependent latency based on PE-to-slice distance.
-
-    Latency = base_overhead_ns + |entry_port_x - exit_port_x| * ns_per_mm
-
-    Entry/exit port X positions are determined from the transaction path:
-      - PE_DMA nodes: router X from cube_mesh.yaml
-      - HBM slices: uniformly distributed across HBM physical width
-      - Bridge nodes: physical X from topology positions
-      - NOC: resolved by scanning path for PE_DMA node
-    """
-
-    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
-        super().__init__(node, ctx)
-        self._base_overhead_ns = float(node.attrs.get("overhead_ns", 0.0))
-        self._pe_router_xs: dict[str, float] = {}
-        self._slice_xs: dict[str, float] = {}
-        self._bridge_xs: dict[str, float] = {}
-        self._ns_per_mm: float = 0.0
-
-    def start(self, env: simpy.Environment) -> None:
-        self._build_position_map()
-        super().start(env)
-
-    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
-        yield env.timeout(self._base_overhead_ns)
-
-    # ── Position map construction ─────────────────────────────────
-
-    def _build_position_map(self) -> None:
-        if not self.ctx or not self.ctx.spec:
-            return
-        mesh = self.ctx.spec.get("_mesh")
-        if not mesh:
-            return
-
-        self._ns_per_mm = self.ctx.ns_per_mm
-        cube_prefix = self.node.id.rsplit(".", 1)[0]
-        xbar_name = self.node.id.rsplit(".", 1)[1]
-        is_top = xbar_name == "xbar_top"
-        xbar_key = "top" if is_top else "bottom"
-
-        # PE router X positions from mesh attachments
-        routers_list = mesh.get("xbar", {}).get(xbar_key, {}).get("routers", [])
-        for router_id in routers_list:
-            router_data = mesh["routers"].get(router_id)
-            if router_data is None:
-                continue
-            router_x = router_data["pos_mm"][0]
-            for attach in router_data.get("attach", []):
-                if attach.endswith(".dma"):
-                    pe_name = attach.split(".")[0]
-                    pe_dma_id = f"{cube_prefix}.{pe_name}.pe_dma"
-                    self._pe_router_xs[pe_dma_id] = router_x
-
-        # HBM slice X positions: uniformly distributed across HBM width
-        cube_spec = self.ctx.spec.get("cube", {})
-        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
-        hbm_w = cube_spec.get("geometry", {}).get("hbm_mm", {}).get("w", 9.0)
-        n_slices = cube_spec.get("memory_map", {}).get("hbm_slices_per_cube", 8)
-        half = n_slices // 2
-        hbm_left = (cube_w - hbm_w) / 2
-
-        if is_top:
-            slice_range = range(half)
-        else:
-            slice_range = range(half, n_slices)
-
-        n = len(list(slice_range))
-        for i, sl in enumerate(slice_range):
-            if n > 1:
-                x = hbm_left + i * hbm_w / (n - 1)
-            else:
-                x = cube_w / 2
-            self._slice_xs[f"{cube_prefix}.hbm_ctrl.slice{sl}"] = x
-
-        # Bridge X positions from topology positions
-        for node_id, pos in self.ctx.positions.items():
-            if node_id.startswith(cube_prefix + ".bridge.") and pos is not None:
-                origin_x = self._cube_origin_x()
-                self._bridge_xs[node_id] = pos[0] - origin_x
-
-    def _cube_origin_x(self) -> float:
-        """Compute absolute X origin of this cube."""
-        parts = self.node.id.split(".")
-        cube_str = [p for p in parts if p.startswith("cube")][0]
-        cube_id = int(cube_str[4:])
-        spec = self.ctx.spec
-        sip_spec = spec.get("sip", {})
-        cube_spec = spec.get("cube", {})
-        mesh_w = sip_spec.get("cube_mesh", {}).get("w", 4)
-        cube_w = cube_spec.get("geometry", {}).get("cube_mm", {}).get("w", 17.0)
-        seam = sip_spec.get("links", {}).get("inter_cube_mesh", {}).get(
-            "distance_mm_across_seam", 1.0)
-        col = cube_id % mesh_w
-        return col * (cube_w + seam)
-
-    # ── Worker override ───────────────────────────────────────────
-
-    def _worker(self, env: simpy.Environment) -> Generator:
-        while True:
-            txn: Any = yield self._inbox.get()
-            env.process(self._position_aware_forward(env, txn))
-
-    def _position_aware_forward(
-        self, env: simpy.Environment, txn: Any,
-    ) -> Generator:
-        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
-        next_hop = txn.next_hop
-
-        overhead = self._base_overhead_ns
-        if prev_hop and next_hop and self._ns_per_mm > 0:
-            entry_x = self._get_port_x(prev_hop, txn.path)
-            exit_x = self._get_port_x(next_hop, txn.path)
-            if entry_x is not None and exit_x is not None:
-                overhead = self._base_overhead_ns + abs(entry_x - exit_x) * self._ns_per_mm
-
-        yield env.timeout(overhead)
-
-        if next_hop:
-            yield self.out_ports[next_hop].put(txn.advance())
-        else:
-            drain = getattr(txn, "drain_ns", 0.0)
-            if drain > 0:
-                yield env.timeout(drain)
-            txn.done.succeed()
-
-    def _get_port_x(self, node_id: str, path: list[str]) -> float | None:
-        """Resolve the X position of an XBAR port from node context."""
-        # Direct lookup: PE DMA
-        if node_id in self._pe_router_xs:
-            return self._pe_router_xs[node_id]
-        # Direct lookup: HBM slice
-        if node_id in self._slice_xs:
-            return self._slice_xs[node_id]
-        # Direct lookup: bridge
-        if node_id in self._bridge_xs:
-            return self._bridge_xs[node_id]
-        # NOC: scan path for PE DMA node
-        if "noc" in node_id:
-            for p in path:
-                if p in self._pe_router_xs:
-                    return self._pe_router_xs[p]
-        return None
@@ -24,6 +24,8 @@ class ComponentContext:
    ns_per_mm: float  # wire propagation constant (from topology spec)
    edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
    spec: dict = field(default_factory=dict)  # topology spec (cube layout, PE count, etc.)
+    memory_store: Any = None  # MemoryStore for Phase 1 data-aware execution (ADR-0020)
+    op_logger: Any = None     # OpLogger for Phase 1 op recording (ADR-0020)

    def get_shared_resource(
        self, env: simpy.Environment, key: str, capacity: int = 1,
@@ -0,0 +1,2 @@
+# Legacy component backups — not actively used.
+# Kept for reference during ADR-0021 migration.
@@ -0,0 +1,34 @@
+"""Concrete component implementations.
+
+Loaded from components.yaml via ComponentRegistry.load_components_yaml().
+Manual imports are no longer needed — add new impls to components.yaml.
+
+Classes are still importable from this package via lazy __getattr__.
+"""
+
+from kernbench.components.base import ComponentRegistry
+
+ComponentRegistry.load_components_yaml()  # runs at import time of this package
+
+# Lazy re-export: allow `from kernbench.components.builtin import FooComponent`
+# without eagerly importing every module. Filled on demand by _build_class_map().
+_CLASS_MAP: dict[str, str] = {}  # ClassName → "module.path:ClassName"
+
+
+def _build_class_map() -> None:
+    """Populate _CLASS_MAP from ComponentRegistry._lazy while it is still empty."""
+    if not _CLASS_MAP:
+        for spec in ComponentRegistry._lazy.values():
+            _, cls_name = spec.rsplit(":", 1)
+            _CLASS_MAP[cls_name] = spec
+
+
+def __getattr__(name: str):
+    """Lazily import a registered component class (PEP 562 module hook)."""
+    _build_class_map()
+    class_path = _CLASS_MAP.get(name)
+    if class_path is None:  # AttributeError keeps hasattr()/getattr(default) working
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    import importlib
+    module_path, class_name = class_path.rsplit(":", 1)
+    return getattr(importlib.import_module(module_path), class_name)
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class TransitComponent(ComponentBase):
+    """Pass-through component used for NOC, UCIe, XBAR nodes.
+
+    run() only models the overhead_ns processing delay read from node.attrs;
+    forwarding to the next hop is left to the inherited _forward_txn().
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = self.node.attrs.get("overhead_ns", 0.0)
+        yield env.timeout(float(delay_ns))
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class HbmCtrlComponent(ComponentBase):
+    """HBM controller: terminal component that models HBM access latency.
+
+    Dual-channel model: separate read and write resources (capacity taken from
+    node.attrs["capacity"], default 1) so a read and a write can proceed
+    concurrently; requests on the same channel queue on that resource.
+
+    On completion the outcome is routed by _send_response(), which in most
+    cases sends a response Transaction back along the reversed path.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._read: simpy.Resource | None = None  # created in start()
+        self._write: simpy.Resource | None = None  # created in start()
+
+    def start(self, env: simpy.Environment) -> None:
+        capacity = int(self.node.attrs.get("capacity", 1))  # shared by both channels
+        self._read = simpy.Resource(env, capacity=capacity)
+        self._write = simpy.Resource(env, capacity=capacity)
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))  # nbytes is not used
+        yield env.timeout(overhead_ns)
+
+    def _select_channel(self, txn: Any) -> simpy.Resource:
+        """Pick the write channel for MemoryWriteMsg / writing PeDmaMsg, else read."""
+        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+
+        assert self._read is not None and self._write is not None  # start() must have run
+        req = txn.request
+        if isinstance(req, MemoryWriteMsg):
+            return self._write
+        if isinstance(req, PeDmaMsg) and req.is_write:
+            return self._write
+        return self._read
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Dispatch each incoming txn to a concurrent process for channel-level parallelism."""
+        while True:
+            txn: Any = yield self._inbox.get()
+            env.process(self._handle_txn(env, txn))
+
+    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Acquire channel, run, apply drain, then release and send the response."""
+        channel = self._select_channel(txn)
+        with channel.request() as req:
+            yield req
+            yield from self.run(env, txn.nbytes)
+            drain = getattr(txn, "drain_ns", 0.0)  # optional; spent while holding the channel
+            if drain > 0:
+                yield env.timeout(drain)
+        yield from self._send_response(env, txn)  # channel is already released here
+
+    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Route completion based on request type and path.
+
+        - PeDmaMsg: zero-byte response on the reverse path carrying txn.done.
+        - Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
+          data back on reverse path with original done event.
+        - M_CPU DMA path: ResponseMsg on reverse path (fresh done event).
+        """
+        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
+
+        if isinstance(txn.request, PeDmaMsg):
+            reverse_path = list(reversed(txn.path))
+            if len(reverse_path) >= 2:
+                resp_txn = Transaction(
+                    request=txn.request, path=reverse_path, step=0,
+                    nbytes=0, done=txn.done, is_response=True,
+                )
+                yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                return
+            txn.done.succeed()  # path too short to route a response
+            return
+
+        # Bypass path: no node id in the transaction path contains "m_cpu"
+        is_bypass = not any("m_cpu" in n for n in txn.path)
+        if is_bypass:
+            if isinstance(txn.request, MemoryReadMsg):
+                # D2H: send data back on reverse path to pcie_ep
+                reverse_path = list(reversed(txn.path))
+                if len(reverse_path) >= 2:
+                    resp_txn = Transaction(
+                        request=txn.request, path=reverse_path, step=0,
+                        nbytes=txn.request.nbytes, done=txn.done,  # NOTE(review): no is_response flag here — confirm intended
+                    )
+                    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+                    return
+            # MemoryWrite bypass or short path: done
+            txn.done.succeed()
+            return
+
+        # M_CPU DMA path: send ResponseMsg for aggregation (txn.done is not succeeded here)
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) >= 2 and self.ctx:
+            from kernbench.runtime_api.kernel import ResponseMsg
+
+            parts = self.node.id.split(".")  # assumes an id like "sipN.cubeM...." — TODO confirm
+            cube_id = int(parts[1].replace("cube", ""))
+            pe_id = 0  # single hbm_ctrl, PE info from request
+            resp_msg = ResponseMsg(
+                correlation_id=txn.request.correlation_id,
+                request_id=txn.request.request_id,
+                src_cube=cube_id, src_pe=pe_id, success=True,
+            )
+            resp_txn = Transaction(
+                request=resp_msg, path=reverse_path, step=0,
+                nbytes=0, done=env.event(), is_response=True,
+            )
+            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+        else:
+            txn.done.succeed()
@@ -0,0 +1,157 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class IoCpuComponent(ComponentBase):
+    """IO_CPU component: multi-cube fan-out with response aggregation.
+
+    Forward path:
+      1. Applies overhead_ns processing overhead.
+      2. Resolves target cube(s) from request.target_cubes.
+      3. Fans out sub-Transactions to each target cube's M_CPU.
+
+    Response path:
+      Collects ResponseMsg from each M_CPU. When all cube responses are
+      received, succeeds the parent txn.done.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Pending fan-out tracking: request_id → (expected, received, parent_txn_done)
+        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        while True:
+            txn: Any = yield self._inbox.get()
+            if getattr(txn, "is_response", False):
+                self._collect_response(txn)
+            else:
+                yield from self.run(env, txn.nbytes)
+                env.process(self._dispatch_to_m_cpus(env, txn))
+
+    def _collect_response(self, resp_txn: Any) -> None:
+        """Receive a cube response and increment the aggregation counter."""
+        key = resp_txn.request.request_id
+        if key not in self._pending:
+            return
+        expected, received, parent_done = self._pending[key]
+        received += 1
+        if received >= expected:
+            parent_done.succeed()
+            del self._pending[key]
+        else:
+            self._pending[key] = (expected, received, parent_done)
+
+    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan out sub-Transactions to the M_CPU of every reachable target cube.
+
+        Routes are resolved first so the expected-response count only covers
+        cubes that really get a sub-Transaction; with none, txn.done succeeds.
+        """
+        request = txn.request
+        try:
+            cube_targets = self._resolve_cube_targets(request)
+        except Exception:
+            txn.done.succeed()
+            return
+
+        paths = []
+        for sip, cube in cube_targets:
+            try:
+                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
+                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
+            except Exception:
+                continue  # best effort: unroutable cubes are skipped
+            if len(path) >= 2:
+                paths.append(path)
+        if not paths:
+            txn.done.succeed()
+            return
+        # Register before the first put so no response can arrive untracked.
+        self._pending[request.request_id] = (len(paths), 0, txn.done)
+        for path in paths:
+            sub_txn = Transaction(
+                request=request, path=path, step=0, nbytes=txn.nbytes,
+                done=env.event(), result_data=txn.result_data,
+            )
+            yield self.out_ports[path[1]].put(sub_txn.advance())
+
+    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
+        """Return list of (sip, cube) pairs to fan out to."""
+        from kernbench.runtime_api.kernel import (
+            KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, MmuMapMsg, MmuUnmapMsg,
+        )
+
+        target_cubes = getattr(request, "target_cubes", "all")
+
+        if isinstance(request, MemoryWriteMsg):
+            sip = request.dst_sip
+            if target_cubes == "all":
+                cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube)
+                return [(sip, cube)]
+            return [(sip, c) for c in target_cubes]
+
+        if isinstance(request, MemoryReadMsg):
+            sip = request.src_sip
+            if target_cubes == "all":
+                cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube)
+                return [(sip, cube)]
+            return [(sip, c) for c in target_cubes]
+
+        if isinstance(request, KernelLaunchMsg):
+            my_sip = self._my_sip()
+            if target_cubes != "all":
+                return [(my_sip, c) for c in target_cubes]
+            # "all": derive from tensor shards, filtered to this SIP
+            seen: set[tuple[int, int]] = set()
+            targets: list[tuple[int, int]] = []
+            for arg in request.args:
+                if arg.arg_kind != "tensor":
+                    continue
+                for shard in arg.shards:
+                    if shard.sip != my_sip:
+                        continue
+                    key = (shard.sip, shard.cube)
+                    if key not in seen:
+                        seen.add(key)
+                        targets.append(key)
+            return targets
+
+        if isinstance(request, (MmuMapMsg, MmuUnmapMsg)):
+            my_sip = self._my_sip()
+            if target_cubes == "all":
+                n_cubes = 16
+                if self.ctx and self.ctx.spec:
+                    sips = self.ctx.spec.get("system", {}).get("sips", {})
+                    n_cubes = sips.get("cubes_per_sip", 16)
+                return [(my_sip, c) for c in range(n_cubes)]
+            return [(my_sip, c) for c in target_cubes]
+
+        return []
+
+    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
+        """Extract cube_id from a physical address, with fallback."""
+        from kernbench.policy.address.phyaddr import PhysAddr
+        try:
+            return PhysAddr.decode(pa_val).cube_id
+        except Exception:
+            return fallback
+
+    def _my_sip(self) -> int:
+        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' → 0)."""
+        return int(self.node.id.split(".")[0].replace("sip", ""))
@@ -0,0 +1,327 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class MCpuComponent(ComponentBase):
+    """M_CPU component: per-cube fan-out with response aggregation.
+
+    Forward path (ADR-0015 D5):
+      When a forward Transaction arrives at m_cpu with no further hop (m_cpu is
+      the terminal hop), the request type selects the fan-out:
+        * KernelLaunchMsg         → one sub-Transaction per target PE_CPU
+        * MmuMapMsg / MmuUnmapMsg → one sub-Transaction per target PE_MMU
+        * anything else           → DMA sub-Transaction(s) to the HBM node(s)
+      target_pe on the request controls PE fan-out: int → single PE,
+      tuple → the listed PEs, anything else (e.g. "all") → all PEs in the cube.
+
+    Response path:
+      Every Transaction flagged is_response is consumed here and counted
+      against the pending fan-out with the same request_id. Once all expected
+      responses are collected, m_cpu sends an aggregate ResponseMsg on the
+      reverse command path back to io_cpu.
+
+    Transit:
+      A forward Transaction that still has a next hop is forwarded normally.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Pending fan-out tracking: request_id → (expected, received, all_done_event)
+        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
+        # Parent txn of each in-flight fan-out: request_id → parent_txn
+        self._parent_txns: dict[str, Any] = {}
+        # DMA engine resources (ADR-0015 D5, ADR-0014 D4): capacity=1 each,
+        # created in start() once the simulation environment is known
+        self._dma_write: simpy.Resource | None = None
+        self._dma_read: simpy.Resource | None = None
+
+    def start(self, env: simpy.Environment) -> None:
+        """Create the two DMA resources, then run the base-class start."""
+        self._dma_write = simpy.Resource(env, capacity=1)
+        self._dma_read = simpy.Resource(env, capacity=1)
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Model M_CPU processing as a delay of ``overhead_ns`` (node attr, default 0)."""
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Dispatch forward txns, collect response txns."""
+        from kernbench.runtime_api.kernel import KernelLaunchMsg, MmuMapMsg, MmuUnmapMsg
+
+        while True:
+            txn: Any = yield self._inbox.get()
+            if getattr(txn, "is_response", False):
+                self._collect_response(txn)
+            else:
+                yield from self.run(env, txn.nbytes)
+                next_hop = txn.next_hop
+                if next_hop:
+                    # Transit: m_cpu is not the terminal hop
+                    yield self.out_ports[next_hop].put(txn.advance())
+                elif self.ctx is not None and txn.request is not None:
+                    # Terminal hop: fan-outs run as their own processes so the
+                    # worker keeps draining the inbox (responses arrive there)
+                    if isinstance(txn.request, KernelLaunchMsg):
+                        env.process(self._kernel_launch_fanout(env, txn))
+                    elif isinstance(txn.request, (MmuMapMsg, MmuUnmapMsg)):
+                        env.process(self._mmu_msg_fanout(env, txn))
+                    else:
+                        env.process(self._dma_fanout(env, txn))
+                else:
+                    txn.done.succeed()
+
+    def _collect_response(self, resp_txn: Any) -> None:
+        """Count one response against its pending fan-out.
+
+        Responses whose request_id has no pending entry are ignored. When the
+        received count reaches the expected count, the fan-out's completion
+        event is triggered and the entry is removed.
+        """
+        key = resp_txn.request.request_id
+        if key not in self._pending:
+            return
+        expected, received, all_done = self._pending[key]
+        received += 1
+        if received >= expected:
+            all_done.succeed()
+            del self._pending[key]
+        else:
+            self._pending[key] = (expected, received, all_done)
+
+    def _send_aggregate_response(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Send one aggregate ResponseMsg for ``txn`` on its reversed path.
+
+        If the reversed path has fewer than two nodes there is nowhere to send
+        a response, so ``txn.done`` is triggered directly instead.
+        """
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) < 2:
+            txn.done.succeed()
+            return
+
+        from kernbench.runtime_api.kernel import ResponseMsg
+
+        request = txn.request
+        # Node id looks like "sip0.cube0.m_cpu": the second field carries the cube
+        parts = self.node.id.split(".")
+        cube_id = int(parts[1].replace("cube", ""))
+        resp_msg = ResponseMsg(
+            correlation_id=request.correlation_id,
+            request_id=request.request_id,
+            src_cube=cube_id, src_pe=-1, success=True,
+        )
+        resp_txn = Transaction(
+            request=resp_msg, path=reverse_path, step=0,
+            nbytes=0, done=env.event(), is_response=True,
+        )
+        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+
+    def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan out DMA sub-Transactions to target PE(s), wait for responses,
+        then send aggregate response on reverse command path.
+
+        Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014 D4),
+        so multi-PE fan-out is serialized through the DMA engine.
+
+        Destinations without a usable route (router raised, or path shorter
+        than two nodes) are skipped. Only sub-transactions that are actually
+        sent count towards the expected responses; if none can be sent the
+        parent transaction is completed immediately.
+        """
+        from kernbench.runtime_api.kernel import MemoryWriteMsg
+
+        request = txn.request
+        target_pe = getattr(request, "target_pe", "all")
+
+        # Resolve every route before registering the aggregation, so that the
+        # expected count never includes a destination that is later skipped
+        # (which would leave `all_done` untriggered forever).
+        routes: list[Any] = []
+        for dst_node in self._resolve_dma_destinations(request, target_pe):
+            try:
+                dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
+            except Exception:
+                continue
+            if len(dma_path) >= 2:
+                routes.append(dma_path)
+
+        if not routes:
+            txn.done.succeed()
+            return
+
+        # Setup aggregation
+        all_done = env.event()
+        self._pending[request.request_id] = (len(routes), 0, all_done)
+        self._parent_txns[request.request_id] = txn
+
+        # Select DMA resource based on operation type
+        dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read
+
+        # Fan out DMA sub-txns (serialized through DMA resource)
+        max_drain_ns = 0.0
+        for dma_path in routes:
+            drain_ns = self.ctx.compute_drain_ns(dma_path, txn.nbytes)
+            max_drain_ns = max(max_drain_ns, drain_ns)
+            sub_txn = Transaction(
+                request=request, path=dma_path, step=0,
+                nbytes=txn.nbytes, done=env.event(),
+                drain_ns=drain_ns,
+            )
+            with dma_res.request() as req:
+                yield req
+                yield self.out_ports[dma_path[1]].put(sub_txn.advance())
+
+        # Wait for all responses
+        yield all_done
+        txn.result_data["xfer_ns"] = max_drain_ns
+        del self._parent_txns[request.request_id]
+
+        # Send aggregate response on reverse command path
+        yield from self._send_aggregate_response(env, txn)
+
+    def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).
+
+        Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
+        PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
+        Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
+
+        PEs without a usable route are skipped; if no PE is reachable the
+        parent transaction is completed immediately.
+        """
+        request = txn.request
+        target_pe = getattr(request, "target_pe", "all")
+        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
+
+        # Resolve paths first (no simulation time passes while doing so)
+        paths: list[Any] = []
+        for pe_id in self._resolve_pe_ids(target_pe):
+            pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
+            try:
+                path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
+            except Exception:
+                continue
+            if len(path) >= 2:
+                paths.append(path)
+
+        if not paths:
+            txn.done.succeed()
+            return
+
+        # Register the aggregation BEFORE dispatching: _collect_response drops
+        # responses with no pending entry, so a response that arrives while the
+        # dispatch loop is still yielding must already find its entry.
+        all_done = env.event()
+        self._pending[request.request_id] = (len(paths), 0, all_done)
+        self._parent_txns[request.request_id] = txn
+
+        # Fan out to each PE_CPU
+        sub_txns: list[Transaction] = []
+        for path in paths:
+            sub_txn = Transaction(
+                request=request, path=path, step=0,
+                nbytes=0, done=env.event(),
+            )
+            yield self.out_ports[path[1]].put(sub_txn.advance())
+            sub_txns.append(sub_txn)
+
+        # Wait for all PE_CPU responses via NOC
+        yield all_done
+        del self._parent_txns[request.request_id]
+
+        # Aggregate PE-internal metrics (max across PEs); sub_txns is non-empty here
+        for metric in ("pe_exec_ns", "dma_ns", "compute_ns"):
+            txn.result_data[metric] = max(
+                st.result_data.get(metric, 0.0) for st in sub_txns
+            )
+
+        # Send aggregate response on reverse command path back to IO_CPU
+        yield from self._send_aggregate_response(env, txn)
+
+    def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
+        """Return list of HBM destination node_ids for DMA fan-out.
+
+        With single hbm_ctrl per cube (ADR-0019), always returns one node.
+        PA-based resolution still used for cross-cube routing.
+
+        ``target_pe`` is accepted for interface compatibility but is not used
+        by the resolution below.
+        """
+        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
+
+        # PA-based resolution: extract actual target from physical address.
+        # NOTE(review): `or` treats a dst_pa of 0 the same as a missing one and
+        # falls through to src_pa — confirm 0 is never a real destination PA.
+        pa_val = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None)
+        if pa_val is not None:
+            from kernbench.policy.address.phyaddr import PhysAddr
+            try:
+                pa = PhysAddr.decode(pa_val)
+                return [self.ctx.resolver.resolve(pa)]
+            except Exception:
+                # Best-effort: fall back to the local hbm_ctrl below
+                pass
+
+        # Default: single hbm_ctrl in local cube
+        return [f"{cube_prefix}.hbm_ctrl"]
+
+    def _mmu_msg_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan out MmuMapMsg/MmuUnmapMsg to target PE_MMU(s) via NOC.
+
+        Routes through find_node_path (M_CPU → NOC → PE_MMU command edges).
+        PE_MMU is a terminal node — completes the transaction directly, so
+        completion is tracked through each sub-transaction's ``done`` event
+        rather than through response aggregation.
+        """
+        request = txn.request
+        target_pe = getattr(request, "target_pe", "all")
+        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
+        pe_ids = self._resolve_pe_ids(target_pe)
+
+        if not pe_ids:
+            txn.done.succeed()
+            return
+
+        # Fan out to each PE_MMU
+        sub_dones: list[simpy.Event] = []
+        for pe_id in pe_ids:
+            pe_mmu_id = f"{cube_prefix}.pe{pe_id}.pe_mmu"
+            try:
+                path = self.ctx.router.find_node_path(self.node.id, pe_mmu_id)
+            except Exception:
+                continue
+            if len(path) < 2:
+                continue
+            sub_done = env.event()
+            sub_txn = Transaction(
+                request=request, path=path, step=0,
+                nbytes=0, done=sub_done,
+            )
+            yield self.out_ports[path[1]].put(sub_txn.advance())
+            sub_dones.append(sub_done)
+
+        # Wait for all PE_MMUs to complete
+        for sd in sub_dones:
+            yield sd
+
+        # Send aggregate response on reverse path
+        yield from self._send_aggregate_response(env, txn)
+
+    def _resolve_pe_ids(self, target_pe: int | tuple | str) -> list[int]:
+        """Return list of PE IDs to fan out to (kernel launch and MMU fan-out).
+
+        int → that single PE; tuple → its elements; any other value (e.g.
+        "all") → ``range(hbm_slices_per_cube)`` read from the spec's
+        ``cube.memory_map`` section, defaulting to 8.
+        """
+        if isinstance(target_pe, int):
+            return [target_pe]
+        if isinstance(target_pe, tuple):
+            return list(target_pe)
+        # "all": all PEs in local cube
+        n_slices = 8
+        if self.ctx and self.ctx.spec:
+            mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
+            n_slices = mm.get("hbm_slices_per_cube", 8)
+        return list(range(n_slices))
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PcieEpComponent(ComponentBase):
+    """PCIe endpoint component.
+
+    ``run`` models PCIe protocol handling as a fixed delay of ``overhead_ns``
+    taken from ``node.attrs``. Nothing else is overridden here, so forwarding
+    is whatever the base class provides (inherited ``_forward_txn()``).
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Wait ``overhead_ns`` (0.0 when the attribute is absent); ``nbytes`` is unused."""
+        attrs = self.node.attrs
+        delay_ns = float(attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeCpuComponent(ComponentBase):
+    """PE_CPU: kernel execution controller (Stage 2).
+
+    A KernelLaunchMsg is executed in one of two modes:
+      * greenlet mode (ADR-0020), used when the context exposes a
+        ``memory_store``: the kernel runs through KernelRunner.
+      * legacy two-phase mode (ADR-0014 D1), used otherwise:
+          Phase 1 (compile): run the kernel with a TLContext to generate a
+                             PeCommand list.
+          Phase 2 (replay):  iterate commands, dispatch to PE_SCHEDULER via
+                             PeInternalTxn, wait for blocking commands.
+
+    Non-kernel Transactions are forwarded normally.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"
+        fields = node.id.split(".")
+        # Each index falls back to 0 when its part of the id is missing or non-numeric
+        self._pe_idx = self._index_or_zero(lambda: self._pe_prefix.rsplit("pe", 1)[1])
+        # sip/cube index are needed for multi-SIP/cube shard matching
+        self._sip_idx = self._index_or_zero(lambda: fields[0].replace("sip", ""))
+        self._cube_idx = self._index_or_zero(lambda: fields[1].replace("cube", ""))
+
+    @staticmethod
+    def _index_or_zero(extract: Any) -> int:
+        """Return ``int(extract())``, or 0 on IndexError / ValueError."""
+        try:
+            return int(extract())
+        except (IndexError, ValueError):
+            return 0
+
+    def _find_shard(self, shards: tuple) -> Any:
+        """Return the shard whose (sip, cube, pe) equals this PE's indices.
+
+        When no shard matches, fall back to the positional entry at
+        ``min(pe_idx, len(shards) - 1)``.
+        """
+        hit = next(
+            (
+                s for s in shards
+                if s.sip == self._sip_idx
+                and s.cube == self._cube_idx
+                and s.pe == self._pe_idx
+            ),
+            None,
+        )
+        if hit is not None:
+            return hit
+        fallback_pos = min(self._pe_idx, len(shards) - 1)
+        return shards[fallback_pos]
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Wait ``overhead_ns`` from the node attrs (default 0.0); ``nbytes`` is unused."""
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Inbox loop: execute kernel launches here, forward everything else."""
+        from kernbench.runtime_api.kernel import KernelLaunchMsg
+
+        while True:
+            txn: Any = yield self._inbox.get()
+            if isinstance(getattr(txn, "request", None), KernelLaunchMsg):
+                yield from self._execute_kernel(env, txn)
+            else:
+                yield from self._forward_txn(env, txn)
+
+    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Execute kernel: greenlet mode (ADR-0020) or legacy Phase 0 + replay.
+
+        Writes ``pe_exec_ns`` (elapsed simulation time of the execution) plus
+        the summed ``dma_ns`` / ``compute_ns`` of all composite results into
+        ``txn.result_data``, then sends the response on the reverse path.
+        """
+        from kernbench.triton_emu.registry import get_kernel
+
+        request = txn.request
+        yield from self.run(env, 0)
+
+        kernel_fn = get_kernel(request.kernel_ref.name)
+        num_programs = self._derive_num_programs(request)
+        kernel_args = self._unpack_kernel_args(request)
+
+        started_at = env.now
+        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
+
+        # A memory store on the context selects greenlet mode (ADR-0020)
+        store = getattr(self.ctx, "memory_store", None) if self.ctx else None
+
+        if store is None:
+            composite_results = yield from self._execute_legacy(
+                env, kernel_fn, kernel_args, num_programs, scheduler_id,
+            )
+        else:
+            composite_results = yield from self._execute_greenlet(
+                env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
+            )
+
+        # Record PE-internal execution time and per-engine totals
+        txn.result_data["pe_exec_ns"] = env.now - started_at
+        dma_sum = 0.0
+        compute_sum = 0.0
+        for result in composite_results:
+            dma_sum += result.get("dma_ns", 0.0)
+            compute_sum += result.get("compute_ns", 0.0)
+        txn.result_data["dma_ns"] = dma_sum
+        txn.result_data["compute_ns"] = compute_sum
+
+        # Send ResponseMsg on reverse path
+        yield from self._send_response(env, txn, request)
+
+    def _derive_num_programs(self, request: Any) -> int:
+        """Return the largest per-tensor count of shards on this (sip, cube), at least 1."""
+        largest = 1
+        for arg in request.args:
+            if arg.arg_kind != "tensor":
+                continue
+            local_shards = sum(
+                1 for s in arg.shards
+                if s.sip == self._sip_idx and s.cube == self._cube_idx
+            )
+            largest = max(largest, local_shards)
+        return largest
+
+    def _unpack_kernel_args(self, request: Any) -> list:
+        """Build the positional kernel argument list.
+
+        Tensor args contribute ``va_base`` when it is truthy, otherwise the
+        ``pa`` of the shard selected by ``_find_shard``. Scalar args contribute
+        their ``value``. Args of any other kind are skipped.
+        """
+        unpacked: list = []
+        for arg in request.args:
+            kind = arg.arg_kind
+            if kind == "tensor":
+                if arg.va_base:
+                    unpacked.append(arg.va_base)
+                else:
+                    unpacked.append(self._find_shard(arg.shards).pa)
+            elif kind == "scalar":
+                unpacked.append(arg.value)
+        return unpacked
+
+    def _execute_greenlet(
+        self, env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
+    ) -> Generator:
+        """Greenlet-based execution (ADR-0020 D3): kernel ↔ SimPy interleaved.
+
+        Returns the runner's ``_composite_results`` (empty list if absent).
+        """
+        from kernbench.triton_emu.kernel_runner import KernelRunner
+
+        runner_kwargs = dict(
+            pe_prefix=self._pe_prefix,
+            pe_idx=self._pe_idx,
+            sip_idx=self._sip_idx,
+            cube_idx=self._cube_idx,
+            scheduler_id=scheduler_id,
+            out_ports=self.out_ports,
+            store=store,
+        )
+        runner = KernelRunner(**runner_kwargs)
+        yield from runner.run(env, kernel_fn, kernel_args, num_programs)
+        return getattr(runner, "_composite_results", [])
+
+    def _execute_legacy(
+        self, env, kernel_fn, kernel_args, num_programs, scheduler_id,
+    ) -> Generator:
+        """Legacy Phase 0 + replay: generate command list, then dispatch.
+
+        Returns the ``result_data`` dicts of every CompositeCmd dispatched.
+        """
+        from kernbench.common.pe_commands import (
+            CompositeCmd, PeCpuOverheadCmd, PeInternalTxn, WaitCmd,
+        )
+        from kernbench.triton_emu.tl_context import TLContext, run_kernel
+
+        tl = TLContext(pe_id=self._pe_idx, num_programs=num_programs, dispatch_cycles=0)
+        run_kernel(kernel_fn, tl, *kernel_args)
+
+        # Completion events of composite commands not yet waited on, by handle id
+        pending: dict[str, simpy.Event] = {}
+        composite_results: list[dict] = []
+
+        for cmd in tl.commands:
+            if isinstance(cmd, PeCpuOverheadCmd):
+                yield env.timeout(cmd.cycles)
+                continue
+
+            if isinstance(cmd, WaitCmd):
+                if cmd.handle is None:
+                    # No handle: wait for everything still outstanding
+                    for evt in pending.values():
+                        yield evt
+                    pending.clear()
+                else:
+                    evt = pending.pop(cmd.handle.id, None)
+                    if evt:
+                        yield evt
+                continue
+
+            done_evt = env.event()
+            pe_txn = PeInternalTxn(
+                command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
+            )
+            if isinstance(cmd, CompositeCmd):
+                # Non-blocking: remember the completion event for a later wait
+                composite_results.append(pe_txn.result_data)
+                yield self.out_ports[scheduler_id].put(pe_txn)
+                pending[cmd.completion.id] = done_evt
+            else:
+                # Any other command blocks until the scheduler reports done
+                yield self.out_ports[scheduler_id].put(pe_txn)
+                yield done_evt
+
+        # Drain whatever was never explicitly waited on
+        for evt in pending.values():
+            yield evt
+        return composite_results
+
+    def _send_response(self, env, txn, request) -> Generator:
+        """Send a ResponseMsg along the reversed path of ``txn``.
+
+        If the reversed path has fewer than two nodes, ``txn.done`` is
+        triggered instead.
+        """
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) < 2:
+            txn.done.succeed()
+            return
+
+        from kernbench.runtime_api.kernel import ResponseMsg
+
+        resp_msg = ResponseMsg(
+            correlation_id=request.correlation_id,
+            request_id=request.request_id,
+            src_cube=self._cube_idx, src_pe=self._pe_idx,
+            success=True,
+        )
+        resp_txn = Transaction(
+            request=resp_msg, path=reverse_path, step=0,
+            nbytes=0, done=env.event(), is_response=True,
+        )
+        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeDmaComponent(PeEngineBase):
+    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.
+
+    Each channel has capacity=1 (ADR-0014 D4):
+      - DMA_READ and DMA_WRITE may execute concurrently.
+      - Multiple READs cannot overlap; multiple WRITEs cannot overlap.
+
+    Handles two message types:
+      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
+      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
+        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Channel resources are created in init_resources()
+        self._dma_read: simpy.Resource | None = None
+        self._dma_write: simpy.Resource | None = None
+        self._mmu = None  # PeMMU instance, set by engine wiring
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        """Create the READ and WRITE channel resources (capacity=1 each)."""
+        self._dma_read = simpy.Resource(env, capacity=1)
+        self._dma_write = simpy.Resource(env, capacity=1)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Zero-delay placeholder; ``nbytes`` is unused."""
+        yield env.timeout(0)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Handle PE-internal DMA command: resolve PA → HBM path → transfer.
+
+        DmaReadCmd uses the READ channel and ``src_addr``; DmaWriteCmd uses the
+        WRITE channel and ``dst_addr``. Any other command completes
+        immediately. ``pe_txn.done`` is triggered once the sub-transaction
+        reports completion.
+        """
+        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
+        from kernbench.policy.address.phyaddr import PhysAddr
+        from kernbench.runtime_api.kernel import PeDmaMsg
+
+        cmd = pe_txn.command
+        assert self._dma_read is not None and self._dma_write is not None
+
+        # Determine direction and target address (VA → PA via MMU)
+        if isinstance(cmd, DmaReadCmd):
+            dma_res = self._dma_read
+            raw_addr = cmd.src_addr
+            is_write = False
+        elif isinstance(cmd, DmaWriteCmd):
+            dma_res = self._dma_write
+            raw_addr = cmd.dst_addr
+            is_write = True
+        else:
+            pe_txn.done.succeed()
+            return
+
+        # Translate VA → PA via MMU (if available), then resolve HBM node
+        # If MMU has no mapping for this address (PageFault), treat as PA directly
+        # (backward-compatible with PA-only mode)
+        if self._mmu is not None:
+            from kernbench.policy.address.pe_mmu import PageFault
+            try:
+                target_pa = self._mmu.translate(raw_addr)
+                # Translation overhead is only charged on a successful lookup
+                if self._mmu.overhead_ns > 0:
+                    yield env.timeout(self._mmu.overhead_ns)
+            except PageFault:
+                target_pa = raw_addr
+        else:
+            target_pa = raw_addr  # fallback: treat as PA directly
+        pa = PhysAddr.decode(target_pa)
+        dst_node = self.ctx.resolver.resolve(pa)
+        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
+        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
+
+        # Acquire DMA channel (command issue serialization)
+        with dma_res.request() as req:
+            yield req
+            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
+            sub_done = env.event()
+            # NOTE(review): src_sip/src_cube/src_pe are hard-coded to 0 rather
+            # than this PE's own indices — confirm no consumer relies on them.
+            sub_request = PeDmaMsg(
+                correlation_id="pe_internal",
+                request_id=f"dma_{id(pe_txn)}",
+                src_sip=0, src_cube=0, src_pe=0,
+                dst_pa=target_pa, nbytes=cmd.nbytes,
+                is_write=is_write,
+            )
+            sub_txn = Transaction(
+                request=sub_request, path=path, step=0,
+                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
+            )
+            # Send to next hop (path[0] is pe_dma itself, path[1] is router)
+            if len(path) > 1:
+                yield self.out_ports[path[1]].put(sub_txn.advance())
+            else:
+                # Degenerate route: nothing is sent, so nobody would ever
+                # trigger sub_done. Complete it here instead of blocking the
+                # command (and the kernel waiting on it) forever.
+                sub_done.succeed()
+        # DMA channel released after issue
+
+        # Wait for HBM transfer completion
+        yield sub_done
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.
+
+        Responses are passed on (or completed, if this is their last hop)
+        without taking a channel. Forward transactions hold the selected
+        channel while they are forwarded or, at the terminal hop, while their
+        ``drain_ns`` elapses.
+        """
+        # Response transactions bypass DMA channel (no outbound resource needed)
+        if getattr(txn, "is_response", False):
+            next_hop = txn.next_hop
+            if next_hop:
+                yield self.out_ports[next_hop].put(txn.advance())
+            else:
+                txn.done.succeed()
+            return
+
+        dma_res = self._select_channel(txn)
+        with dma_res.request() as req:
+            yield req
+            next_hop = txn.next_hop
+            if next_hop:
+                yield self.out_ports[next_hop].put(txn.advance())
+            else:
+                drain = getattr(txn, "drain_ns", 0.0)
+                if drain > 0:
+                    yield env.timeout(drain)
+                txn.done.succeed()
+
+    def _select_channel(self, txn: Any) -> simpy.Resource:
+        """Select DMA channel based on request type: WRITE for MemoryWriteMsg, READ otherwise."""
+        from kernbench.runtime_api.kernel import MemoryWriteMsg
+
+        assert self._dma_read is not None and self._dma_write is not None
+        if isinstance(txn.request, MemoryWriteMsg):
+            return self._dma_write
+        return self._dma_read
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+# Bit width of each recognised dtype name. PeGemmComponent._compute_ns scales
+# the f16 peak TFLOPS by 16 / bits; names not listed here are treated as 16-bit.
+_DTYPE_BITS: dict[str, int] = {
+    "f16": 16,
+    "fp16": 16,
+    "float16": 16,
+    "bf16": 16,
+    "f32": 32,
+    "fp32": 32,
+    "float32": 32,
+    "i8": 8,
+    "int8": 8,
+    "i16": 16,
+    "int16": 16,
+    "i32": 32,
+    "int32": 32,
+}
+
+
+class PeGemmComponent(PeEngineBase):
+    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
+
+    When the node declares a ``shared_resource`` attribute, that resource is
+    looked up per PE and held for the duration of every command / forwarded
+    transaction, making PE_GEMM mutually exclusive with the other engine
+    (PE_MATH) that shares it within the same PE.
+
+    Compute latency model:
+      FLOPs = 2 * M * K * N
+      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
+      compute_ns = FLOPs / (effective_tflops * 1e3)
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Shared accelerator slot; stays None when no shared_resource is configured
+        self._accel: simpy.Resource | None = None
+        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        """Look up the shared accelerator resource named by ``shared_resource``."""
+        resource_name = self.node.attrs.get("shared_resource")
+        if not (resource_name and self.ctx):
+            return
+        shared_key = f"{self._pe_prefix}.{resource_name}"
+        self._accel = self.ctx.get_shared_resource(env, shared_key)
+
+    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
+        """Compute GEMM latency in nanoseconds.
+
+        Falls back to the node's ``overhead_ns`` when no positive peak TFLOPS
+        is configured. Unknown dtype names are treated as 16-bit.
+        """
+        peak = self._peak_tflops_f16
+        if peak <= 0:
+            return float(self.node.attrs.get("overhead_ns", 0.0))
+        bits = _DTYPE_BITS.get(dtype, 16)
+        effective_tflops = peak * (16.0 / bits)
+        total_flops = 2.0 * m * k * n
+        # 1 TFLOPS == 1e3 FLOP per nanosecond
+        return total_flops / (effective_tflops * 1e3)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Wait ``overhead_ns`` from the node attrs (default 0.0); ``nbytes`` is unused."""
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Execute one PE-internal command, holding the accel slot if there is one.
+
+        A GemmCmd takes ``_compute_ns`` of its (m, k, n, a.dtype); any other
+        command takes the plain ``run`` overhead. ``pe_txn.done`` is triggered
+        afterwards.
+        """
+        from kernbench.common.pe_commands import GemmCmd
+
+        cmd = pe_txn.command
+
+        def busy() -> Generator:
+            # The time the engine is occupied by this command
+            if isinstance(cmd, GemmCmd):
+                ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype)
+                yield env.timeout(ns)
+            else:
+                yield from self.run(env, 0)
+
+        if not self._accel:
+            yield from busy()
+        else:
+            with self._accel.request() as req:
+                yield req
+                yield from busy()
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Transaction forwarding with accel_slot acquisition."""
+        if not self._accel:
+            yield from super()._forward_txn(env, txn)
+            return
+        with self._accel.request() as req:
+            yield req
+            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeMathComponent(PeEngineBase):
+    """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).
+
+    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
+    exclusive with PE_GEMM within the same PE.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._accel: simpy.Resource | None = None
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        resource_name = self.node.attrs.get("shared_resource")
+        if resource_name and self.ctx:
+            self._accel = self.ctx.get_shared_resource(
+                env, f"{self._pe_prefix}.{resource_name}"
+            )
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        if self._accel:
+            with self._accel.request() as req:
+                yield req
+                yield from self.run(env, 0)
+        else:
+            yield from self.run(env, 0)
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Transaction forwarding with accel_slot acquisition."""
+        if self._accel:
+            with self._accel.request() as req:
+                yield req
+                yield from super()._forward_txn(env, txn)
+        else:
+            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,66 @@
+"""PE_MMU component: address translation unit.
+
+Component role: receives MmuMapMsg/MmuUnmapMsg via inbox (independent of PE_CPU).
+Utility role: PE_DMA/PE_GEMM call mmu.translate() directly (no SimPy overhead).
+"""
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase, ComponentRegistry
+from kernbench.policy.address.pe_mmu import PeMMU
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeMmuComponent(ComponentBase):
+    """PE_MMU: per-PE virtual-to-physical address translation.
+
+    Receives MmuMapMsg/MmuUnmapMsg via inbox and updates the internal
+    page table. PE_DMA and PE_GEMM access the underlying PeMMU object
+    via the ``mmu`` property for synchronous VA→PA translation.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        page_size = int(node.attrs.get("page_size", 2 * 1024 * 1024))
+        overhead_ns = float(node.attrs.get("tlb_overhead_ns", 0.0))
+        self._mmu = PeMMU(page_size=page_size, overhead_ns=overhead_ns)
+
+    @property
+    def mmu(self) -> PeMMU:
+        """The underlying PeMMU utility object for direct translate() calls."""
+        return self._mmu
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        yield env.timeout(0)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Process MmuMapMsg/MmuUnmapMsg from inbox."""
+        from kernbench.runtime_api.kernel import MmuMapMsg, MmuUnmapMsg
+
+        while True:
+            txn: Any = yield self._inbox.get()
+
+            if hasattr(txn, "request"):
+                request = txn.request
+                if isinstance(request, MmuMapMsg):
+                    for entry in request.entries:
+                        self._mmu.map(
+                            va=entry["va"], pa=entry["pa"], size=entry["size"],
+                        )
+                    txn.done.succeed()
+                elif isinstance(request, MmuUnmapMsg):
+                    for entry in request.entries:
+                        self._mmu.unmap(va=entry["va"], size=entry["size"])
+                    txn.done.succeed()
+                else:
+                    # Forward non-MMU transactions normally
+                    yield from self._forward_txn(env, txn)
+            else:
+                yield from self._forward_txn(env, txn)
@@ -0,0 +1,245 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeSchedulerComponent(ComponentBase):
+    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
+
+    Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
+      - DmaReadCmd / DmaWriteCmd → PE_DMA
+      - GemmCmd                  → PE_GEMM
+      - MathCmd                  → PE_MATH
+      - CompositeCmd             → tiled pipeline (Stage 3: ADR-0014 D3.2)
+
+    Composite GEMM pipeline (32x64x32 tiles):
+      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
+      with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
+
+    Applies scheduler overhead_ns before dispatching each command.
+    Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
+    """
+
+    # Scheduler tile dimensions (ADR-0014 D3.2)
+    # NOTE(review): only TILE_K and TILE_N are read in this class; TILE_M is
+    # declared but the GEMM pipeline always uses the full M extent.
+    TILE_M = 32
+    TILE_K = 64
+    TILE_N = 32
+
+    # Command → engine suffix dispatch table.
+    # New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
+    # Starts empty and is filled on first instantiation by
+    # _ensure_dispatch_table().
+    _CMD_DISPATCH: dict[type, str] = {}
+
+    @classmethod
+    def _ensure_dispatch_table(cls) -> None:
+        """Populate ``_CMD_DISPATCH`` once; later calls return immediately."""
+        if cls._CMD_DISPATCH:
+            return
+        # Imported here rather than at module level — presumably to avoid an
+        # import cycle; confirm before hoisting.
+        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
+
+        cls._CMD_DISPATCH = {
+            DmaReadCmd: "pe_dma",
+            DmaWriteCmd: "pe_dma",
+            GemmCmd: "pe_gemm",
+            MathCmd: "pe_math",
+        }
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Everything before the last "." of the node id; used as the prefix
+        # of the sibling engine port names ("<prefix>.pe_dma", ...).
+        self._pe_prefix = node.id.rsplit(".", 1)[0]
+        self._ensure_dispatch_table()
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """Wait for the node's ``overhead_ns`` attribute (0.0 when unset).
+
+        ``nbytes`` is not used by this delay.
+        """
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Serve the inbox forever.
+
+        Each PeInternalTxn is dispatched in its own SimPy process, so the
+        worker returns to the inbox without waiting for that dispatch to
+        finish. Any other message is forwarded inline, which does block the
+        worker until ``_forward_txn`` completes.
+        """
+        from kernbench.common.pe_commands import PeInternalTxn
+
+        while True:
+            msg: Any = yield self._inbox.get()
+            if isinstance(msg, PeInternalTxn):
+                env.process(self._dispatch(env, msg))
+            else:
+                yield from self._forward_txn(env, msg)
+
+    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Route a PeInternalTxn to the correct engine via dispatch table."""
+        from kernbench.common.pe_commands import CompositeCmd
+
+        # Scheduler overhead
+        yield from self.run(env, 0)
+
+        cmd = pe_txn.command
+
+        # Check dispatch table first
+        # The lookup is by exact type, so a subclass of a listed command is
+        # not matched here. For table hits the engine is expected to signal
+        # pe_txn.done; this method only enqueues the transaction.
+        engine_suffix = self._CMD_DISPATCH.get(type(cmd))
+        if engine_suffix is not None:
+            yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
+            return
+
+        # CompositeCmd: tiled pipeline (not a simple forward)
+        if isinstance(cmd, CompositeCmd):
+            yield from self._dispatch_composite(env, pe_txn)
+            return
+
+        # Unknown command — signal done immediately
+        pe_txn.done.succeed()
+
+    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Composite tiled pipeline (ADR-0014 D3.2).
+
+        GEMM: 3-stage pipeline with b-tile streaming from HBM.
+        MATH: sequential compute + DMA_WRITE (no tiling).
+        """
+        from kernbench.common.pe_commands import CompositeCmd
+
+        cmd = pe_txn.command
+        assert isinstance(cmd, CompositeCmd)
+        # A "gemm" op without a b operand falls through to the math path.
+        if cmd.op == "gemm" and cmd.b is not None:
+            yield from self._pipeline_gemm(env, pe_txn, cmd)
+        else:
+            yield from self._pipeline_math(env, pe_txn, cmd)
+
+    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
+        """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.
+
+        Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
+        Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)
+        Overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
+
+        On completion, stores the accumulated ``dma_ns`` and ``compute_ns``
+        in ``pe_txn.result_data`` and signals ``pe_txn.done``.
+
+        NOTE(review): as written, every iteration waits on its own
+        ``compute_done`` before the loop advances, so the read for tile t+1
+        is only issued after COMPUTE(t) has finished. The only work left in
+        flight across iterations is WRITE(t-1), which overlaps READ(t) and
+        the start of COMPUTE(t). Confirm whether the READ(t+1) || COMPUTE(t)
+        overlap described above is intended.
+        """
+        from kernbench.common.pe_commands import (
+            DmaReadCmd,
+            DmaWriteCmd,
+            GemmCmd,
+            PeInternalTxn as PeTxn,
+            TensorHandle,
+        )
+
+        pp = self._pe_prefix
+        a = cmd.a   # already in TCM
+        b = cmd.b   # HBM reference (via tl.ref)
+
+        M, K_a = a.shape[-2], a.shape[-1]
+        K_b, N = b.shape[-2], b.shape[-1]
+        dtype = a.dtype
+        # Element size derived from b's byte count; falls back to 2 bytes
+        # when b has no elements.
+        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2
+
+        # Tile counts
+        # The K extent is taken from a (K_a), the N extent from b; at least
+        # one tile is always processed in each dimension.
+        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
+        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
+        n_tiles = n_tiles_k * n_tiles_n
+
+        prev_compute_done = None
+        prev_write_done = None
+        total_dma_ns = 0.0
+        total_compute_ns = 0.0
+
+        for tile_idx in range(n_tiles):
+            # Tiles are visited with the N index varying fastest.
+            tk = tile_idx // n_tiles_n
+            tn = tile_idx % n_tiles_n
+
+            k_start = tk * self.TILE_K
+            n_start = tn * self.TILE_N
+            # Edge tiles are clipped to the remaining extent.
+            tile_k = min(self.TILE_K, K_a - k_start)
+            tile_n = min(self.TILE_N, N - n_start)
+            tile_nbytes = tile_k * tile_n * dtype_bytes
+
+            # --- Stage 1: DMA_READ b_tile from HBM ---
+            read_done = env.event()
+            # Byte offset of element (k_start, n_start) — assumes b is laid
+            # out row-major with row length N; TODO confirm.
+            b_tile_addr = b.addr + (k_start * N + n_start) * dtype_bytes
+            b_tile_handle = TensorHandle(
+                id=f"b_tile_{tile_idx}", addr=b_tile_addr,
+                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
+            )
+            read_cmd = DmaReadCmd(handle=b_tile_handle, src_addr=b_tile_addr, nbytes=tile_nbytes)
+            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
+            t0 = env.now
+            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)
+
+            # Wait for previous compute before starting this tile's compute
+            # (The previous iteration already waited on this same event, so
+            # by the time it is yielded here it has fired.)
+            if prev_compute_done is not None:
+                yield prev_compute_done
+
+            # Wait for this tile's DMA_READ
+            yield read_done
+            # Read time is measured from just before the put to completion.
+            total_dma_ns += env.now - t0
+
+            # --- Stage 2: COMPUTE (GEMM) ---
+            compute_done = env.event()
+            # The output handle spans all M rows of this N-tile; addr is 0.
+            out_handle = TensorHandle(
+                id=f"out_tile_{tile_idx}", addr=0,
+                shape=(M, tile_n), dtype=dtype,
+                nbytes=M * tile_n * dtype_bytes,
+            )
+            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
+                                  m=M, k=tile_k, n=tile_n)
+            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
+            t0 = env.now
+            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)
+
+            # Wait for previous write (DMA_WRITE serialization)
+            if prev_write_done is not None:
+                yield prev_write_done
+
+            # Wait for compute of THIS tile
+            yield compute_done
+            # This interval starts before the wait on the previous write, so
+            # any time spent blocked on that write is counted as compute.
+            total_compute_ns += env.now - t0
+            prev_compute_done = compute_done
+
+            # --- Stage 3: DMA_WRITE out_tile to HBM ---
+            write_done = env.event()
+            # NOTE(review): the destination depends only on n_start, so tiles
+            # that share an N index but differ in K target the same address.
+            # Confirm this models accumulation over K as intended.
+            out_tile_pa = cmd.out_addr + n_start * dtype_bytes
+            write_nbytes = M * tile_n * dtype_bytes
+            write_cmd = DmaWriteCmd(handle=out_handle, dst_addr=out_tile_pa, nbytes=write_nbytes)
+            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
+            # NOTE(review): this t0 is never read — it is reassigned at the
+            # top of the next iteration or just before the final wait below.
+            t0 = env.now
+            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
+            # The write is left in flight; the next iteration (or the final
+            # wait below) synchronises on it.
+            prev_write_done = write_done
+
+        # Wait for final write
+        # Only this residual wait on the last write is added to dma_ns; the
+        # writes of earlier tiles are not timed separately.
+        if prev_write_done is not None:
+            t0 = env.now
+            yield prev_write_done
+            total_dma_ns += env.now - t0
+
+        pe_txn.result_data["dma_ns"] = total_dma_ns
+        pe_txn.result_data["compute_ns"] = total_compute_ns
+        pe_txn.done.succeed()
+
+    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
+        """Non-GEMM composite: sequential compute + DMA_WRITE (no tiling).
+
+        Unlike the GEMM pipeline, nothing is written to
+        ``pe_txn.result_data`` here.
+        """
+        from kernbench.common.pe_commands import (
+            DmaWriteCmd,
+            MathCmd,
+            PeInternalTxn as PeTxn,
+        )
+
+        pp = self._pe_prefix
+
+        # Step 1: Compute (MATH)
+        compute_done = env.event()
+        # The op defaults to "identity" when cmd.math_op is falsy; cmd.a is
+        # used both as the sole input and as the output handle.
+        compute_cmd = MathCmd(
+            op=cmd.math_op or "identity",
+            inputs=(cmd.a,), out=cmd.a,
+        )
+        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
+        yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
+        yield compute_done
+
+        # Step 2: DMA_WRITE result to HBM
+        write_done = env.event()
+        write_cmd = DmaWriteCmd(handle=cmd.a, dst_addr=cmd.out_addr, nbytes=cmd.out_nbytes)
+        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
+        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
+        yield write_done
+
+        pe_txn.done.succeed()
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeTcmComponent(ComponentBase):
+    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.
+
+    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
+    Phase 0: applies overhead_ns and drain_ns at terminal.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class SramComponent(ComponentBase):
+    """Cube SRAM: terminal component that models SRAM access latency.
+
+    Applies overhead_ns processing overhead (from node.attrs).
+    On completion, sends a ResponseMsg back on the reverse path.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Terminal worker: process, apply drain, send response."""
+        while True:
+            txn: Any = yield self._inbox.get()
+            yield from self.run(env, txn.nbytes)
+            drain = getattr(txn, "drain_ns", 0.0)
+            if drain > 0:
+                yield env.timeout(drain)
+            yield from self._send_response(env, txn)
+
+    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Create ResponseMsg and send on reverse path."""
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) >= 2 and self.ctx:
+            from kernbench.runtime_api.kernel import ResponseMsg
+
+            parts = self.node.id.split(".")
+            cube_id = int(parts[1].replace("cube", ""))
+            resp_msg = ResponseMsg(
+                correlation_id=txn.request.correlation_id,
+                request_id=txn.request.request_id,
+                src_cube=cube_id, src_pe=-1, success=True,
+            )
+            resp_txn = Transaction(
+                request=resp_msg, path=reverse_path, step=0,
+                nbytes=0, done=env.event(), is_response=True,
+            )
+            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+        else:
+            txn.done.succeed()
--- a/Show More
+++ b/Show More