Intercube allreduce: pe0 cube-mesh reduce + multi-SIP ring/torus/mesh

New intercube allreduce kernel replacing the old flat ring algorithms. Reduces across the 4x4 cube mesh within each SIP (pe0-only, same-lane), then inter-SIP exchange on root cube, then broadcast back. Supports ring_1d, torus_2d, and mesh_2d_no_wrap SIP topologies driven by topology.yaml. Integrated with dist.init_process_group / dist.all_reduce. New files: - src/kernbench/ccl/algorithms/intercube_allreduce.py (kernel) - src/kernbench/ccl/sfr_config.py (configure_sfr_intercube_multisip) - tests/test_allreduce_multidevice.py (config-driven, 3 topologies) - tests/test_distributed_intercube_allreduce.py (full distributed path) - tests/test_intercube_sfr_config.py (SFR wiring verification) Modified: - distributed.py: AhbmCCLBackend uses configure_sfr_intercube_multisip - topologies.py: added torus_2d, mesh_2d_no_wrap - install.py: global_E/W/N/S in _OPPOSITE_DIR - topology.yaml: added system.sips.topology - ccl.yaml: single intercube_allreduce algorithm - benches/ccl_allreduce.py: row_wise cube-mesh tensor layout Removed old flat-ring algorithms and their tests. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Refactor ccl_allreduce bench: rank=SIP only, remove rank=PE legacy path
2026-04-16 17:33:42 -07:00 · 2026-04-14 16:45:27 -07:00 · 2026-04-14 16:31:13 -07:00 · 2026-04-14 14:13:26 -07:00 · 2026-04-14 13:02:19 -07:00 · 2026-04-14 12:46:33 -07:00
142 changed files with 22423 additions and 657 deletions
@@ -29,3 +29,4 @@ build/
 # Logs
 *.log
 .claude/
@@ -0,0 +1,103 @@
 """CCL all-reduce bench (ADR-0024 + ADR-0027).
 Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
 tensor sharded row-wise across the cube mesh (pe0 per cube). After
 ``dist.all_reduce(op="sum")`` every cube on every rank must hold
 ``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
 Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
 (SIP count → world_size, cube_mesh → N_CUBES).
 """
 from __future__ import annotations
 from dataclasses import dataclass
 import numpy as np
 from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
 from kernbench.policy.placement.dp import DPPolicy
 DEFAULT_N_ELEM = 8
@dataclass(frozen=True)
 class _BenchCfg:
    """Immutable bench settings produced by ``_resolve_cfg``."""
    # Algorithm name read from the merged ccl.yaml config (``"algorithm"`` key).
    algorithm: str
    # Number of elements per cube row (``n_elem`` in ccl.yaml, else DEFAULT_N_ELEM).
    n_elem: int
    # Cube count per SIP: ``cube_mesh.w * cube_mesh.h`` from the topology spec.
    n_cubes: int
    # Distributed world size; ``_resolve_cfg`` requires it to equal the SIP count.
    world_size: int
 def _resolve_cfg(torch) -> _BenchCfg:
    """Assemble the bench configuration from ccl.yaml and the topology spec.

    The merged CCL config supplies ``algorithm`` and ``n_elem``; the
    topology spec (``torch.spec``) supplies the SIP count and cube mesh.

    Raises:
        RuntimeError: if the distributed world size differs from the SIP
            count declared under ``system.sips.count`` (default 1).
    """
    ccl_cfg = resolve_algorithm_config(load_ccl_config())
    world_size = torch.distributed.get_world_size()

    topo = torch.spec or {}
    sips_section = topo.get("system", {}).get("sips", {})
    sip_count = int(sips_section.get("count", 1))
    if world_size != sip_count:
        raise RuntimeError(
            f"ccl_allreduce bench requires world_size == topology SIP count "
            f"(world_size={world_size}, n_sips={sip_count})."
        )

    # Cube mesh dimensions default to 4 x 4 when the spec omits them.
    mesh = topo.get("sip", {}).get("cube_mesh", {})
    mesh_w = int(mesh.get("w", 4))
    mesh_h = int(mesh.get("h", 4))

    algo_name = ccl_cfg["algorithm"]
    elems = int(ccl_cfg.get("n_elem", DEFAULT_N_ELEM))
    return _BenchCfg(
        algorithm=algo_name,
        n_elem=elems,
        n_cubes=mesh_w * mesh_h,
        world_size=world_size,
    )
 def _rank_dp(n_cubes: int) -> DPPolicy:
    """Build the placement policy for a rank tensor.

    Cubes use ``row_wise`` placement over ``n_cubes`` cubes; the PE level
    is ``replicate`` with a single PE per cube.
    """
    policy_kwargs = {
        "cube": "row_wise",
        "pe": "replicate",
        "num_cubes": n_cubes,
        "num_pes": 1,
    }
    return DPPolicy(**policy_kwargs)
 def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
    """Return a zero-filled ``(n_cubes, n_elem)`` f16 tensor named after ``rank``."""
    shape = (cfg.n_cubes, cfg.n_elem)
    policy = _rank_dp(cfg.n_cubes)
    label = f"ccl_in_r{rank}"
    return torch.zeros(shape, dtype="f16", dp=policy, name=label)
 def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
    """Overwrite ``tensor`` so every element equals ``rank + 1``.

    A host-side float16 array of shape ``(n_cubes, n_elem)`` is built and
    copied into ``tensor`` via ``torch.from_numpy``.
    """
    host = np.empty((cfg.n_cubes, cfg.n_elem), dtype=np.float16)
    host.fill(float(rank + 1))
    tensor.copy_(torch.from_numpy(host))
 def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
    """Print a single pass/fail line for the all-reduce result.

    Each of the first ``cfg.n_cubes`` rows of ``result`` is compared
    (rtol = atol = 0.1) against ``n_cubes * (1 + 2 + ... + world_size)``.
    On failure the mean of the whole result is shown next to the
    expected value.
    """
    expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))
    # all() stops at the first mismatching cube row.
    all_match = all(
        np.allclose(result[idx], expected, rtol=1e-1, atol=1e-1)
        for idx in range(cfg.n_cubes)
    )
    if not all_match:
        mean_val = float(result.reshape(-1).mean())
        print(
            f"  [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
            f"got mean={mean_val:.3f}, expected={expected:.3f}"
        )
        return
    cube_total = cfg.world_size * cfg.n_cubes
    print(f"  {cfg.algorithm} (ws={cfg.world_size}): {cube_total} OK")
 def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
    """Per-rank body: select device, fill the tensor, all-reduce, report.

    Only rank 0 prints the pass/fail line.
    """
    torch.ahbm.set_device(rank)
    rank_tensor = _allocate_rank_tensor(torch, rank, cfg)
    _init_with_rank_value(torch, rank_tensor, rank, cfg)
    torch.distributed.all_reduce(rank_tensor, op="sum")
    if rank != 0:
        return
    _report(rank_tensor.numpy(), cfg)
 def run(torch) -> None:
    """Bench entry point: init the ``ahbm`` process group and spawn one worker per rank."""
    torch.distributed.init_process_group(backend="ahbm")
    # The config is resolved after group init because it reads the world size.
    bench_cfg = _resolve_cfg(torch)
    worker_args = (bench_cfg, torch)
    torch.multiprocessing.spawn(
        _worker, args=worker_args, nprocs=bench_cfg.world_size,
    )
@@ -3,7 +3,7 @@
 Full host-to-PE pipeline:
  Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
-Single PE: num_sips=1, num_cubes=1, num_pes=1 via DPPolicy override.
+Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
 Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
 per-tile DMA internally.
@@ -30,7 +30,7 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
 def run(torch):
    """Run the single-PE GEMM benchmark."""
    dp = DPPolicy(cube="replicate", pe="replicate",
-                  num_sips=1, num_cubes=1, num_pes=1)
+                  num_cubes=1, num_pes=1)
    a   = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
    b   = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
@@ -72,12 +72,16 @@ def run(torch):
    K = GPT3_D_MODEL
    N = COLS_PER_PE
-    # X: replicated across all PEs
+    # ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
    # ADR-0024 launcher calls this bench once per SIP (each worker via
    # torch.ahbm.set_device(rank)); here the policy describes only the
    # cube × PE layout within a single SIP.
    # X: replicated across all PEs within the SIP
    dp_replicate = DPPolicy(cube="replicate", pe="replicate",
-                            num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
+                            num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
-    # W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs
+    # W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
    dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
-                          num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
+                          num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
    x     = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
    wq    = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
@@ -9,29 +9,32 @@ from kernbench.runtime_api.context import RuntimeContext
 BenchFn = Callable[[RuntimeContext], Any]
 def _load_module(bench_id: str):
    """Import and return the ``benches.<bench_id>`` module.

    Args:
        bench_id: Bench name; surrounding whitespace is ignored.

    Returns:
        The imported module object.

    Raises:
        ValueError: if ``bench_id`` is empty, if no ``benches.<bench_id>``
            module exists, or if that module exists but one of its own
            imports cannot be found.
    """
    bench_id = bench_id.strip()
    if not bench_id:
        raise ValueError("Bench id is empty.")
    module_path = f"benches.{bench_id}"
    try:
        return importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        missing = e.name
        # ``e.name`` is the module that could not be found. Only when it is
        # the bench module itself (or one of its parent packages) is the
        # bench really unknown; otherwise the bench exists and one of its
        # dependencies is missing, which deserves a different message.
        bench_is_missing = (
            missing is None
            or missing == module_path
            or module_path.startswith(f"{missing}.")
        )
        if bench_is_missing:
            raise ValueError(
                f"Unknown bench '{bench_id}'. Expected module {module_path}.py"
            ) from e
        raise ValueError(
            f"Bench '{bench_id}' ({module_path}) failed to import: "
            f"missing dependency '{missing}'."
        ) from e
 def resolve_bench(bench_id: str) -> BenchFn:
-    """
+    """Resolve a bench id into its ``run(torch)`` callable.
    Resolve a bench id into a callable bench function.
    Expected layout (repo root):
        benches/<bench_id>.py
            def run(torch: RuntimeContext) -> Any
    """
-    bench_id = bench_id.strip()
+    mod = _load_module(bench_id)
    if not bench_id:
        raise ValueError("Bench id is empty.")
    module_path = f"benches.{bench_id}"
    try:
        mod = importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e
    run_fn = getattr(mod, "run", None)
    if run_fn is None:
-        raise ValueError(f"Bench module {module_path} must define a 'run(torch)' function.")
+        raise ValueError(
            f"Bench module benches.{bench_id} must define 'run(torch)'."
        )
    if not callable(run_fn):
-        raise ValueError(f"'run' in {module_path} is not callable.")
+        raise ValueError(f"'run' in benches.{bench_id} is not callable.")
    return run_fn
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy
 # GEMM dimensions: (M, K) x (K, N) → (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) for fast regression. The test verifies the full
 # host→PE pipeline, not large-matrix throughput.
 M, K, N = 32, 64, 32
 DTYPE = "f16"
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
 from kernbench.policy.placement.dp import DPPolicy
 # GEMM dimensions: (M, K) x (K, N) -> (M, N)
-M, K, N = 128, 256, 128
+# Small dims (1 tile) for fast regression. The test verifies the multi-PE
 # fan-out pipeline, not large-matrix throughput.
 M, K, N = 32, 64, 32
 DTYPE = "f16"
@@ -1,7 +1,7 @@
 """VA offset verification benchmark.
 Verifies that Triton-style base_ptr + pid * stride addressing works correctly
-with full TP sharding (sip/cube/pe all column_wise). Each PE loads its own
+with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
 block from a sharded tensor and stores it back.
 The kernel uses standard Triton patterns:
@@ -28,7 +28,7 @@ def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):
 def run(torch):
    """Run the VA offset verification benchmark with intra-SIP TP sharding."""
-    dp = DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
+    dp = DPPolicy(cube="column_wise", pe="column_wise")
    src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
    dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
@@ -0,0 +1,45 @@
 # ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
 #
 # Loaded by AhbmCCLBackend at init_process_group time.
 # defaults.algorithm chooses which kernel + topology is installed
 # into PE_IPCQ neighbor tables. Host code is unaware of these settings.
 defaults:
  # Algorithm to run for this benchmark execution.
  algorithm: intercube_allreduce
  # IPCQ ring buffer location.
  #   tcm  — PE-local TCM (fast, small, conflicts with compute TCM access)
  #   hbm  — PE-local HBM (large, slower DMA latency)
  #   sram — Cube-shared SRAM (medium, cube-internal contention)
  buffer_kind: tcm
  # Backpressure mode.
  #   poll  — spin-loop polling of cached peer pointers
  #   sleep — yield SimPy event, wake on credit return
  backpressure: sleep
  # Ring depth: number of slots per (direction, tx|rx) buffer.
  n_slots: 4
  # Slot size in bytes (must hold one tile's worth of data).
  slot_size: 4096
  # PE_DMA virtual channel chunk size (D8).
  vc_chunk_size: 256
  # Credit return fast path message size (D9).
  ipcq_credit_size_bytes: 16
 algorithms:
  # ── intercube all-reduce (pe0-only, cube mesh + inter-SIP) ──
  # Reduces across the 4×4 cube mesh within each SIP, then inter-SIP
  # exchange on root cube, then broadcast back. SIP topology is read
  # from topology.yaml → system.sips.topology. Kernel auto-selects
  # ring / torus / mesh inter-SIP exchange pattern.
  intercube_allreduce:
    module: kernbench.ccl.algorithms.intercube_allreduce
    topology: none
    buffer_kind: tcm
    n_elem: 8
    root_cube: 15
@@ -2,6 +2,10 @@
 # Maps impl names (used in topology.yaml) to Python class paths.
 # Format: impl_name: module.path:ClassName
 #
 # Naming convention:
 #   builtin.<name>  — built-in implementations
 #   custom.<name>   — user-defined implementations
 #
 # ── Adding custom components ──────────────────────────────────────────
 #
 # 1. Create your implementation in:
@@ -10,41 +14,43 @@
 #    Your class must inherit from ComponentBase (or PeEngineBase for PE engines).
 #
 # 2. Register it below under "Custom" with a unique impl name:
-#      my_pe_cpu_v2: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
+#      custom.my_pe_cpu: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
 #
 # 3. Reference it in topology.yaml:
-#      pe_cpu: { kind: pe_cpu, impl: my_pe_cpu_v2, attrs: { ... } }
+#      pe_cpu: { kind: pe_cpu, impl: custom.my_pe_cpu, attrs: { ... } }
 #
 # 4. Add unit tests in:
 #      tests/custom/test_<your_component>.py
 #
 # External packages also work — use the full module path:
-#      fast_gemm_v1: my_team.accel.fast_gemm:FastGemmComponent
+#      custom.fast_gemm: my_team.accel.fast_gemm:FastGemmComponent
 # ──────────────────────────────────────────────────────────────────────
 components:
  # Infrastructure
-  forwarding_v1:  kernbench.components.builtin.forwarding:TransitComponent
+  builtin.forwarding:  kernbench.components.builtin.forwarding:TransitComponent
-  switch_v1:      kernbench.components.builtin.forwarding:TransitComponent
+  builtin.switch:      kernbench.components.builtin.forwarding:TransitComponent
-  noc_v1:         kernbench.components.builtin.forwarding:TransitComponent
+  builtin.noc:         kernbench.components.builtin.forwarding:TransitComponent
-  ucie_v1:        kernbench.components.builtin.forwarding:TransitComponent
+  builtin.ucie:        kernbench.components.builtin.forwarding:TransitComponent
  # IO / Host interface
-  pcie_ep_v1:     kernbench.components.builtin.pcie_ep:PcieEpComponent
+  builtin.pcie_ep:     kernbench.components.builtin.pcie_ep:PcieEpComponent
-  io_cpu_v1:      kernbench.components.builtin.io_cpu:IoCpuComponent
+  builtin.io_cpu:      kernbench.components.builtin.io_cpu:IoCpuComponent
  # Cube-level
-  m_cpu_v1:       kernbench.components.builtin.m_cpu:MCpuComponent
+  builtin.m_cpu:       kernbench.components.builtin.m_cpu:MCpuComponent
-  hbm_ctrl_v1:    kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
+  builtin.hbm_ctrl:    kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
-  sram_v1:        kernbench.components.builtin.sram:SramComponent
+  builtin.sram:        kernbench.components.builtin.sram:SramComponent
  # PE-level
-  pe_cpu_v1:       kernbench.components.builtin.pe_cpu:PeCpuComponent
+  builtin.pe_cpu:         kernbench.components.builtin.pe_cpu:PeCpuComponent
-  pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
+  builtin.pe_scheduler:   kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
-  pe_dma_v1:       kernbench.components.builtin.pe_dma:PeDmaComponent
+  builtin.pe_dma:         kernbench.components.builtin.pe_dma:PeDmaComponent
-  pe_gemm_v1:      kernbench.components.builtin.pe_gemm:PeGemmComponent
+  builtin.pe_gemm:        kernbench.components.builtin.pe_gemm:PeGemmComponent
-  pe_math_v1:      kernbench.components.builtin.pe_math:PeMathComponent
+  builtin.pe_math:        kernbench.components.builtin.pe_math:PeMathComponent
-  pe_mmu_v1:       kernbench.components.builtin.pe_mmu:PeMmuComponent
+  builtin.pe_fetch_store: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
-  pe_tcm_v1:       kernbench.components.builtin.pe_tcm:PeTcmComponent
+  builtin.pe_mmu:         kernbench.components.builtin.pe_mmu:PeMmuComponent
  builtin.pe_tcm:         kernbench.components.builtin.pe_tcm:PeTcmComponent
  builtin.pe_ipcq:        kernbench.components.builtin.pe_ipcq:PeIpcqComponent
  # Custom — add your implementations here
  pe_scheduler_v2: kernbench.components.custom.pe_accel.scheduler:SchedulerV2Component
@@ -0,0 +1,441 @@
 # ADR-0018: LA-Based Memory Address Abstraction and HBM Channel Mapping Mode Introduction
 ## Status
 Proposed
 ## Context
 Kernbench simulates memory access between PE_DMA and Local-HBM within a CUBE.
 Currently, a VA-based access path is used; however, the following two channel mapping models
 are difficult to represent consistently.
 ### Background: Local-HBM Pseudo Channel Structure
 The HBM in a CUBE consists of 32 or 64 pseudo channels.
 In the PE-Local-HBM model, each PE is responsible for an equal number of pseudo channels.
 Example: 64 pseudo channels, 8 PEs per cube -> each PE accesses 8 pseudo channels as local HBM
 Both the number of pseudo channels and the number of PEs are topology parameters.
 `N = hbm_pseudo_channels / pes_per_cube` (= channels_per_pe) determines
 the number of local channels per PE.
 The routing path BW between DMA and each pseudo channel matches the BW of each pseudo channel
 (e.g., 32 GB/s), so if a PE sends simultaneous requests to N channels, it can utilize the
 maximum memory BW.
 ### Limitations of the Current VA Model
 When a PE's local HBM is split across multiple channels (e.g., 8), a separate request must be generated for each channel and sent to DMA.
 However, in the current architecture, the kernel generates requests with VA (`tl.load`)
 and passes them directly to DMA, making it difficult for PE_CPU to generate per-channel DMA requests.
 Therefore, instead of VA, we propose using **Logical Address (LA)**,
 where the **BAAW (Logical-to-Physical Mapping Unit)** inside PE_DMA
 converts LA to PA or a list of PAs based on segment-based mapping.
 ### Two Channel Mapping Modes
 - **1:1 mode**: Creates and executes per-channel requests. Precise per-channel modeling.
 - **n:1 mode (default)**: Assumes interleaving across local HBM channels. Aggregated BW modeling.
 By supporting both modes, the overhead of the n:1 mode can be measured and evaluated.
 ### Core Requirements
 - The effective bandwidth semantics of PE_DMA -> HBM_CTRL must be identical in both modes
 - The difference must only be in the request representation and resource modeling approach
 - The kernel programming model must not be changed
 - Physical channel information must not be exposed to the kernel
 ### Existing Physical Address
 The current system's 51-bit Physical Address is defined in `policy/address/phyaddr.py`:
 ```
 [50:47] rack_id (4 bit)
 [46:43] sip_id  (4 bit)
 [42:38] cube_id (5 bit, sip_seg)
 [37]    hbm_selector (1=HBM window)
 [36:0]  hbm_offset   (37 bit, 128GB per cube)
 ```
 PA is used to represent the final routable canonical physical destination,
 and this role is preserved.
 However, the timing and policy of logical access -> physical request conversion are not clearly separated.
 ---
 ## Decision
 ### D1. Introduction of LA (Logical Address) — Replacing VA
 The existing VA (Virtual Address) infrastructure is replaced with LA (Logical Address).
 #### Characteristics of LA
 - Like VA, tensors can be mapped to a contiguous memory space
 - Represents logical buffer + offset
 - Does not directly contain physical channel information
 - An intermediate abstraction maintained until physical resolution
 - The sole address scheme used by kernel code (`tl.load`, `tl.store`, `tl.composite`)
 #### LA Space Definition
 | Item | Value |
 |------|-------|
 | LA start address | `0x1_0000_0000` (4 GB, preserving the existing VA start point) |
 | LA space size | 64 GB per PE |
 | Alignment unit | Segment-based (see D3 below) |
 LA is a PE-local address space.
 Even if different PEs use the same LA value, they resolve to different PAs
 because each PE has a different BAAW segment table.
 #### VA Infrastructure Removal Scope
 With the introduction of LA, the following existing code will be replaced/removed:
 | Removal Target | Replacement |
 |----------------|-------------|
 | `policy/address/va_allocator.py` (VirtualAllocator) | LA allocator (same free-list approach, name/role changed) |
 | `policy/address/pe_mmu.py` (PeMMU) | BAAW segment table (inside PE_DMA) |
 | `components/builtin/pe_mmu.py` (PeMmuComponent) | Removed — BAAW is internal PE_DMA logic, not a separate component |
 | `runtime_api/kernel.py`: MmuMapMsg, MmuUnmapMsg | Replaced with BaawSegmentInstallMsg |
 | `runtime_api/context.py`: VA alloc + MMU mapping install | LA alloc + BAAW segment install |
 | `runtime_api/tensor.py`: `va_base` field | `la_base` field |
 | `topology.yaml`: pe_mmu component entry | Removed |
 ---
 ### D2. Mapping Mode Configuration
 The mapping mode is configured at the cube level in topology.yaml:
 ```yaml
 cube:
  memory_map:
    hbm_mapping_mode: n_to_one    # one_to_one | n_to_one
    hbm_pseudo_channels: 64       # total pseudo channel count
    hbm_channels_per_pe: 8        # local channel count per PE (= hbm_pseudo_channels / pes_per_cube)
    hbm_channel_bw_gbs: 32.0      # per-channel bandwidth
 ```
 This configuration is referenced during graph compilation (topology builder) and BAAW initialization.
 ---
 ### D3. Segments and BAAW
 #### Segment Definition
 A segment is a logical allocation unit that partitions the LA space so that each segment
 maps to a specific HBM channel or channel group.
 Segments are created by the runtime allocator during tensor deployment,
 and BAAW uses them to convert LA into physical requests.
 #### BAAW Segment Table Entry
 ```python
@dataclass
 class BaawSegment:
    """One BAAW segment-table entry: an LA range plus its physical target(s).

    Which group of fields is meaningful depends on ``mode``.
    """
    la_base: int          # segment start LA
    la_size: int          # segment size (bytes)
    mode: str             # "one_to_one" | "n_to_one"
    # 1:1 mode fields (read when mode == "one_to_one")
    channel_count: int    # number of channels assigned to this segment (e.g., 8)
    pa_bases: list[int]   # per-channel PA start address list (len = channel_count)
    channel_ids: list[int]  # per-channel logical IDs (e.g., [0,1,2,...,7])
    channel_size: int     # per-channel size (la_size // channel_count)
    # n:1 mode fields (read when mode == "n_to_one")
    agg_pa_base: int      # aggregated PA start address
    agg_node_id: str      # aggregated router node_id (for routing)
 ```
 #### Segment Lifecycle
 1. **Allocation time** (tensor deploy):
   - RuntimeContext allocates LA space from the LA allocator
   - PEMemAllocator allocates per-channel PA (1:1) or aggregated PA (n:1)
   - Sends `BaawSegmentInstallMsg` to PE_DMA to register in the segment table
 2. **Usage time** (kernel execution):
   - Kernel issues `tl.load(la_ptr)` -> DmaReadCmd(src_addr=LA)
   - PE_DMA looks up the segment corresponding to the LA in BAAW
   - Converts to PA(s) according to the mode
 3. **Deallocation time** (tensor free):
   - Removed from the segment table
   - LA space returned, PA deallocated
 ---
 ### D4. BAAW (Logical-to-Physical Mapping Unit)
 #### Location
 BAAW is placed as a front-end stage inside PE_DMA.
 It is not a separate SimPy component; it is synchronous address resolution logic
 executed at the beginning of PE_DMA's `handle_command()`.
 #### Input
 - LA (Logical Address) — DmaReadCmd.src_addr or DmaWriteCmd.dst_addr
 - access size (bytes)
 #### Output
 - 1:1 mode: `list[PhysicalRequest]` — each request is (PA, nbytes, channel_node_id)
 - n:1 mode: 1 `PhysicalRequest` — (agg_PA, nbytes, agg_node_id)
 ```python
@dataclass
 class PhysicalRequest:
    """One physical request emitted by BAAW resolution (one per channel in 1:1 mode, one total in n:1 mode)."""
    pa: int           # 51-bit Physical Address
    nbytes: int       # transfer size for this request
    dst_node: str     # target node_id (channel router or aggregated router)
 ```
 #### BAAW Resolve Logic
 ```python
 def resolve(self, la: int, nbytes: int) -> list[PhysicalRequest]:
    """Translate one logical access into its physical request(s).

    Args:
        la: Logical address of the access; must fall inside an installed segment.
        nbytes: Access size in bytes.

    Returns:
        A single aggregated request in n:1 mode, or one request per
        channel of the segment in 1:1 mode.

    Raises:
        ValueError: if the segment's ``mode`` is neither ``"n_to_one"``
            nor ``"one_to_one"``.
    """
    seg = self._find_segment(la)  # la_base <= la < la_base + la_size
    offset = la - seg.la_base

    if seg.mode == "n_to_one":
        pa = seg.agg_pa_base + offset
        return [PhysicalRequest(pa=pa, nbytes=nbytes, dst_node=seg.agg_node_id)]

    if seg.mode == "one_to_one":
        # Offset and size are identical for every channel, so compute them once.
        ch_offset = offset % seg.channel_size
        # NOTE(review): floor division drops any remainder when nbytes is not
        # a multiple of channel_count — confirm callers guarantee alignment.
        ch_nbytes = nbytes // seg.channel_count
        return [
            PhysicalRequest(
                pa=pa_base + ch_offset,
                nbytes=ch_nbytes,
                dst_node=f"{self._pe_prefix}.ch_r{ch_id}",
            )
            for pa_base, ch_id in zip(seg.pa_bases, seg.channel_ids)
        ]

    # Fail loudly instead of implicitly returning None for a bad mode.
    raise ValueError(f"Unknown BAAW mapping mode: {seg.mode!r}")
 ```
 #### Scope of Responsibility
 BAAW is responsible for:
 - Converting logical accesses into physical request units
 - Performing fan-out (1:1) or pass-through (n:1) according to the mapping mode
 - Generating Physical Addresses and determining target nodes
 BAAW is NOT responsible for:
 - Performing actual data movement
 - Executing NOC routing
 - Simulating bandwidth consumption (this is the role of downstream components)
 #### Output Contract
 The output of BAAW must be request units that can be directly used by the simulator's
 routing and resource model without any additional address decoding.
 ---
 ### D5. PE_DMA handle_command() Changes
 #### Current Flow (VA-based)
 ```
 DmaReadCmd.src_addr (VA)
  -> MMU.translate(VA) -> PA
  -> PhysAddr.decode(PA) -> PhysAddr object
  -> resolver.resolve(PhysAddr) -> dst_node_id (e.g., "sip0.cube0.hbm_ctrl")
  -> router.find_path(pe_prefix, dst_node_id) -> path
  -> 1 sub-Transaction created -> fabric inject
 ```
 #### New Flow (LA-based)
 ```
 DmaReadCmd.src_addr (LA)
  -> BAAW.resolve(LA, nbytes) -> list[PhysicalRequest]
  -> For each PhysicalRequest:
      -> router.find_path(pe_prefix, req.dst_node) -> path
      -> compute_drain_ns(path, req.nbytes) -> drain
      -> sub-Transaction created -> fabric inject
  -> Wait for all sub-Transactions to complete
  -> pe_txn.done.succeed()
 ```
 Key changes:
 - MMU reference removed -> replaced with BAAW resolve
 - PhysAddr.decode() + resolver.resolve() -> BAAW directly returns dst_node
 - 1 request -> N requests injected in parallel (1:1 mode)
 ---
 ### D6. 1:1 Mode Details
 - One logical access -> N (= `channels_per_pe`) physical requests
 - N is a parameter determined by `hbm_pseudo_channels / pes_per_cube`
 - Each request:
  - Fully resolved 51-bit PA
  - Targets a specific channel router (`{pe_prefix}.ch_r{channel_id}`)
 - BW contention modeling via per-channel links
 - PE_DMA injects N sub-transactions simultaneously
 #### 1:1 Mode Example
 Configuration: `hbm_pseudo_channels=64`, `pes_per_cube=8`
 -> `channels_per_pe=8`, PE0 owns ch0-7
 ```text
 Tensor A (4 KB) -> LA 0x1_0000_0000, size=4096 bytes
 BAAW segment: {
    la_base: 0x1_0000_0000, la_size: 4096,
    mode: "one_to_one", channel_count: 8,  # = channels_per_pe
    pa_bases: [PA_ch0, PA_ch1, ..., PA_ch7],
    channel_ids: [0, 1, 2, 3, 4, 5, 6, 7],
    channel_size: 512,  # = la_size / channel_count
 }
 BAAW resolve result (N=8 requests):
  -> PhysicalRequest(pa=PA_ch0, nbytes=512, dst_node="sip0.cube0.pe0.ch_r0")
  -> PhysicalRequest(pa=PA_ch1, nbytes=512, dst_node="sip0.cube0.pe0.ch_r1")
  -> ...
  -> PhysicalRequest(pa=PA_ch7, nbytes=512, dst_node="sip0.cube0.pe0.ch_r7")
 PE_DMA: N sub-transactions injected in parallel
  Each accesses HBM via channel router -> hbm_ctrl link (channel_bw_gbs)
  Total effective BW = N x channel_bw_gbs
 ```
 Examples with different N values:
 - `hbm_pseudo_channels=32`, `pes_per_cube=8` -> `channels_per_pe=4`, 4 requests
 - `hbm_pseudo_channels=64`, `pes_per_cube=4` -> `channels_per_pe=16`, 16 requests
 ---
 ### D7. n:1 Mode Details
 - One logical access -> one aggregated request
 - Target: aggregated router -> hbm_ctrl (see ADR-0019)
 - Aggregated link BW = `channels_per_pe` x `channel_bw_gbs` (e.g., 8 x 32 = 256 GB/s)
 - Modeled as a single queue / resource
 - No per-channel PA decomposition
 #### n:1 Mode Example
 ```
 Tensor A (4 KB) -> LA 0x1_0000_0000, size=4096 bytes
 BAAW segment: {
    la_base: 0x1_0000_0000, la_size: 4096,
    mode: "n_to_one",
    agg_pa_base: PA_agg,
    agg_node_id: "sip0.cube0.pe0.agg_router",
 }
 BAAW resolve result:
  -> PhysicalRequest(pa=PA_agg, nbytes=4096, dst_node="sip0.cube0.pe0.agg_router")
 PE_DMA: 1 sub-transaction injected
  Accesses HBM via aggregated router -> hbm_ctrl link (256 GB/s)
 ```
 ---
 ### D8. Kernel Model Preservation
 - The kernel still issues only single memory ops (`tl.load`, `tl.store`, `tl.composite`)
 - LA is the address scheme passed to the kernel
 - Channel decomposition/aggregation is performed by BAAW inside PE_DMA
 - Physical channel information is not exposed to kernel code
 ---
 ## Consequences
 ### Positive
 - 1:1 vs n:1 semantics are clearly separated at a single point: BAAW
 - Kernel abstraction is preserved — no kernel code changes required
 - Topology-based policy control is possible (mode switching via yaml)
 - Improved simulation model consistency and debuggability
 - Segment-based mapping is simpler and has lower overhead compared to page tables
 ### Negative
 - Full refactoring of VA/MMU-based code is required
 - Increased complexity in the request generation path (managing N requests in 1:1 mode)
 - Reduced per-channel visibility in n:1 mode
 - Existing VA-related tests must be rewritten
 ---
 ## Alternatives
 ### A1. Keep VA + Fan-out at MMU
 - Extend MMU to return per-channel PAs
 - Problem: MMU's role expands beyond address translation to include request decomposition
 - Problem: Aggregation representation is difficult in n:1 mode
 ### A2. Kernel Generates Channel-Aware Requests
 - Kernel directly calls per-channel load/store
 - Problem: Abstraction leakage, reduced portability
 - Problem: All benchmark code must be modified
 ### A3. Always Use PA (Without LA)
 - Runtime directly passes per-channel PA to the kernel
 - Problem: Conflicts with the aggregation model
 - Problem: Conversion timing is unclear, channel information exposed to kernel
 ---
 ## Implementation Notes
 ### Implementation Order
 1. Introduce LA type (`policy/address/la_allocator.py`)
 2. Implement BAAW segment table (`policy/address/baaw.py`)
 3. Add `BaawSegmentInstallMsg` message type (`runtime_api/kernel.py`)
 4. Integrate BAAW into PE_DMA (`components/builtin/pe_dma.py` handle_command changes)
 5. Modify RuntimeContext: LA alloc + segment install (`runtime_api/context.py`)
 6. Change Tensor.va_base -> la_base (`runtime_api/tensor.py`)
 7. Remove VA/MMU code
 8. Remove pe_mmu from topology.yaml, add mapping mode configuration
 9. Test migration
 ### Affected Existing Tests
 | Test File | Impact |
 |-----------|--------|
 | `tests/test_mmu_component.py` | Remove -> replace with BAAW segment install test |
 | `tests/test_mmu_fabric.py` | Remove -> replace with BAAW + fabric integration test |
 | `tests/test_pe_mmu.py` | Remove |
 | `tests/test_va_allocator.py` | Replace with LA allocator test |
 | `tests/test_va_integration.py` | Replace with LA + BAAW integration test |
 | `tests/test_va_offset.py` | Replace with LA offset test |
 ---
 ## Test Requirements
 - For the same logical access:
  - 1:1 -> verify N requests are generated
  - n:1 -> verify 1 aggregated request is generated
 - Verify effective bandwidth consistency across both modes
 - 1:1 -> verify per-channel contention modeling
 - n:1 -> verify aggregated bandwidth is reflected
 - Verify operation without kernel code changes
 - Verify correct BAAW segment install/uninstall operation
 - Verify no conflicts when multiple tensors are assigned to different segments
 ---
 ## Links
 - ADR-0011 (Memory Addressing Simplification — PA-first, VA/MMU introduction) -> superseded by this ADR
 - ADR-0019 (NOC Per-Channel HBM Connection Model) -> topology-side integration
 - ADR-0014 (PE Internal Execution Model) -> PE_DMA change impact
@@ -0,0 +1,431 @@
 # ADR-0019: Per-Channel and Aggregated HBM Connection Models within CUBE NOC
 ## Status
 Proposed
 ## Context
 ADR-0018 introduced LA-based address abstraction and BAAW,
 defining how a logical memory access is translated into the following two forms of requests:
 - 1:1 mode: one logical access → N per-channel requests
 - n:1 mode: one logical access → one aggregated request
 Here N = `hbm_pseudo_channels / pes_per_cube` (= `channels_per_pe`),
 determined by topology parameters.
 ### Problems with the Existing Structure
 In the current implementation (`topology/builder.py`):
 - PE_DMA → NOC → xbar_top/xbar_bot → HBM_CTRL.slice{0-7} path is used
 - HBM is modeled as 8 slice (= per-PE) nodes
 - Local/remote access use different paths:
  - local: NOC → xbar → HBM slice
  - cross-half: NOC → xbar_top → bridge → xbar_bot → HBM slice
  - remote cube: NOC → UCIe → remote NOC → remote xbar → remote HBM slice
 Limitations of this structure:
 - Cannot model at the pseudo-channel granularity (slice = per-PE granularity, not per-channel)
 - xbar/bridge bifurcate local/remote paths
 - Cannot express 1:1 / n:1 modes consistently
 ---
 ## Decision
 ### D1. HBM Attaches to PE Routers
 Consolidate the current `hbm_ctrl.slice{0-7}` (8 nodes) into a **single `hbm_ctrl` node**,
 and attach the HBM access point to the same router where the PE is attached.
 - n:1 mode: PE's local HBM access goes directly from its own router (switching overhead only, 0 hops)
 - Remote PE's HBM access: reaches the target PE's router via mesh hops
 - The read/write resource model within the HBM controller is preserved
 Node naming changes:
 | Current | After Change |
 | ---- | ------- |
 | `sip0.cube0.hbm_ctrl.slice0` ~ `slice7` | `sip0.cube0.hbm_ctrl` (single) |
 In `mesh_gen.py`, add `pe{idx}.hbm` to the PE attachment so that
 the builder generates an edge between that router and hbm_ctrl.
 ---
 ### D2. Complete Removal of xbar, bridge, and Single NOC Node
 Remove all of the following nodes and related edges:
 - `{cube}.xbar_top`, `{cube}.xbar_bot`
 - `{cube}.bridge.left`, `{cube}.bridge.right`
 - `{cube}.noc` (single TwoDMeshNocComponent node)
 - Edges of type `noc_to_xbar`, `xbar_to_noc`, `xbar_to_hbm`, `hbm_to_xbar`
 - Edges of type `xbar_to_bridge`, `bridge_to_xbar`
 - Edges of type `pe_to_noc`, `noc_to_pe`, `noc_to_pe_cpu`, etc. referencing the single noc node
 Their role is replaced by an **explicit router mesh based on cube_mesh.yaml**.
 Each router (r0c0, r0c1, ...) from the 6x6 router grid generated by `mesh_gen.py`
 is created as a separate SimPy node in the topology graph,
 and adjacent routers are connected via XY mesh edges.
 ---
 ### D3. Explicit Router Mesh (Common Basis for n:1 / 1:1)
 #### Router Nodes Based on cube_mesh.yaml
 Each non-null router from cube_mesh.yaml generated by `mesh_gen.py`
 is created as a **separate SimPy node** in the topology graph.
 - Node ID: `{cube}.r{row}c{col}` (e.g., `sip0.cube0.r0c0`)
 - kind: `noc_router`, impl: `forwarding_v1`
 - pos_mm: taken from cube_mesh.yaml
 Based on the attach information in cube_mesh.yaml, components are connected to each router:
 - `pe{p}.dma` → PE_DMA ↔ router edge
 - `pe{p}.cpu` → PE_CPU ↔ router edge
 - `pe{p}.hbm` → HBM_CTRL ↔ router edge (added in n:1)
 - `m_cpu` → M_CPU ↔ router edge
 - `sram` → SRAM ↔ router edge
 - `ucie_{dir}.c{i}` → UCIe conn ↔ router edge
 Router-to-router XY mesh edges: bidirectional edges between adjacent routers.
 Null routers (HBM exclusion zones) are skipped.
 #### 1:1 Mode Extension (To Be Implemented Later)
 In 1:1 mode, each router is split into N per-channel mini-routers.
 Per-channel routing and ChannelSplitter (LA → per-channel PA) introduction are required.
 N GEMM engines per PE are also added at this point.
 ---
 ### D4. Cross-PE HBM Access (n:1 Mode)
 In n:1 mode, when a PE accesses another PE's local HBM,
 it hops through the XY mesh in cube_mesh.yaml to reach the target PE's router.
 Example: PE0 (r0c0) accessing PE2's (r1c4) HBM:
 ```text
 PE0.pe_dma → r0c0 → r0c1 → r0c2 → r0c3 → r0c4 → r1c4 → hbm_ctrl
 ```
 The Dijkstra router finds the shortest path in the mesh.
 Cross-PE channel access in 1:1 mode will be defined during the 1:1 extension in D3.
 ---
 ### D5. n:1 Mode: Uses cube_mesh.yaml Router Mesh
 In n:1 mode, no separate "aggregated router" is created.
 The existing router grid from cube_mesh.yaml serves that role.
 #### Connection Structure
 PE_DMA, PE_CPU, and HBM are all connected to the router where each PE is attached:
 ```text
 sip0.cube0.pe0.pe_dma ←→ sip0.cube0.r0c0  (bw: N × channel_bw_gbs)
 sip0.cube0.hbm_ctrl   ←→ sip0.cube0.r0c0  (bw: N × channel_bw_gbs)
 ```
 Routers are connected via XY mesh edges. PE's local HBM access goes
 directly from its own router (switching overhead only).
 #### n:1 Mode Full Data Paths
 **Local HBM (0 hops):**
 ```text
 PE0.pe_dma → r0c0 → hbm_ctrl  (switching overhead only)
 ```
 **Remote HBM (mesh hops):**
 ```text
 PE0.pe_dma → r0c0 → r0c1 → ... → r1c4 → hbm_ctrl
 ```
 **M_CPU DMA:**
 ```text
 M_CPU → r2c0 → (mesh hops) → r{x}c{y} → hbm_ctrl
 ```
 ---
 ### D6. All Traffic Is Unified onto the Same Router Mesh
 - All memory accesses (DMA data) and commands (PE_CPU) use the same router mesh
 - Local access does not use a separate fast path (xbar)
 - Cross-cube (remote) access path:
 ```text
 PE_DMA → r{x}c{y} → (mesh hops) → ucie_conn → ucie-{PORT}
  → [UCIe link] → remote ucie → remote conn → remote r{x}c{y} → hbm_ctrl
 ```
 UCIe connections maintain the existing structure,
 but both endpoints become mesh routers instead of xbars.
 The number of UCIe lines is determined by BW ratio: `ucie_lines_per_side = ceil(ucie_bw / noc_line_bw)`.
 ---
 ### D7. AddressResolver Changes
 Current `AddressResolver.resolve()`:
 ```python
 # Current: HBM offset → pe_slice → "sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
 pe_slice = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
 return f"sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
 ```
 After change:
 ```python
 # Changed: HBM → single endpoint
 return f"sip{s}.cube{c}.hbm_ctrl"
 ```
 The pe_slice calculation is removed.
 In n:1 mode, PE_DMA directly accesses the hbm_ctrl attached to its own router.
 resolver.resolve() is retained for external access (M_CPU DMA, etc.) and backward compatibility.
 ---
 ### D8. topology.yaml Configuration Changes
 #### Added Settings
 ```yaml
 cube:
  memory_map:
    hbm_mapping_mode: n_to_one          # one_to_one | n_to_one
    hbm_pseudo_channels: 64             # total pseudo channel count
    hbm_channels_per_pe: 8              # local channels per PE (= pseudo_channels / pes_per_cube)
    hbm_channel_bw_gbs: 32.0            # per-channel bandwidth (GB/s)
    hbm_total_gb_per_cube: 48           # retained
 ```
 #### Removed Settings
 ```yaml
 # To be removed
 links:
  xbar_to_hbm_bw_gbs: 256.0            # → replaced by channel_bw_gbs × channels_per_pe
  xbar_to_hbm_mm: 2.5                  # → replaced by ch_router_to_hbm_mm
  xbar_to_bridge_bw_gbs: 128.0         # → removed (no bridge)
  xbar_to_bridge_mm: 3.0               # → removed
  noc_to_xbar_bw_gbs: ...              # → removed
  noc_to_xbar_mm: ...                  # → removed
 ```
 #### Added Link Settings
 ```yaml
 links:
  router_link_bw_gbs: 256.0            # XY mesh link BW between routers
  router_overhead_ns: 2.0              # router switching overhead
  pe_to_router_bw_gbs: 256.0           # PE_DMA ↔ router
  hbm_to_router_bw_gbs: 256.0          # HBM ↔ router (= N × channel_bw)
 ```
 ---
 ### D9. Bandwidth Numerical Consistency
 | Configuration | Value |
 | ---- | --- |
 | pseudo channels per cube | 64 (parameter) |
 | PEs per cube | 8 (parameter) |
 | channels per PE (N) | `pseudo_channels / pes_per_cube` = 8 |
 | per-channel BW | 32 GB/s (parameter) |
 | per-PE local BW | N × 32 = 256 GB/s |
 | cube total HBM BW | 64 × 32 = 2048 GB/s |
 The effective BW per PE is identical in both modes:
 - 1:1 mode: N channel links × channel_bw_gbs = N × 32 = 256 GB/s
 - n:1 mode: 1 aggregated link = N × channel_bw_gbs = 256 GB/s
 ---
 ## Consequences
 ### Positive
 - The router mesh based on cube_mesh.yaml accurately reflects physical placement
 - In n:1 mode, the existing VA scheme is preserved, keeping transition costs low
 - Local / remote / command traffic is unified onto the same mesh, which simplifies the routing model
 - Aligns well with graph compiler-based topology generation
 - Channel count and PE count are both parameterized, enabling testing of various configurations
 - 1:1 mode extension naturally follows through router differentiation
 ### Negative
 - The number of SimPy nodes increases due to explicit router nodes (a 6x6 grid has 36 positions; with null routers skipped, up to 32 routers/cube)
 - Requires complete rewrite of existing xbar/bridge/single NOC-based tests
 - The internal contention model of TwoDMeshNocComponent needs to be replaced with a per-router model
 ---
 ## Alternatives
 ### A1. Retain Existing xbar + HBM Slices
 - Local/remote paths remain bifurcated
 - Cannot model at pseudo-channel granularity
 - Cannot switch between 1:1/n:1 modes
 ### A2. Always Generate Per-Channel Links and Aggregate Only in n:1
 - Topology structure always has 1:1 size
 - Expressing n:1 semantics via link aggregation is complex
 - No reduction in router node count
 ### A3. Gradual Transition (Retain xbar + Add NOC Path)
 - Higher compatibility, but dual-path coexistence increases complexity
 - Since xbar removal is ultimately necessary, the intermediate step provides little value
 ---
 ## Implementation Notes
 ### topology/builder.py Change Details
 #### Code to Remove (within current `_instantiate_cube()`)
 - xbar_top, xbar_bot node creation (~line 495-508)
 - bridge.left, bridge.right node creation
 - noc ↔ xbar edge creation (~line 540-555)
 - xbar ↔ hbm_ctrl.slice edge creation (~line 510-538)
 - xbar ↔ bridge edge creation (~line 557-572)
 #### Code to Add
 1:1 mode:
 ```python
 N = hbm_channels_per_pe  # from topology config
 total_ch = hbm_pseudo_channels
 # Create channel router nodes
 for ch_id in range(total_ch):
    pe_id = ch_id // N
    nodes[f"{cp}.ch_r{ch_id}"] = Node(
        id=f"{cp}.ch_r{ch_id}", kind="noc_router", impl="noc_v1",
        attrs={}, pos_mm=(...),  # horizontal row = ch_id % N
    )
 # PE_DMA ↔ local channel router edges
 for pe_id in range(pes_per_cube):
    for local_ch in range(N):
        ch_id = pe_id * N + local_ch
        edges.append(Edge(
            src=f"{cp}.pe{pe_id}.pe_dma", dst=f"{cp}.ch_r{ch_id}",
            bw_gbs=channel_bw, kind="pe_to_ch_router", ...))
        edges.append(Edge(
            src=f"{cp}.ch_r{ch_id}", dst=f"{cp}.pe{pe_id}.pe_dma",
            bw_gbs=channel_bw, kind="ch_router_to_pe", ...))
 # Channel router ↔ hbm_ctrl edges
 for ch_id in range(total_ch):
    edges.append(Edge(
        src=f"{cp}.ch_r{ch_id}", dst=f"{cp}.hbm_ctrl",
        bw_gbs=channel_bw, kind="ch_router_to_hbm", ...))
    edges.append(Edge(
        src=f"{cp}.hbm_ctrl", dst=f"{cp}.ch_r{ch_id}",
        bw_gbs=channel_bw, kind="hbm_to_ch_router", ...))
 # Horizontal line edges (same logical index)
 for row in range(N):
    for p in range(pes_per_cube - 1):
        ch_a = p * N + row
        ch_b = (p + 1) * N + row
        edges.append(Edge(
            src=f"{cp}.ch_r{ch_a}", dst=f"{cp}.ch_r{ch_b}",
            bw_gbs=ch_horizontal_bw, kind="ch_horizontal", ...))
        edges.append(Edge(
            src=f"{cp}.ch_r{ch_b}", dst=f"{cp}.ch_r{ch_a}",
            bw_gbs=ch_horizontal_bw, kind="ch_horizontal", ...))
 ```
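 n:1 mode (note: D5 states that no separate aggregated router is created in n:1 mode and that the cube_mesh.yaml router grid serves that role; the `agg_router` sketch below must be reconciled with D5 before implementation):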
 ```python
 # Create aggregated router nodes
 for pe_id in range(pes_per_cube):
    nodes[f"{cp}.pe{pe_id}.agg_router"] = Node(
        id=f"{cp}.pe{pe_id}.agg_router", kind="noc_router", impl="noc_v1",
        attrs={}, pos_mm=(...),
    )
 agg_bw = N * channel_bw  # aggregated BW
 # PE_DMA ↔ aggregated router
 for pe_id in range(pes_per_cube):
    edges.append(Edge(
        src=f"{cp}.pe{pe_id}.pe_dma", dst=f"{cp}.pe{pe_id}.agg_router",
        bw_gbs=agg_bw, kind="pe_to_agg_router", ...))
    edges.append(Edge(
        src=f"{cp}.pe{pe_id}.agg_router", dst=f"{cp}.pe{pe_id}.pe_dma",
        bw_gbs=agg_bw, kind="agg_router_to_pe", ...))
 # Aggregated router ↔ hbm_ctrl
 for pe_id in range(pes_per_cube):
    edges.append(Edge(
        src=f"{cp}.pe{pe_id}.agg_router", dst=f"{cp}.hbm_ctrl",
        bw_gbs=agg_bw, kind="agg_to_hbm", ...))
    edges.append(Edge(
        src=f"{cp}.hbm_ctrl", dst=f"{cp}.pe{pe_id}.agg_router",
        bw_gbs=agg_bw, kind="hbm_to_agg", ...))
 # Horizontal links between aggregated routers
 for p in range(pes_per_cube - 1):
    edges.append(Edge(
        src=f"{cp}.pe{p}.agg_router", dst=f"{cp}.pe{p+1}.agg_router",
        bw_gbs=agg_horizontal_bw, kind="agg_horizontal", ...))
    edges.append(Edge(
        src=f"{cp}.pe{p+1}.agg_router", dst=f"{cp}.pe{p}.agg_router",
        bw_gbs=agg_horizontal_bw, kind="agg_horizontal", ...))
 ```
 ### Affected Existing Tests
 | Test File | Impact |
 | ---------- | ---- |
 | `tests/test_topology_compile.py` | Remove xbar/bridge node references, add channel router verification |
 | `tests/test_topology_load.py` | Reflect topology.yaml configuration changes |
 | `tests/test_pe_components.py` | PE_DMA routing path changes |
 | `tests/test_sip_parallel.py` | Cross-PE access path changes |
 | Cases that directly test xbar/bridge | Remove |
 ---
 ## Test Requirements
 - Verify that requests are delivered via per-channel links in 1:1 mode
 - Verify that requests are delivered via the aggregated link in n:1 mode
 - Verify that topology is correctly generated in both modes:
  - 1:1: `total_ch` channel routers + per-PE links + horizontal links
  - n:1: `pes_per_cube` aggregated routers + per-PE links
 - Verify that effective BW is consistent across both modes for the same workload
 - Verify that horizontal line routing works for cross-PE access
 - Verify that routing through UCIe works for cross-cube access
 - Verify that topology generation is correct under parameter variations (channels_per_pe = 4, 8, 16, etc.)
 ---
 ## Links
 - ADR-0018 (LA + BAAW) → addressing-side integration
 - ADR-0017 (Cube NOC 2D Mesh) → this ADR replaces the xbar/bridge portion
 - ADR-0004 (Memory Semantics) → BW model redefinition
 - ADR-0014 (PE Internal Execution Model) → impact from PE_DMA path changes
@@ -0,0 +1,553 @@
 # ADR-0020: 2-Pass Data Execution Model (Timing / Data Separation)
 ## Status
 Proposed
 ## Context
 The current simulation models **timing only**.
 `tl.load()`, `tl.composite(op="gemm")`, etc. generate SimPy latencies,
 but do not actually read tensor data or perform computations.
 ### Required Capabilities
 1. Must be able to store and read actual data in HBM/TCM/SRAM
 2. PE_GEMM, PE_MATH must be able to perform actual matrix operations and verify results
 3. Must minimize simulation performance degradation
 ### Limitations of the Existing Kernel Execution Structure
 The current kernel execution is separated into two stages:
 ```
 Phase 0: Kernel function execution in TLContext → PeCommand list generation (outside SimPy, no data)
 Phase 1: PE_CPU replays PeCommand list via SimPy (timing only)
 ```
 Phase 0 requires the kernel to **complete execution entirely** before SimPy begins.
 `tl.load()` returns a TensorHandle (placeholder), so actual data cannot be accessed.
 Therefore, branching based on data values (dynamic control flow) is impossible.
 This ADR resolves this limitation **for memory operations only** (see D1, D3).
 ### Constraints
 - SimPy is a single-thread event loop — running numpy matmul inside it blocks everything
 - Components must be replaceable (ADR-0015) — framework requirements must not leak into implementations
 - Benchmark kernels are imperative code (tl.load → tl.composite → tl.wait) — the same code must be reused
 - Kernel functions must remain plain Python functions (no generator/async transformation)
 ### Design Exploration Results
 | Option | Approach | Verdict |
 |--------|----------|---------|
 | Direct execution in SimPy | Call numpy GEMM inside SimPy | Rejected: single-thread block |
 | SimPy + ThreadPool | future.submit → timeout → result() | Rejected: blocks on result() for back-to-back requests |
 | Symbolic + lazy | Track metadata only, execute later | Rejected: difficult to handle control-flow dependent reads |
 | **2-pass (adopted)** | Phase 1: timing, Phase 2: data | Full separation, no performance impact |
 ---
 ## Decision
 ### D1. 2-Pass Execution Model — Phase 0 Elimination
 Phase 0 is eliminated, so the pipeline that would otherwise have 3 stages (Phase 0 → Phase 1 → Phase 2) is **consolidated into 2 stages**.
 Before:
 ```
 Phase 0: Kernel → PeCommand list (no data, no branching)
 Phase 1: Replay PeCommand list via SimPy (timing only)
 ```
 After:
 ```
 Phase 1 (timing): Kernel + SimPy integrated execution — greenlet-based
  - Memory read/write: SimPy timing + MemoryStore actual data
  - Compute (GEMM/Math): SimPy timing + op_log recording (actual computation in Phase 2)
  - Dynamic control flow possible (tl.load returns actual data)
 Phase 2 (data): Actual computation execution based on op_log — outside SimPy, parallelizable
 ```
 This ADR **extends Phase 1 to be data-aware for memory operations only**.
 Phase 1 handles latency/BW bottleneck analysis + memory data tracking,
 Phase 2 handles GEMM/Math computation correctness verification.
 Phase 2 is optional — if only timing is needed, run Phase 1 alone.
 ### D2. Op Log Recording — ComponentBase Hook
 Op log recording is performed as a **hook in the component base class**.
 Individual component implementations are not modified.
 ```python
 class ComponentBase:
    def _on_process_start(self, env, msg):
        if self._op_logger and getattr(msg, 'data_op', False):
            self._op_logger.record_start(env.now, self.node.id, msg)
    def _on_process_end(self, env, msg):
        if self._op_logger and getattr(msg, 'data_op', False):
            self._op_logger.record_end(env.now, self.node.id, msg)
 ```
 Hooks are called before and after `run()` within `_forward_txn()`.
 `_op_logger` is optional — zero overhead when absent.
 **Hook timing definitions**:
 | Timing | Meaning |
 |--------|---------|
 | `t_start` | The point at which the component **begins servicing** the msg (immediately before `run()` entry) |
 | `t_end` | The point at which the component's **internal service completes** (immediately after `run()` returns) |
 Link traversal latency is not included in t_start/t_end.
 Link latency is observed as the difference between the sending component's t_end and the receiving component's t_start.
 ### D3. Greenlet-Based Kernel Execution — Phase 0 Elimination
 The existing Phase 0 (kernel → PeCommand list) is eliminated,
 and **greenlet** is used to cooperatively interleave kernel and SimPy execution.
 #### Operating Principle
 greenlet is a C extension that provides cooperative context switching.
 When the kernel (child greenlet) calls `tl.load()` etc., it switches to the SimPy loop (parent greenlet)
 to perform timing simulation, and after completion, returns to the kernel with actual data.
 ```
 SimPy loop (parent greenlet)           Kernel (child greenlet)
 ─────────────────────────              ──────────────────────
 g.switch() ─────────────────────────→ Kernel starts
                                       a = tl.load(ptr, ...)
                                         internal: parent.switch(DmaReadCmd)
 cmd = DmaReadCmd ←──────────────────  (kernel paused)
  yield DmaReadMsg(...)
  yield env.timeout(dma_latency)
  data = memory_store.read(...)
 g.switch(data) ─────────────────────→ (kernel resumed)
                                       a = data  ← actual numpy array
                                       if a[0][0] > 0.5:  ← branching possible
                                         ...
 ```
 The kernel is maintained as a **plain Python function**.
 greenlet switches exist **only within the internal implementation** of `tl.load()`, `tl.store()`, etc.
 #### KernelRunner — Framework Layer
 The greenlet loop resides not in the PE_CPU component but in the framework layer,
 **KernelRunner**.
 ```python
 # KernelRunner (framework — greenlet ↔ SimPy bridge)
 class KernelRunner:
    def run(self, env, kernel_fn, args, store):
        g = greenlet(self._run_kernel)
        cmd = g.switch(kernel_fn, args)
        while cmd is not None:
            if isinstance(cmd, DmaReadCmd):
                yield from self._dispatch_dma(env, cmd)
                data = store.read(cmd.src_addr, cmd.shape, cmd.dtype)
                cmd = g.switch(data)            # resume with actual data
            elif isinstance(cmd, GemmCmd):
                yield from self._dispatch_gemm(env, cmd)
                cmd = g.switch()                # resume (no data)
            elif isinstance(cmd, DmaWriteCmd):
                store.write(cmd.dst_addr, cmd.data)  # visibility = issue time
                yield from self._dispatch_dma(env, cmd)  # timing only
                cmd = g.switch()
 # PE_CPU (component — kept simple, unaware of greenlet)
 def _execute_kernel(self, env):
    runner = KernelRunner(self.ctx)
    yield from runner.run(env, kernel_fn, args, store)
 ```
 **Op logging single source of truth**: KernelRunner does not record directly to op_log.
 All op logging is handled **solely by the ComponentBase hook (_on_process_start/end)**.
 When KernelRunner delivers messages to components via `_dispatch_gemm()` etc.,
 the component base class hooks automatically record them.
 **Layer separation**:
 - **Kernel code**: plain function, unaware of greenlet
 - **TLContext**: calls `parent.switch(cmd)` inside `tl.load()`
 - **KernelRunner**: greenlet ↔ SimPy bridge, handles MemoryStore read/write. **Does not log**.
 - **ComponentBase hook**: the sole path for op_log recording
 - **PE_CPU**: only calls KernelRunner, replaceable as a component
 #### Handling Differences Between Memory Read/Write and Compute
 | Operation | In Phase 1 | In Phase 2 |
 |-----------|-----------|-----------|
 | `tl.load()` | SimPy timing + MemoryStore read → **actual data returned** | — |
 | `tl.store()` | SimPy timing + MemoryStore write → **actual write** | — |
 | `tl.composite(gemm)` | SimPy timing + **op_log recording only** | numpy actual computation |
 | `tl.dot()` / math ops | SimPy timing + **op_log recording only** | numpy actual computation |
 Memory read/write is processed immediately in Phase 1 (numpy slice, fast).
 GEMM/Math operations are batch-executed in Phase 2 (performance separation).
 #### Store Visibility Rule
 `tl.store()` is **immediately reflected in MemoryStore at issue time** (visibility = issue).
 SimPy DMA timing is simulated separately afterward.
 This is an intentional separation of timing and visibility:
 - **visibility**: the point at which it is reflected in MemoryStore = when `store.write()` is called
 - **timing**: the point at which DMA latency completes in SimPy
 This separation allows a load immediately after a store to see the latest data in dynamic control flow.
 #### Result Handle Semantics
 `tl.composite()` (sync/async) returns a **handle** referencing the result tensor.
 The key contract in Phase 1:
 1. **All compute handles are always considered pending in Phase 1.**
 2. `tl.wait(handle)` **expresses timing synchronization only**
   and does not make the handle ready.
 3. Accessing the handle's actual result data (`handle.data`, element access,
   numpy conversion, etc.) is **only possible in Phase 2**.
 4. Therefore, **compute-result-based control flow is not supported in Phase 1.**
 5. In contrast, `tl.load()` returns actual data in Phase 1, so
   **memory-read-based control flow is supported**.
 | Handle state | Phase | Allowed operations |
 |------------|-------|----------|
 | pending | Phase 1 | `tl.wait(handle)` — timing synchronization only |
 | pending | Phase 1 | Pass handle as target of `tl.store()` (logical destination binding only, payload in Phase 2) |
 | pending | Phase 1 | **Data access not allowed** — value-based branching not possible |
 | ready | Phase 2 | Actual numpy data access, verification |
 This restriction is intentional. If computations were executed in Phase 1,
 the SimPy single-thread would block, defeating the purpose of 2-pass separation.
 #### Phase 1 Materialization — Future Extension
 If Phase 1 eager execution becomes necessary for small operations
 (scalar, small reduction) in the future, selective materialization can be supported
 by adding a `materialized_in_phase1: bool` flag to the op record.
 This is not implemented in the current scope.
 ### D4. data_op Flag — Message Self-Declaration
 The logging target is determined by the `data_op` attribute on the message instance,
 not by message type. The framework does not hardcode message types.
 ```python
 class MsgBase:
    data_op: bool = False       # default: no logging
 class DmaReadCmd(MsgBase):
    data_op = True              # memory transfer → logging
 class GemmCmd(MsgBase):
    data_op = True              # compute → logging
 class MathCmd(MsgBase):
    data_op = True              # compute → logging
 ```
 When adding a new message type (e.g., IpcqMsg), simply setting `data_op = True`
 enables automatic logging without modifying framework code.
 ### D5. Op Log Structure
 #### Op Classification Scheme
 A two-level classification is used:
 | Level | Field | Role |
 |-------|-------|------|
 | `op_kind` | `memory` \| `gemm` \| `math` | executor dispatch criterion |
 | `op_name` | `dma_read` \| `dma_write` \| `gemm_f16` \| `exp` \| `add` \| `sum` etc. | specific operation identification |
 #### OpRecord Definition
 ```python
@dataclass
 class OpRecord:
    t_start: float              # SimPy time (ns) — service start
    t_end: float                # SimPy time (ns) — service completion
    component_id: str           # e.g. "sip0.cube0.pe0.pe_gemm"
    op_kind: str                # "memory" | "gemm" | "math"
    op_name: str                # specific operation name
    params: dict                # per-operation parameters (see below)
    dependency_ids: list[int]   # currently based on in-memory record index, may be replaced with stable op_id in the future
 ```
 #### dependency_ids Generation Rules
 `dependency_ids` is **optional**, and by default the executor performs
 address-based dependency inference (see D6).
 Explicit setting is only needed when precise execution ordering is required:
 - **Default (address-based inference)**: the executor analyzes read/write sets to
  automatically infer RAW/WAW/WAR dependencies. This is sufficient for most cases.
 - **Explicit setting**: set when logical dependencies cannot be expressed via addresses
  at the TLContext or command generation stage.
  Example: completion handle-based synchronization — handle dependencies depend on
  logical completion order rather than memory addresses, so they cannot be captured
  by address inference.
 #### op_log Ordering
 The op_log maintains **stable ordering** based on `t_start`.
 Records with the same `t_start` preserve insertion order.
 #### params Details
 **memory (dma_read / dma_write)**:
 ```python
 {
    "src_addr": int,            # source address (byte)
    "dst_addr": int,            # destination address (byte)
    "nbytes": int,              # transfer size
    "src_space": str,           # "hbm" | "tcm" | "sram"
    "dst_space": str,           # "hbm" | "tcm" | "sram"
 }
 ```
 **gemm**:
 ```python
 {
    "src_a_addr": int,          # operand A address
    "src_b_addr": int,          # operand B address
    "dst_addr": int,            # output address
    "shape_a": tuple,           # e.g. (128, 256)
    "shape_b": tuple,           # e.g. (256, 128)
    "shape_out": tuple,         # e.g. (128, 128)
    "dtype_in": str,            # e.g. "f16"
    "dtype_acc": str,           # accumulation dtype, e.g. "f32"
    "dtype_out": str,           # output dtype, e.g. "f16"
    "transpose_a": bool,
    "transpose_b": bool,
    "layout_a": str,            # "row_major" | "col_major"
    "layout_b": str,
    "layout_out": str,
    "addr_space": str,          # "tcm" (GEMM operands are always in TCM)
 }
 ```
 **math**:
 ```python
 {
    "op": str,                  # "exp" | "add" | "sum" | "where" | ...
    "input_addrs": list[int],   # list of operand addresses
    "input_shapes": list[tuple],
    "dst_addr": int,
    "shape_out": tuple,
    "dtype": str,
    "axis": int | None,         # reduction axis
    "addr_space": str,          # "tcm"
 }
 ```
 ### D6. Phase 2 Executor
 Phase 2 executes the op_log outside of SimPy.
 ```python
 class DataExecutor:
    def __init__(self, op_log: list[OpRecord], initial_store: MemoryStore):
        self.store = initial_store  # Takes the Phase 1 MemoryStore snapshot as input
    def run(self):
        for t, ops in groupby(op_log, key=lambda o: o.t_start):
            batch = list(ops)
            independent, sequential = self._classify(batch)
            self._execute_parallel(independent)
            self._execute_sequential(sequential)
 ```
 **Parallel execution determination**:
 Ops with the same `t_start` are considered **parallel candidates**.
 The executor determines actual parallel execution based on the following criteria:
 - Whether read/write address ranges overlap (WAW, RAW, WAR conflict checks)
 - Whether predecessor ops specified in `dependency_ids` have completed
 Only ops with no overlapping address ranges and no explicit dependencies are executed in parallel.
 **Batch optimization**: Only independent ops with the same op_name **and identical
 shape, dtype, layout, and transpose flags** are eligible for batching.
 Example: identical shape GEMMs from multiple PEs → bundled into a single `np.matmul(a_batch, b_batch)` call.
 Improves BLAS efficiency on CPU, reduces launch overhead on GPU.
 **Phase 2 execution order guarantee**:
 Phase 2 does not consider data arrival timing,
 and guarantees execution order solely through
 dependencies (address-based inference + explicit dependency_ids).
 ### D7. Memory Store
 `MemoryStore` logically follows byte-addressable semantics,
 and the current implementation uses **tensor-granular storage** (addr → numpy ndarray mapping).
 ```python
 class MemoryStore:
    def write(self, space: str, addr: int, data: np.ndarray) -> None: ...
    def read(self, space: str, addr: int, shape: tuple, dtype: str) -> np.ndarray: ...
 ```
 **Internal storage format: numpy ndarray**
 MemoryStore stores tensors as **numpy ndarrays**.
 | Candidate | store/load speed | Phase 2 compute | Verdict |
 |-----------|-----------------|-----------------|---------|
 | **numpy ndarray** | Immediate (reference passing, no copy) | `np.matmul` directly usable | **Adopted** |
 | bytearray | Requires memcpy | Requires `np.frombuffer` conversion | Rejected |
 | torch tensor | Immediate | torch operations available | Use only for GPU optimization |
 - write: **stores numpy array by reference** (no copy) → Phase 1 overhead = 1 dict lookup
 - read: **returns numpy array by reference** (no copy)
 - Re-writing to the same addr **overwrites at tensor granularity** (partial overwrite not supported)
 - dtype uses numpy dtypes (`np.float16`, `np.float32`, etc.); NumPy has no built-in `bfloat16`, so bf16 requires an extension dtype (e.g. `ml_dtypes.bfloat16`)
 - For byte-level access, convert via `.view(np.uint8)`
 - For GPU batch optimization in Phase 2, numpy → torch tensor conversion is the executor's responsibility
 **read/write contract**:
 - read/write operates on a **contiguous tensor** basis.
  If non-contiguous stride views are needed, express them as separate copy ops.
 - In the normal benchmark path, producer/consumer dtype match is expected.
  Reinterpret cast is a permissive behavior for low-level memory validation
  or special test cases.
 - addr is byte-aligned, with minimum alignment = dtype size.
 - dtype mismatch (reading with a different dtype than written) is handled as a reinterpret cast.
  Shape mismatch is verified based on nbytes, and raises an error on mismatch.
 - Correctness criteria follow address-range-based read/write semantics.
 - A tensor object cache may be used as an implementation optimization,
  but the canonical state is byte-addressable storage.
 - At deploy time, the host injects initial tensor data.
 ### D8. Benchmark Kernel Code
 The benchmark's **user code API is not changed**.
 The call interfaces for `tl.load()`, `tl.composite()`, `tl.store()`, etc. are maintained.
 However, internal command/message schemas may be extended to include metadata
 required for Phase 2 execution (e.g., additional fields such as dtype_acc, transpose).
 ### D9. No Component Changes
 Individual component implementations (PE_GEMM, PE_DMA, HBM_CTRL, etc.) are not modified.
 Op log recording is the responsibility of the ComponentBase hook.
 When custom components are replaced, only the timing model changes,
 and Phase 2 data execution is unaffected.
 ### D10. Phase 2 is Optional
 ```python
 engine = GraphEngine(graph)
 engine.run(benchmark)                       # Phase 1: timing only
 result = engine.get_timing_result()
 if verify_data:
    executor = DataExecutor(engine.op_log)  # Phase 2: data
    executor.run()
    executor.verify(expected_output)
 ```
 If only timing analysis is needed, Phase 2 is skipped.
 If the op_logger is deactivated, Phase 1 performance is identical to the original.
 ### D11. Verification Contract
 Basic verification **compares the final output tensor** against a reference backend (numpy).
 Per-dtype tolerance policy:
 | dtype | Comparison method | Tolerance |
 |-------|----------|-----------|
 | f32 | `np.allclose` | rtol=1e-5, atol=1e-5 |
 | f16 | `np.allclose` | rtol=1e-3, atol=1e-3 |
 | bf16 | `np.allclose` | rtol=1e-2, atol=1e-2 |
 | int types | `np.array_equal` | exact |
 - Default mode: compare final output only (end-to-end correctness)
 - Debug mode: can compare intermediate tensors on a per-op basis
  (MemoryStore snapshot at each op boundary)
 ---
 ## Non-goals
 - **Compute-result-based control flow**: not supported.
  All compute handles are in pending state during Phase 1,
  `wait()` expresses timing synchronization only and does not imply data readiness.
  Accessing `handle.data`, element access, or truth-value evaluation in Phase 1
  is **treated as an error**.
  Memory-data-based branching (results of `tl.load()`) is supported via greenlet.
  Phase 1 materialization is a future extension (see D3).
 - **Cycle-accurate overlap reconstruction**: Phase 2 does not precisely reproduce
  the execution time overlap from Phase 1. Phase 2 only verifies data correctness.
 - **GPU kernel compilation**: GEMM/Math in Phase 2 are numpy/torch calls
  and do not reproduce the actual hardware PE microarchitecture.
 ## Open Questions
 - **Aliasing / slice view**: How to represent slice/views referencing the same
  backing storage in MemoryStore (stride-based view vs copy semantics)
 - **IPCQ/descriptor read generalization**: Whether to fully generalize PE-to-PE
  communication as memory ops or introduce a separate op_kind
 - **Op log streaming**: Managing op_log memory usage in large-scale simulations
  (in-memory list vs disk-backed streaming)
 - **Fused operation**: Whether to record tl.composite's tiled pipeline
  (READ→COMPUTE→WRITE) as a single fused op record or separate individual ops
 - **Math op schema generalization**: The current math params have a simple structure,
  but generalization may be needed for broadcasting rules, per-input dtype, keepdims,
  scalar/immediate operands, where/mask expressions, etc.
 - **Op record identifier**: Currently dependency_ids are based on in-memory list indices;
  replacement with stable op_id is needed when introducing streaming/disk-backed mode
 - **Phase 1 materialization policy**: See Future Extension in D3.
  If allowed, the Phase 2 handling approach (skip / verify / recompute) for those ops
  needs to be defined
 ---
 ## Consequences
 ### Positive
 - Minimal impact on SimPy simulation performance (only op_log append added)
 - Free to use multi-threading/GPU in Phase 2
 - Component replaceability preserved (ADR-0015 design philosophy maintained)
 - No changes needed to benchmark user code API
 - Adding a new message type only requires setting the `data_op` flag
 - Phase 0 eliminated via greenlet — memory-data-based dynamic control flow supported
 - `tl.load()` returns actual data, making kernel debugging easier
 ### Negative
 - op_log memory usage (for large-scale simulations)
 - Phase 2 execution time is proportional to tensor size (large GEMM)
 - Dynamic branching based on pending handles (incomplete computations) not possible
  (computations execute in Phase 2, result values are undetermined in Phase 1).
  Memory-data-based branching is supported via greenlet.
 - greenlet C extension dependency added (pip install greenlet)
 ---
 ## Affected Files
 | File | Change |
 |------|--------|
 | `src/kernbench/components/base.py` | Add `_on_process_start/end` hooks |
 | `src/kernbench/common/pe_commands.py` | Add `data_op = True`, extend metadata fields |
 | `src/kernbench/sim_engine/op_log.py` | New: OpRecord, OpLogger |
 | `src/kernbench/sim_engine/data_executor.py` | New: DataExecutor, MemoryStore |
 | `src/kernbench/sim_engine/engine.py` | op_logger injection (optional) |
 | `src/kernbench/triton_emu/tl_context.py` | greenlet switch calls inside `tl.load()` etc. |
 | `src/kernbench/triton_emu/kernel_runner.py` | New: KernelRunner (greenlet ↔ SimPy bridge) |
 | `src/kernbench/components/builtin/pe_cpu.py` | Remove Phase 0, change to KernelRunner invocation |
 | `pyproject.toml` | Add greenlet dependency |
 Component implementation files (pe_gemm.py, pe_dma.py, hbm_ctrl.py, etc.): **no changes**
 Benchmark kernels (benches/*.py): **no user API changes**
@@ -0,0 +1,550 @@
 # ADR-0020: 2-Pass 데이터 실행 모델 (타이밍 / 데이터 분리)
 ## Status
 Proposed
 ## Context
 현재 시뮬레이션은 **타이밍만** 모델링한다.
 `tl.load()`, `tl.composite(op="gemm")` 등은 SimPy latency를 생성하지만,
 실제 텐서 데이터를 읽거나 연산하지 않는다.
 ### 필요한 기능
 1. HBM/TCM/SRAM에 실제 데이터를 저장하고 읽을 수 있어야 한다
 2. PE_GEMM, PE_MATH가 실제 행렬 연산을 수행하고 결과를 검증할 수 있어야 한다
 3. 시뮬레이션 성능 저하를 최소화해야 한다
 ### 기존 커널 실행 구조의 한계
 현재 커널 실행은 2단계로 분리되어 있다:
 ```
 Phase 0: TLContext에서 커널 함수 실행 → PeCommand 리스트 생성 (SimPy 밖, 데이터 없음)
 Phase 1: PE_CPU가 PeCommand 리스트를 SimPy로 replay (타이밍만)
 ```
 Phase 0에서 커널이 **전부 실행 완료**된 후에야 SimPy가 시작된다.
 `tl.load()`는 TensorHandle(placeholder)을 반환하므로 실제 데이터에 접근할 수 없다.
 따라서 데이터 값에 따른 분기(dynamic control flow)가 불가능하다.
 본 ADR은 이 한계를 **메모리 연산에 한해** 해소한다 (D1, D3 참조).
 ### 제약 조건
 - SimPy는 single-thread 이벤트 루프 — numpy matmul을 안에서 하면 전체가 block
 - 컴포넌트는 교체 가능해야 한다 (ADR-0015) — 프레임워크 요구사항이 구현에 침투하면 안 됨
 - 벤치마크 커널은 명령형 코드(tl.load → tl.composite → tl.wait) — 같은 코드를 재사용해야 함
 - 커널 함수는 plain Python function으로 유지해야 한다 (generator/async 변환 불가)
 ### 설계 탐색 결과
 | Option | 방식 | 판정 |
 |--------|------|------|
 | SimPy 내 직접 실행 | GEMM을 SimPy 안에서 numpy 호출 | 탈락: single-thread block |
 | SimPy + ThreadPool | future.submit → timeout → result() | 탈락: back-to-back 요청 시 result()에서 block |
 | Symbolic + lazy | 메타데이터만 추적, 나중에 실행 | 탈락: control-flow dependent 읽기 처리 곤란 |
 | **2-pass (채택)** | Phase 1: 타이밍, Phase 2: 데이터 | 완전 분리, 성능 영향 없음 |
 ---
 ## Decision
 ### D1. 2-Pass 실행 모델 — Phase 0 제거
 Phase 0 → Phase 1 → Phase 2의 3단계로 확장하는 대신, Phase 0을 제거하여 **2단계로 통합**한다.
 기존:
 ```
 Phase 0: 커널 → PeCommand 리스트 (데이터 없음, 분기 불가)
 Phase 1: PeCommand 리스트를 SimPy replay (타이밍만)
 ```
 변경:
 ```
 Phase 1 (타이밍): 커널 + SimPy 통합 실행 — greenlet 기반
  - 메모리 읽기/쓰기: SimPy 타이밍 + MemoryStore 실제 데이터
  - 연산 (GEMM/Math): SimPy 타이밍 + op_log 기록 (실제 연산은 Phase 2)
  - dynamic control flow 가능 (tl.load가 실제 데이터 반환)
 Phase 2 (데이터): op_log 기반 실제 연산 실행 — SimPy 외부, 병렬 가능
 ```
 본 ADR은 **메모리 연산에 한해 Phase 1을 data-aware로 확장**한다.
 Phase 1은 latency/BW 병목 분석 + 메모리 데이터 추적,
 Phase 2는 GEMM/Math 연산 정합성 검증.
 Phase 2는 optional — 타이밍만 필요하면 Phase 1만 실행.
 ### D2. Op Log 기록 — ComponentBase hook
 op_log 기록은 **컴포넌트 베이스 클래스의 hook**으로 수행한다.
 개별 컴포넌트 구현을 수정하지 않는다.
 ```python
 class ComponentBase:
    def _on_process_start(self, env, msg):
        if self._op_logger and getattr(msg, 'data_op', False):
            self._op_logger.record_start(env.now, self.node.id, msg)
    def _on_process_end(self, env, msg):
        if self._op_logger and getattr(msg, 'data_op', False):
            self._op_logger.record_end(env.now, self.node.id, msg)
 ```
 `_forward_txn()`에서 `run()` 전후로 hook을 호출한다.
 `_op_logger`는 optional — 없으면 오버헤드 제로.
 **hook 시점 정의**:
 | 시점 | 의미 |
 |------|------|
 | `t_start` | 컴포넌트가 해당 msg의 **service를 시작**한 시점 (`run()` 진입 직전) |
 | `t_end` | 컴포넌트의 **내부 service가 완료**된 시점 (`run()` 반환 직후) |
 link traversal latency는 t_start/t_end에 포함되지 않는다.
 link latency는 발신 컴포넌트의 t_end와 수신 컴포넌트의 t_start 차이로 관측된다.
 ### D3. Greenlet 기반 커널 실행 — Phase 0 제거
 기존 Phase 0 (커널 → PeCommand 리스트)를 제거하고,
 **greenlet**을 사용하여 커널과 SimPy를 협력적으로 interleave 실행한다.
 #### 동작 원리
 greenlet은 협력적 context switch를 제공하는 C 확장이다.
 커널(child greenlet)이 `tl.load()` 등을 호출하면 SimPy 루프(parent greenlet)로
 switch하여 타이밍 시뮬레이션을 수행하고, 완료 후 실제 데이터와 함께 커널로 돌아온다.
 ```
 SimPy 루프 (parent greenlet)          커널 (child greenlet)
 ─────────────────────────              ──────────────────────
 g.switch() ─────────────────────────→ 커널 시작
                                       a = tl.load(ptr, ...)
                                         내부: parent.switch(DmaReadCmd)
 cmd = DmaReadCmd ←──────────────────  (커널 일시정지)
  yield DmaReadMsg(...)
  yield env.timeout(dma_latency)
  data = memory_store.read(...)
 g.switch(data) ─────────────────────→ (커널 재개)
                                       a = data  ← 실제 numpy array
                                       if a[0][0] > 0.5:  ← 분기 가능
                                         ...
 ```
 커널은 **plain Python function**으로 유지된다.
 greenlet switch는 `tl.load()`, `tl.store()` 등의 **내부 구현에만** 존재한다.
 #### KernelRunner — 프레임워크 레이어
 greenlet 루프는 PE_CPU 컴포넌트가 아니라 프레임워크 레이어인
 **KernelRunner**에 위치한다.
 ```python
 # KernelRunner (프레임워크 — greenlet ↔ SimPy 연결)
 class KernelRunner:
    def run(self, env, kernel_fn, args, store):
        g = greenlet(self._run_kernel)
        cmd = g.switch(kernel_fn, args)
        while cmd is not None:
            if isinstance(cmd, DmaReadCmd):
                yield from self._dispatch_dma(env, cmd)
                data = store.read(cmd.src_addr, cmd.shape, cmd.dtype)
                cmd = g.switch(data)            # 실제 데이터와 함께 재개
            elif isinstance(cmd, GemmCmd):
                yield from self._dispatch_gemm(env, cmd)
                cmd = g.switch()                # 재개 (데이터 없음)
            elif isinstance(cmd, DmaWriteCmd):
                store.write(cmd.dst_addr, cmd.data)  # visibility = issue 시점
                yield from self._dispatch_dma(env, cmd)  # timing만 반영
                cmd = g.switch()
 # PE_CPU (컴포넌트 — 간단하게 유지, greenlet을 모름)
 def _execute_kernel(self, env):
    runner = KernelRunner(self.ctx)
    yield from runner.run(env, kernel_fn, args, store)
 ```
 **Op logging single source of truth**: KernelRunner는 op_log에 직접 기록하지 않는다.
 모든 op logging은 **ComponentBase hook (_on_process_start/end)만** 담당한다.
 KernelRunner가 `_dispatch_gemm()` 등으로 컴포넌트에 메시지를 전달하면,
 컴포넌트 베이스 클래스의 hook이 자동으로 기록한다.
 **레이어 분리**:
 - **커널 코드**: plain function, greenlet 존재를 모름
 - **TLContext**: `tl.load()` 내부에서 `parent.switch(cmd)` 호출
 - **KernelRunner**: greenlet ↔ SimPy 연결, MemoryStore 읽기/쓰기 처리. **logging 안 함**.
 - **ComponentBase hook**: op_log 기록의 유일한 경로
 - **PE_CPU**: KernelRunner를 호출만 함, 컴포넌트로서 교체 가능
 #### 메모리 읽기/쓰기 vs 연산의 처리 차이
 | 연산 | Phase 1에서 | Phase 2에서 |
 |------|------------|------------|
 | `tl.load()` | SimPy 타이밍 + MemoryStore read → **실제 데이터 반환** | — |
 | `tl.store()` | SimPy 타이밍 + MemoryStore write → **실제 기록** | — |
 | `tl.composite(gemm)` | SimPy 타이밍 + **op_log 기록만** | numpy 실제 연산 |
 | `tl.dot()` / math ops | SimPy 타이밍 + **op_log 기록만** | numpy 실제 연산 |
 메모리 읽기/쓰기는 Phase 1에서 즉시 처리 (numpy slice, 빠름).
 GEMM/Math 연산은 Phase 2에서 batch 실행 (성능 분리).
 #### Store Visibility Rule
 `tl.store()`는 **issue 시점에 MemoryStore에 즉시 반영**된다 (visibility = issue).
 SimPy DMA 타이밍은 이후 별도로 시뮬레이션된다.
 이는 timing과 visibility를 의도적으로 분리한 것이다:
 - **visibility**: MemoryStore에 반영되는 시점 = `store.write()` 호출 시
 - **timing**: SimPy에서 DMA latency가 완료되는 시점
 이 분리로 dynamic control flow에서 store 직후 load가 최신 데이터를 볼 수 있다.
 #### Result Handle Semantics
 `tl.composite()`(sync/async)는 결과 tensor를 참조하는 **handle**을 반환한다.
 Phase 1에서의 핵심 계약:
 1. **모든 compute handle은 Phase 1에서 항상 pending 상태로 간주한다.**
 2. `tl.wait(handle)`은 **timing synchronization만 표현**하며,
   handle을 ready로 만들지 않는다.
 3. handle의 실제 결과 데이터 접근(`handle.data`, element access,
   numpy conversion 등)은 **Phase 2에서만 가능**하다.
 4. 따라서 Phase 1에서 **compute-result 기반 control flow는 지원하지 않는다.**
 5. 반면 `tl.load()`는 Phase 1에서 실제 데이터를 반환하므로,
   **memory-read 기반 control flow는 지원 가능**하다.
 | handle 상태 | Phase | 허용 동작 |
 |------------|-------|----------|
 | pending | Phase 1 | `tl.wait(handle)` — timing 동기화만 |
 | pending | Phase 1 | handle을 `tl.store()`의 대상으로 전달 (logical destination 연결만, payload는 Phase 2) |
 | pending | Phase 1 | **데이터 접근 불가** — 값 기반 분기 불가 |
 | ready | Phase 2 | 실제 numpy 데이터 접근, 검증 |
 이 제약은 의도적이다. Phase 1에서 연산을 실행하면 SimPy single-thread가
 block되어 2-pass 분리의 존재 이유가 사라진다.
 #### Phase 1 Materialization — Future Extension
 향후 소형 연산(scalar, 작은 reduction)에 대해 Phase 1 eager execution이
 필요한 경우, `materialized_in_phase1: bool` 플래그를 op record에 추가하여
 선택적 materialization을 지원할 수 있다. 현재 범위에서는 구현하지 않는다.
 ### D4. data_op 플래그 — 메시지 자기 선언
 로깅 대상은 메시지 타입이 아니라 메시지 인스턴스의 `data_op` 속성으로 결정한다.
 프레임워크가 메시지 타입을 하드코딩하지 않는다.
 ```python
 class MsgBase:
    data_op: bool = False       # 기본: 로깅 안 함
 class DmaReadCmd(MsgBase):
    data_op = True              # 메모리 이동 → 로깅
 class GemmCmd(MsgBase):
    data_op = True              # 연산 → 로깅
 class MathCmd(MsgBase):
    data_op = True              # 연산 → 로깅
 ```
 새 메시지 타입(예: IpcqMsg) 추가 시 `data_op = True`만 설정하면
 프레임워크 코드 수정 없이 자동 로깅된다.
 ### D5. Op Log 구조
 #### op 분류 체계
 2단계로 분류한다:
 | 레벨 | 필드 | 역할 |
 |------|------|------|
 | `op_kind` | `memory` \| `gemm` \| `math` | executor dispatch 기준 |
 | `op_name` | `dma_read` \| `dma_write` \| `gemm_f16` \| `exp` \| `add` \| `sum` 등 | 구체 연산 식별 |
 #### OpRecord 정의
 ```python
@dataclass
 class OpRecord:
    t_start: float              # SimPy 시각 (ns) — service 시작
    t_end: float                # SimPy 시각 (ns) — service 완료
    component_id: str           # e.g. "sip0.cube0.pe0.pe_gemm"
    op_kind: str                # "memory" | "gemm" | "math"
    op_name: str                # 구체 연산명
    params: dict                # 연산별 파라미터 (아래 참조)
    dependency_ids: list[int]   # 현재는 in-memory record index 기반, 향후 stable op_id로 대체 가능
 ```
 #### dependency_ids 생성 규칙
 `dependency_ids`는 **optional**이며, 기본적으로 executor는
 주소 기반 dependency 추론을 수행한다 (D6 참조).
 정확한 실행 순서가 필요한 경우에만 명시적으로 설정한다:
 - **기본 (address-based inference)**: executor가 read/write set을 분석하여
  RAW/WAW/WAR 의존성을 자동 추론. 대부분의 경우 이것으로 충분.
 - **명시적 설정**: TLContext 또는 command 생성 단계에서 logical dependency가
  주소로 표현되지 않는 경우에 설정.
  예: completion handle 기반 동기화 — handle dependency는 메모리 주소가 아니라
  논리적 완료 순서에 의존하므로 address inference로 잡히지 않는다.
 #### op_log ordering
 op_log는 `t_start` 기준으로 **stable ordering**을 유지한다.
 동일 `t_start`의 record들은 insertion order를 보존한다.
 #### params 상세
 **memory (dma_read / dma_write)**:
 ```python
 {
    "src_addr": int,            # source 주소 (byte)
    "dst_addr": int,            # destination 주소 (byte)
    "nbytes": int,              # 전송 크기
    "src_space": str,           # "hbm" | "tcm" | "sram"
    "dst_space": str,           # "hbm" | "tcm" | "sram"
 }
 ```
 **gemm**:
 ```python
 {
    "src_a_addr": int,          # operand A 주소
    "src_b_addr": int,          # operand B 주소
    "dst_addr": int,            # output 주소
    "shape_a": tuple,           # e.g. (128, 256)
    "shape_b": tuple,           # e.g. (256, 128)
    "shape_out": tuple,         # e.g. (128, 128)
    "dtype_in": str,            # e.g. "f16"
    "dtype_acc": str,           # accumulation dtype, e.g. "f32"
    "dtype_out": str,           # output dtype, e.g. "f16"
    "transpose_a": bool,
    "transpose_b": bool,
    "layout_a": str,            # "row_major" | "col_major"
    "layout_b": str,
    "layout_out": str,
    "addr_space": str,          # "tcm" (GEMM operand는 항상 TCM)
 }
 ```
 **math**:
 ```python
 {
    "op": str,                  # "exp" | "add" | "sum" | "where" | ...
    "input_addrs": list[int],   # operand 주소 목록
    "input_shapes": list[tuple],
    "dst_addr": int,
    "shape_out": tuple,
    "dtype": str,
    "axis": int | None,         # reduction axis
    "addr_space": str,          # "tcm"
 }
 ```
 ### D6. Phase 2 Executor
 Phase 2는 SimPy 밖에서 op_log를 실행한다.
 ```python
 class DataExecutor:
    def __init__(self, op_log: list[OpRecord], initial_store: MemoryStore):
        self.store = initial_store  # Phase 1의 MemoryStore snapshot을 입력으로 받는다
    def run(self):
        for t, ops in groupby(op_log, key=lambda o: o.t_start):
            batch = list(ops)
            independent, sequential = self._classify(batch)
            self._execute_parallel(independent)
            self._execute_sequential(sequential)
 ```
 **병렬 실행 판정**:
 같은 `t_start`의 op들은 **병렬 후보**로 간주한다.
 실제 병렬 실행 여부는 executor가 다음 기준으로 판정한다:
 - read/write 주소 범위 겹침 여부 (WAW, RAW, WAR 충돌 검사)
 - `dependency_ids`에 명시된 선행 op 완료 여부
 주소 범위가 겹치지 않고 명시적 의존성이 없는 op들만 병렬 실행한다.
 **배치 최적화**: 동일 op_name이며 **shape, dtype, layout, transpose flag가
 모두 동일한** 독립 op들만 batching 대상이 된다.
 예: 여러 PE의 동일 shape GEMM → `np.matmul(a_batch, b_batch)` 한 번으로 묶음.
 CPU에서도 BLAS 효율 향상, GPU에서는 launch overhead 절감.
 **Phase 2 실행 순서 보장**:
 Phase 2는 데이터 도착 시점을 고려하지 않으며,
 dependency (주소 기반 추론 + 명시적 dependency_ids)를 통해서만
 실행 순서를 보장한다.
 ### D7. Memory Store
 `MemoryStore`는 논리적으로 byte-addressable semantics를 따르며,
 현재 구현은 **tensor-granular storage** (addr → numpy ndarray 매핑)를 사용한다.
 ```python
 class MemoryStore:
    def write(self, space: str, addr: int, data: np.ndarray) -> None: ...
    def read(self, space: str, addr: int, shape: tuple, dtype: str) -> np.ndarray: ...
 ```
 **내부 저장 포맷: numpy ndarray**
 MemoryStore는 텐서를 **numpy ndarray**로 저장한다.
 | 후보 | store/load 속도 | Phase 2 연산 | 판정 |
 |------|----------------|-------------|------|
 | **numpy ndarray** | 즉시 (참조 전달, 복사 없음) | `np.matmul` 바로 사용 | **채택** |
 | bytearray | memcpy 필요 | `np.frombuffer` 변환 필요 | 탈락 |
 | torch tensor | 즉시 | torch 연산 가능 | GPU 최적화 시만 사용 |
 - write: numpy array를 **참조 저장** (복사 없음) → Phase 1 오버헤드 = dict lookup 1회
 - read: numpy array를 **참조 반환** (복사 없음)
 - 동일 addr에 재 write 시 기존 array를 **tensor 단위로 덮어쓴다** (partial overwrite 미지원)
 - dtype은 numpy dtype을 사용한다 (`np.float16`, `np.float32` 등). NumPy에는 내장 `bfloat16`이 없으므로 bf16은 확장 dtype(예: `ml_dtypes.bfloat16`)이 필요하다
 - byte-level access가 필요한 경우 `.view(np.uint8)`로 변환
 - Phase 2에서 GPU batch 최적화 시 numpy → torch tensor 변환은 executor가 담당
 **read/write contract**:
 - read/write는 **contiguous tensor** 기준이다.
  non-contiguous stride view가 필요한 경우 별도 copy op으로 표현한다.
 - 일반 benchmark path에서는 producer/consumer dtype 일치를 기대한다.
  reinterpret cast는 low-level memory validation 또는 특수 테스트 케이스를 위한
  permissive behavior이다.
 - addr은 byte-aligned이며, 최소 alignment = dtype 크기.
 - dtype mismatch (write와 다른 dtype으로 read)는 reinterpret cast로 처리한다.
  요청한 shape이 저장된 shape과 다르면 nbytes 기준으로 검증하며, nbytes가 일치하지 않으면 error를 발생시킨다.
 - 정합성 기준은 주소 범위 기반 read/write semantics를 따른다.
 - 구현 최적화로 tensor object cache를 둘 수 있지만,
  canonical state는 byte-addressable storage이다.
 - deploy 시점에 호스트가 초기 텐서 데이터를 주입한다.
 ### D8. 벤치마크 커널 코드
 벤치마크의 **사용자 코드 API는 변경하지 않는다**.
 `tl.load()`, `tl.composite()`, `tl.store()` 등의 호출 인터페이스는 유지.
 단, 내부 command/message schema는 Phase 2 실행에 필요한 metadata를
 포함하도록 확장될 수 있다 (예: dtype_acc, transpose 등 추가 필드).
 ### D9. 컴포넌트 변경 없음
 개별 컴포넌트 구현(PE_GEMM, PE_DMA, HBM_CTRL 등)은 수정하지 않는다.
 op_log 기록은 ComponentBase hook의 책임이다.
 커스텀 컴포넌트 교체 시 타이밍 모델만 교체되며,
 Phase 2 데이터 실행은 영향받지 않는다.
 ### D10. Phase 2는 Optional
 ```python
 engine = GraphEngine(graph)
 engine.run(benchmark)                       # Phase 1: 타이밍만
 result = engine.get_timing_result()
 if verify_data:
    executor = DataExecutor(engine.op_log)  # Phase 2: 데이터
    executor.run()
    executor.verify(expected_output)
 ```
 타이밍 분석만 필요하면 Phase 2를 건너뛴다.
 op_logger를 비활성화하면 Phase 1 성능도 기존과 동일.
 ### D11. Verification Contract
 기본 검증은 **최종 output tensor**를 reference backend(numpy)와 비교한다.
 dtype별 tolerance 정책:
 | dtype | 비교 방식 | tolerance |
 |-------|----------|-----------|
 | f32 | `np.allclose` | rtol=1e-5, atol=1e-5 |
 | f16 | `np.allclose` | rtol=1e-3, atol=1e-3 |
 | bf16 | `np.allclose` | rtol=1e-2, atol=1e-2 |
 | int 계열 | `np.array_equal` | exact |
 - 기본 모드: 최종 output만 비교 (end-to-end correctness)
 - 디버그 모드: intermediate tensor도 op 단위로 비교 가능
  (MemoryStore snapshot at each op boundary)
 ---
 ## Non-goals
 - **Compute-result-based control flow**: 지원하지 않는다.
  모든 compute handle은 Phase 1에서 pending 상태이며,
  `wait()`는 timing synchronization만 표현하고 data readiness를 의미하지 않는다.
  Phase 1에서 `handle.data` 접근, element access, truth-value evaluation은
  **error로 처리**한다.
  메모리 데이터 기반 분기(`tl.load()` 결과)는 greenlet으로 지원된다.
  Phase 1 materialization은 future extension (D3 참조).
 - **Cycle-accurate overlap reconstruction**: Phase 2에서 Phase 1의 실행 시간
  overlap을 정확히 재현하지 않는다. Phase 2는 데이터 정합성만 검증한다.
 - **GPU kernel compilation**: Phase 2의 GEMM/Math는 numpy/torch 호출이며,
  실제 하드웨어 PE의 마이크로아키텍처를 재현하지 않는다.
 ## Open Questions
 - **Aliasing / slice view**: 동일 backing storage를 참조하는 slice/view를
  MemoryStore에서 어떻게 표현할지 (stride-based view vs copy semantics)
 - **IPCQ/descriptor read 일반화**: PE-to-PE 통신을 memory op으로 완전히
  일반화할지, 별도 op_kind를 둘지
 - **Op log streaming**: 대규모 시뮬레이션에서 op_log 메모리 사용량 관리
  (in-memory list vs disk-backed streaming)
 - **Fused operation**: tl.composite의 tiled pipeline (READ→COMPUTE→WRITE)을
  하나의 fused op record로 기록할지, 개별 op으로 분리할지
 - **Math op schema 일반화**: 현재 math params는 단순 구조이나,
  broadcasting rule, input별 dtype, keepdims, scalar/immediate operand,
  where/mask 표현 등 일반화가 필요할 수 있음
 - **Op record 식별자**: 현재 dependency_ids는 in-memory list index 기반이며,
  streaming/disk-backed mode 도입 시 stable op_id로 대체 필요
 - **Phase 1 materialization policy**: D3의 Future Extension 참조.
  허용 시 해당 op의 Phase 2 처리 방식 (skip / verify / recompute) 정의 필요
 ---
 ## Consequences
 ### 긍정적
 - SimPy 시뮬레이션 성능 영향 최소 (op_log append만 추가)
 - Phase 2에서 멀티스레드/GPU 자유롭게 사용 가능
 - 컴포넌트 교체 자유도 유지 (ADR-0015 설계 철학 보존)
 - 벤치마크 사용자 코드 API 변경 불필요
 - 새 메시지 타입 추가 시 data_op 플래그만 설정
 - greenlet으로 Phase 0 제거 — 메모리 데이터 기반 dynamic control flow 지원
 - `tl.load()`가 실제 데이터를 반환하므로 커널 디버깅 용이
 ### 부정적
 - op_log 메모리 사용량 (대규모 시뮬레이션 시)
 - Phase 2 실행 시간은 텐서 크기에 비례 (대형 GEMM)
 - pending handle (연산 미완료) 기반 동적 분기 불가
  (연산은 Phase 2에서 실행, Phase 1에서 결과 값 미확정).
  메모리 데이터 기반 분기는 greenlet으로 지원된다.
 - greenlet C 확장 의존성 추가 (pip install greenlet)
 ---
 ## 영향받는 파일
 | 파일 | 변경 |
 |------|------|
 | `src/kernbench/components/base.py` | `_on_process_start/end` hook 추가 |
 | `src/kernbench/common/pe_commands.py` | `data_op = True` 추가, metadata 필드 확장 |
 | `src/kernbench/sim_engine/op_log.py` | 신규: OpRecord, OpLogger |
 | `src/kernbench/sim_engine/data_executor.py` | 신규: DataExecutor, MemoryStore |
 | `src/kernbench/sim_engine/engine.py` | op_logger 주입 (optional) |
 | `src/kernbench/triton_emu/tl_context.py` | `tl.load()` 등 내부에서 greenlet switch 호출 |
 | `src/kernbench/triton_emu/kernel_runner.py` | 신규: KernelRunner (greenlet ↔ SimPy 연결) |
 | `src/kernbench/components/builtin/pe_cpu.py` | Phase 0 제거, KernelRunner 호출로 변경 |
 | `pyproject.toml` | greenlet 의존성 추가 |
 컴포넌트 구현 파일 (pe_gemm.py, pe_dma.py, hbm_ctrl.py 등): **변경 없음**
 벤치마크 커널 (benches/*.py): **사용자 API 변경 없음**
@@ -0,0 +1,537 @@
 # ADR-0021: PE Pipeline Refactoring — Component Separation + Scheduler-Based Routing
 ## Status
 Proposed
 ## Context
 ### Problems with the Current Structure
 pe_accel (SchedulerV2Component) hides 5 hardware blocks (DmaIn, DmaWb, Gemm, Math, Tcm)
 **inside a single component**.
 ```
 SchedulerV2Component (single topology node)
 ├── DmaInBlock     ← directly connected via internal SimPy Store
 ├── DmaWbBlock     ← not visible in topology
 ├── GemmBlock      ← not replaceable
 ├── MathBlock      ← not replaceable
 └── TcmBlock       ← not replaceable
 ```
 Problems:
 - Blocks directly reference the next block via `desc.next_block` — hardcoded routing
 - Individual blocks cannot be replaced (violates ADR-0015 component replacement principle)
 - PE internal structure is not visible in the topology
 - GemmBlock and MathBlock each duplicate TCM load/store logic
 ### Actual Hardware Structure
 ```
 HBM ←(DMA)→ TCM ←(Fetch/Store Unit)→ Register File ←→ GEMM/MATH Engine
 ```
 - DMA: HBM ↔ TCM transfer (via fabric, tens to hundreds of ns)
 - Fetch/Store Unit: TCM ↔ Register File transfer (BW-based, a few ns)
 - GEMM/MATH Engine: computation between Register Files (cycle-accurate)
 - Completion signal: PE-internal 1-cycle wire signal (done pin assert)
 ---
 ## Decision
 ### D1. Separate Each Block into an Independent Component
 The internal blocks of pe_accel are separated into **independent PeEngineBase components**.
 Existing 5 blocks + 1 Fetch/Store Unit = 6 components.
 | Component | Role | HW Correspondence |
 |-----------|------|-------------------|
 | PE_SCHEDULER | Plan generation, tile state management, stage routing | Scheduler/Sequencer |
 | PE_DMA | HBM ↔ TCM (via fabric) | DMA Engine |
 | PE_FETCH_STORE | TCM ↔ Register File | Load/Store Unit |
 | PE_GEMM | MAC compute (register only) | MAC Array |
 | PE_MATH | Element-wise/reduction (register only) | SIMD/Vector Unit |
 | PE_TCM | BW-serialized scratchpad | SRAM Bank |
 Each component exists as a topology node and is connected via ports/wires.
 Replacing the `impl` allows changing the timing model of an individual block.
 ### D2. Token Self-Routing — Scheduler Handles Only Dispatch + Completion
 **Components do not pass through the scheduler at every stage.**
 The token carries a plan so that components chain directly to the next stage.
 ```
 Scheduler → DMA → Fetch → GEMM → Math → Store → DMA_WB → (done) → Scheduler
              ↑ chaining: does not go through scheduler          completion only
 ```
 This matches the actual HW structure where each block's done signal is directly
 connected to the next block via wire. The scheduler is responsible **only for
 initial dispatch + completion aggregation**.
 #### Stage Definition
 ```python
 class StageType(Enum):
    DMA_READ = 0
    FETCH = 1
    GEMM = 2
    MATH = 3
    STORE = 4
    DMA_WRITE = 5
 ```
 #### Plan Structure
 When the scheduler receives a CompositeCmd, it generates a **per-tile execution plan**.
 The plan defines the **stage sequence** for each tile:
 ```python
@dataclass
 class Stage:
    stage_type: StageType
    component: str       # topology node ID (e.g. "sip0.cube0.pe0.pe_dma")
    params: dict         # per-stage parameters (dynamic)
@dataclass(frozen=True)
 class TilePlan:
    tile_id: int
    stages: tuple[Stage, ...]  # list of stages to execute in order (immutable)
 ```
 The stage sequence varies depending on the plan:
 ```python
 # Normal GEMM: HBM → TCM → Register → Compute → Register → TCM → HBM
 stages = (DMA_READ, FETCH, GEMM, STORE, DMA_WRITE)
 # GEMM directly from TCM data (skip DMA read):
 stages = (FETCH, GEMM, STORE, DMA_WRITE)
 # MATH element-wise:
 stages = (DMA_READ, FETCH, MATH, STORE, DMA_WRITE)
 # GEMM + accumulation (intermediate K-tile, skip writeback):
 stages = (DMA_READ, FETCH, GEMM, STORE)  # store to TCM only
 ```
 **Components do not hardcode the next component.**
 They read the next stage from the token's plan and forward it directly via out_port.
 This is the same pattern as a network packet carrying a routing header.
 #### Pipeline Context
 ```python
@dataclass
 class PipelineContext:
    id: str
    total_tiles: int
    completed_tiles: int = 0
    done_event: simpy.Event = None  # succeeds when all tiles are complete
    def complete_tile(self) -> None:
        self.completed_tiles += 1
        if self.completed_tiles == self.total_tiles:
            self.done_event.succeed()
 ```
 **Completion follows an exactly-once contract**: the last stage of each tile must call
 `complete_tile()` exactly once. Duplicate calls are a bug, and `done_event` must
 succeed only once (SimPy Event constraint).
 #### Scheduler Role (Reduced)
 When the scheduler receives a CompositeCmd, it creates a plan and PipelineContext,
 enqueues them into the scheduler's internal `_pending_feeds` FIFO, and returns immediately.
 Actual tile injection is handled by a **single feeder process** (`_feed_loop`).
 This feeder consumes `_pending_feeds` in FIFO order and
 **does not allow tile feed interleaving across composite commands.**
 That is, the feed for the next command begins only after all tiles of the current
 command have been injected into the first stage queue.
 There is **exactly one `_feed_loop`** per scheduler, and
 tile feed for composite commands is performed exclusively through this single process.
 Command issue order refers to **the order in which PE_SCHEDULER receives PeInternalTxn**.
 This structure maintains command issue order while ensuring that when the first stage
 queue is full, only the feeder process blocks — the scheduler worker's inbox processing
 itself does not stall.
 ```python
 class PeSchedulerV2(PeEngineBase):
    _pipelines: dict[str, PipelineContext]
    _pending_feeds: simpy.Store   # FIFO of (plan, ctx)
    def start(self, env):
        super().start(env)
        self._pending_feeds = simpy.Store(env)
        env.process(self._feed_loop(env))
    def _dispatch_composite(self, env, pe_txn, cmd):
        plan = generate_plan(cmd)
        ctx = PipelineContext(
            id=next_id(),
            total_tiles=len(plan.tiles),
            done_event=pe_txn.done,
        )
        self._pipelines[ctx.id] = ctx
        # only enqueue to feeder queue and return immediately
        yield self._pending_feeds.put((plan, ctx))
    def _feed_loop(self, env):
        """Single feeder process: feeds composite commands in FIFO order.
        Tile feed interleaving across composite commands is not allowed.
        The feed for the next command begins only after all tiles of the
        current command have been injected into the first stage queue.
        When the first stage queue is full, only this feeder blocks;
        the scheduler worker's inbox processing does not stall.
        """
        while True:
            plan, ctx = yield self._pending_feeds.get()
            for tile in plan.tiles:
                token = TileToken(
                    tile_id=tile.tile_id,
                    pipeline_ctx=ctx,
                    plan=tile,
                    stage_idx=0,
                    params=tile.stages[0].params,
                )
                yield self.out_ports[tile.stages[0].component].put(token)
                # queue capacity = HW queue depth → feeder blocks only when full
 ```
 In this ADR, the scheduler can accept multiple composite commands,
 but tile submission order follows per-command FIFO.
 Within a command, tile-level pipeline overlap is allowed,
 but tile feed interleaving across commands is not.
 ### D3. Data Transfer vs. Completion Signal — HW Modeling Criteria
 | Communication Type | Method | HW Correspondence |
 |-------------------|--------|-------------------|
 | Tile token (work directive) | message via out_port | enqueue to command queue |
 | Stage completion → next stage | component directly calls out_port.put | done-triggered local enqueue |
 | Pipeline completion → scheduler | PipelineContext.complete_tile() | completion interrupt |
 **Tile token**: uses out_port.put(). SimPy Store capacity = HW queue depth.
 **Intra-PE chaining latency**: within the scope of this ADR, no explicit latency model
 is applied to intra-PE stage triggers. Chaining between components corresponds to
 PE-internal wires, and since there is no scheduler round-trip, no artificial hop cost
 is incurred.
 **Pipeline completion**: the component at the last stage calls `pipeline_ctx.complete_tile()`.
 When all tiles are complete, PipelineContext calls done_event.succeed().
 ### D4. Asynchronous Pipeline — Natural Overlap
 The scheduler processes CompositeCmds **asynchronously**.
 However, tile feed does not spawn an independent process per command; instead,
 the scheduler's internal **single feeder process** performs the feed in FIFO order.
 Therefore, the scheduler can continue to receive the next command,
 but the first-stage tile injection order is guaranteed per command.
 Since **SimPy Store capacity = HW queue depth**:
 - When the queue is full, put() naturally blocks (backpressure)
 - While DMA is processing tile 1, FETCH/GEMM can already work on tile 0, whose DMA read has completed
 - When a second CompositeCmd arrives, its tiles are fed into the DMA queue directly after the first command's tiles (FIFO order, no cross-command interleaving)
 ```
 First-stage feed order (feeder → DMA queue):
  [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN] | [cmd2:t0][cmd2:t1]...
                                            ↑ cmd2 starts after cmd1 feed completes
 Runtime pipeline (downstream overlap):
  PE_DMA:    [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN][cmd2:t0][cmd2:t1]...
  PE_FETCH:          [cmd1:t0][cmd1:t1]...
  PE_GEMM:                   [cmd1:t0][cmd1:t1]...
                              ↑ pipeline overlap within the same command
 ```
 Here, the overlap does not come from tile feed interleaving across different commands,
 but occurs naturally as tiles from earlier commands progress to downstream stages
 while the feeder continues injecting subsequent tiles.
 For example, tile feed for cmd2 does not start until all tiles of cmd1 have been
 injected into the first stage queue. However, while cmd1.tile0 has already progressed
 to GEMM, cmd1.tile1 and cmd1.tile2 may still remain in DMA/FETCH, so
 **pipeline overlap within the same command occurs naturally**.
 #### Component Chaining Pattern
 All components follow the same pattern:
 ```python
 def _pipeline_worker(self, env):
    while True:
        token = yield self._inbox.get()
        # process own stage
        yield from self._process(env, token)
        # chain to next stage (read from plan)
        next_idx = token.stage_idx + 1
        if next_idx < len(token.plan.stages):
            next_stage = token.plan.stages[next_idx]
            token.stage_idx = next_idx
            token.params = next_stage.params
            yield self.out_ports[next_stage.component].put(token)
        else:
            # last stage — pipeline completion
            token.pipeline_ctx.complete_tile()
 ```
 ### D5. PE_FETCH_STORE — Dedicated TCM ↔ Register File Transfer
 Previously, GemmBlock and MathBlock each implemented their own TCM read/write.
 This is separated into a **PE_FETCH_STORE component**.
 ```python
 # PE_FETCH_STORE._process()
 def _process(self, env, token):
    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
    yield tcm_done
    # chaining is handled by the base class (D4 pattern)
 ```
 Advantages:
 - GEMM/MATH perform **pure compute only** — no TCM access logic
 - Fetch/store BW contention is naturally modeled (serialization via PE_TCM resource)
 - Prefetch strategies can be experimented with by replacing the fetch unit alone
 ### D6. Simplification of Each Compute Component
 GEMM/MATH perform compute only with register data already prepared.
 **Chaining follows the common pattern (D4), so only _process() needs to be implemented:**
 ```python
 # PE_GEMM._process()
 def _process(self, env, token):
    yield env.timeout(self._mac_latency(token.params))
 # PE_MATH._process()
 def _process(self, env, token):
    yield env.timeout(self._simd_latency(token.params))
 # PE_FETCH_STORE._process()
 def _process(self, env, token):
    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
    yield tcm_done
 # PE_DMA._process()
 def _process(self, env, token):
    yield from self._do_fabric_dma(token.params)
 ```
 By replacing only the timing model, one can freely switch between cycle-accurate
 and analytical models. Since the chaining logic resides in the base class,
 each component only implements its pure stage logic.
 ### D7. Topology Changes
 Add PE_FETCH_STORE to the PE template:
 ```yaml
 pe_template:
  components:
    pe_cpu:         { kind: pe_cpu,         impl: pe_cpu_v1, ... }
    pe_scheduler:   { kind: pe_scheduler,   impl: pe_scheduler_v2, ... }
    pe_dma:         { kind: pe_dma,         impl: pe_dma_v1, ... }
    pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, ... }
    pe_gemm:        { kind: pe_gemm,        impl: pe_gemm_v1, ... }
    pe_math:        { kind: pe_math,        impl: pe_math_v1, ... }
    pe_mmu:         { kind: pe_mmu,         impl: pe_mmu_v1, ... }
    pe_tcm:         { kind: pe_tcm,         impl: pe_tcm_v1, ... }
  links:
    # existing links...
    fetch_store_to_tcm_bw_gbs: 512.0
    fetch_store_to_tcm_mm: 0.0
 ```
 PE internal edge connections:
 ```
 PE_SCHEDULER → PE_DMA (initial dispatch)
 PE_SCHEDULER → PE_FETCH_STORE (initial dispatch)
 PE_SCHEDULER → PE_GEMM (initial dispatch)
 PE_SCHEDULER → PE_MATH (initial dispatch)
 PE_DMA → PE_FETCH_STORE (chaining)
 PE_FETCH_STORE → PE_GEMM (chaining)
 PE_FETCH_STORE → PE_MATH (chaining)
 PE_GEMM → PE_FETCH_STORE (store chaining)
 PE_MATH → PE_FETCH_STORE (store chaining)
 PE_FETCH_STORE → PE_DMA (writeback chaining)
 PE_FETCH_STORE → PE_TCM (BW request)
 ```
 Topology edges cover both **control/dispatch visibility and runtime chaining**.
 Scheduler → sub-component edges are initial dispatch paths, while
 inter-component edges are runtime chaining paths driven by token self-routing.
 ### D8. Existing Code Migration — Builtin Integration
 The existing builtin v1 components and pe_accel are **replaced with new builtin components**.
 #### Migration Strategy
 1. Back up existing `components/builtin/` → `components/builtin_legacy/` (preserved without modification)
 2. Back up existing `components/custom/pe_accel/` in the same way (preserved without modification)
 3. Re-implement new `components/builtin/` with the ADR-0021 architecture
 4. Maintain **only one** topology.yaml (including pe_fetch_store)
 5. components.yaml points to the new builtin
 ```yaml
 # components.yaml — new builtin
 pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
 pe_gemm_v1:      kernbench.components.builtin.pe_gemm:PeGemmComponent
 pe_math_v1:      kernbench.components.builtin.pe_math:PeMathComponent
 pe_dma_v1:       kernbench.components.builtin.pe_dma:PeDmaComponent
 pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
 pe_tcm_v1:       kernbench.components.builtin.pe_tcm:PeTcmComponent
 ```
 The impl names (pe_gemm_v1, etc.) are preserved, but **the implementations are replaced
 with the ADR-0021 architecture**. Existing benchmarks and tests referencing topology.yaml
 continue to work without changes.
 #### Latency Model Inheritance
 The latency modeling of the new builtin components (MAC cycle calculation, SIMD latency,
 TCM BW serialization, DMA fabric latency, etc.) is **based on the current pe_accel
 implementation**. The tile schedule generation logic from tiling.py is also carried over.
 Only the architecture (component separation, self-routing) changes; timing accuracy
 is preserved.
 #### Test Strategy and Plan
 **1. Existing test pass** (regression):
 After migration is complete, all existing tests (366) must pass.
 **2. Latency regression**:
 Verify that the new builtin produces identical latency for the same inputs as pe_accel.
 **3. Phase 1 → Phase 2 end-to-end**:
 Integration test from SimPy simulation (Phase 1) op_log generation → DataExecutor
 (Phase 2) actual numpy computation → result correctness verification.
 - GEMM: tl.composite(gemm) → op_log → Phase 2 matmul → allclose verification
 - MATH: tl.exp / tl.add, etc. → op_log → Phase 2 numpy op → allclose verification
 - Chaining: GEMM output → MATH input → final result end-to-end verification
 **4. TileToken self-routing**:
 - Verify that tiles chain according to the plan's stage sequence
 - Verify PipelineContext.complete_tile() exactly-once at the last stage
 - Queue backpressure: verify that only the feeder blocks when DMA queue capacity is exceeded
 **5. Asynchronous pipeline overlap**:
 - Verify that inter-tile stage overlap occurs within the same command (tile0 in GEMM while tile1 in DMA)
 - Multiple commands: verify that cmd2 feed starts after cmd1 feed completes (FIFO order)
 ### D9. TileToken Message Definition
 A message used for passing tile work between components.
 The token carries the plan and stage index, enabling self-routing.
 ```python
@dataclass
 class TileToken:
    tile_id: int
    pipeline_ctx: PipelineContext    # completion tracking
    plan: TilePlan                   # full stage sequence for this tile (immutable)
    stage_idx: int                   # current stage index in plan.stages
    params: dict                     # current stage parameter cache (canonical: plan.stages[stage_idx].params)
    data_op: bool = True             # op_log recording target (ADR-0020)
 ```
 A TileToken is **owned by exactly one component at a time** and
 is never referenced by multiple components simultaneously (single-owner).
 Token lifecycle:
 1. Scheduler creates it with stage_idx=0 and puts it to the first stage component
 2. The component executes _process(), increments stage_idx, and puts it to the next component
 3. The last stage component calls pipeline_ctx.complete_tile()
 4. When all tiles are complete, PipelineContext calls done_event.succeed()
 Relationship with existing PeInternalTxn:
 - PeInternalTxn: command transfer between PE_CPU → PE_SCHEDULER (existing, unchanged)
 - TileToken: per-tile work transfer from PE_SCHEDULER → sub-components (new, self-routing)
 ---
 ## Non-goals
 - **PE_CPU changes**: the PE_CPU → PE_SCHEDULER interface is not modified
  (PeInternalTxn-based, ADR-0014 maintained)
 - **Resource contention model across multiple pipelines**: the current scope focuses on
  accurate modeling of a single pipeline. TCM bank conflicts across multiple pipelines
  are future work.
 - **builtin_legacy maintenance**: kept for backup purposes only; not a target for
  bug fixes or feature additions.
 ## Open Questions
 - **Register File capacity model**: whether to model capacity limits when the fetch unit
  loads into registers. Capacity is expressed in bytes (register_file_bytes), and
  the number of tiles that can be held simultaneously is determined by tile size.
  When capacity is exceeded, fetch stalls, creating natural backpressure.
 - **Prefetch strategy**: this ADR does not allow tile feed interleaving across composite
  commands. Therefore, overlap arises not from pre-injection across commands, but
  naturally from pipeline progression of tiles within the same command.
  If additional prefetch is needed, it should be considered at the level of tile ordering
  within the same command or fetch/store unit policy, not cross-command injection.
 - **PE_DMA coalescing**: per-tile DMA may cause fragmentation.
  Direction is to merge/coalesce within DMA without scheduler involvement.
 - **Synchronous execution mode**: this ADR adopts asynchronous pipeline as the
  default/sole execution model. If a sync mode is needed for debug or validation
  purposes, it will be considered in a future ADR.
 - **TCM bank conflict across multiple pipelines**: currently based on a single pipeline.
  Bank conflict modeling when multiple pipelines simultaneously access TCM is future work.
 ---
 ## Consequences
 ### Positive
 - Each block is an independent component — individually replaceable (ADR-0015 compliant)
 - PE internal structure is visible in the topology
 - Components do not know the next component — plan-based routing provides flexibility
 - Natural pipeline overlap between DMA and compute (SimPy Store backpressure)
 - Improved HW modeling accuracy (done signal = Event, data transfer = message)
 - Fetch/store separation enables accurate TCM BW contention modeling
 ### Negative
 - Increased number of PE internal components (5 → 6) — more topology nodes/edges
 - Component separation makes intra-PE token forwarding more explicit than before
 - Breaking change from existing builtin/pe_accel — migration required
 ---
 ## Affected Files
 | File | Change |
 |------|--------|
 | `topology.yaml` | Add pe_fetch_store component, add chaining edges |
 | `components.yaml` | Register new builtin components |
 | `src/kernbench/topology/builder.py` | Add fetch_store + chaining edges to PE internal edges |
 | `src/kernbench/common/pe_commands.py` | Add TileToken definition |
 | `src/kernbench/components/builtin/pe_scheduler.py` | Re-implement (feeder + plan-based dispatch) |
 | `src/kernbench/components/builtin/pe_gemm.py` | Re-implement (TileToken, _process pattern) |
 | `src/kernbench/components/builtin/pe_math.py` | Re-implement (TileToken, _process pattern) |
 | `src/kernbench/components/builtin/pe_dma.py` | Re-implement (TileToken, _process pattern) |
 | `src/kernbench/components/builtin/pe_fetch_store.py` | New |
 | `src/kernbench/components/builtin/pe_tcm.py` | Re-implement (TcmRequest service) |
 | `src/kernbench/components/builtin/types.py` | New: TilePlan, Stage, StageType, PipelineContext, TileToken |
 | `src/kernbench/components/builtin/tiling.py` | Ported from pe_accel: plan generation logic |
 Backup:
 | Path | Note |
 |------|------|
 | `src/kernbench/components/builtin_legacy/` | Full backup of existing builtin (preserved without modification) |
 | `src/kernbench/components/custom/pe_accel/` | Backup of existing pe_accel (preserved without modification) |
@@ -0,0 +1,528 @@
 # ADR-0021: PE 파이프라인 리팩토링 — 컴포넌트 분리 + Scheduler 기반 라우팅
 ## Status
 Proposed
 ## Context
 ### 현재 구조의 문제
 pe_accel (SchedulerV2Component)은 5개 하드웨어 블록(DmaIn, DmaWb, Gemm, Math, Tcm)을
 **단일 컴포넌트 내부**에 숨기고 있다.
 ```
 SchedulerV2Component (단일 topology 노드)
 ├── DmaInBlock     ← 내부 SimPy Store로 직접 연결
 ├── DmaWbBlock     ← topology에 안 보임
 ├── GemmBlock      ← 교체 불가
 ├── MathBlock      ← 교체 불가
 └── TcmBlock       ← 교체 불가
 ```
 문제점:
 - 블록이 다음 블록을 `desc.next_block`으로 직접 참조 — 하드코딩된 라우팅
 - 개별 블록 교체 불가 (ADR-0015 컴포넌트 교체 원칙 위배)
 - topology에서 PE 내부 구조가 보이지 않음
 - GemmBlock과 MathBlock이 TCM load/store 로직을 각각 중복 구현
 ### 실제 하드웨어 구조
 ```
 HBM ←(DMA)→ TCM ←(Fetch/Store Unit)→ Register File ←→ GEMM/MATH Engine
 ```
 - DMA: HBM ↔ TCM 전송 (fabric 경유, 수십~수백 ns)
 - Fetch/Store Unit: TCM ↔ Register File 전송 (BW 기반, 수 ns)
 - GEMM/MATH Engine: Register File 간 연산 (cycle-accurate)
 - 완료 신호: PE 내부 1-cycle wire signal (done pin assert)
 ---
 ## Decision
 ### D1. 각 블록을 독립 컴포넌트로 분리
 pe_accel의 내부 블록을 **독립 PeEngineBase 컴포넌트**로 분리한다.
 기존 5개 + Fetch/Store Unit 1개 = 6개 컴포넌트.
 | 컴포넌트 | 역할 | HW 대응 |
 |----------|------|---------|
 | PE_SCHEDULER | plan 생성, tile 상태 관리, stage 라우팅 | Scheduler/Sequencer |
 | PE_DMA | HBM ↔ TCM (fabric 경유) | DMA Engine |
 | PE_FETCH_STORE | TCM ↔ Register File | Load/Store Unit |
 | PE_GEMM | MAC compute (register only) | MAC Array |
 | PE_MATH | element-wise/reduction (register only) | SIMD/Vector Unit |
 | PE_TCM | BW-serialized scratchpad | SRAM Bank |
 각 컴포넌트는 topology 노드로 존재하며, port/wire로 연결된다.
 `impl`을 교체하면 개별 블록의 타이밍 모델을 변경할 수 있다.
 ### D2. Token Self-Routing — Scheduler는 dispatch + completion만
 **컴포넌트가 매 stage마다 scheduler를 경유하지 않는다.**
 Token이 plan을 가지고 있어 컴포넌트가 직접 다음 stage로 체이닝한다.
 ```
 Scheduler → DMA → Fetch → GEMM → Math → Store → DMA_WB → (done) → Scheduler
              ↑ 체이닝: scheduler 안 거침                    completion만
 ```
 이는 실제 HW에서 각 블록의 done signal이 다음 블록에 직접 wire로 연결되어
 있는 구조와 일치한다. Scheduler는 **초기 dispatch + completion aggregation만** 담당.
 #### Stage 정의
 ```python
 class StageType(Enum):
    DMA_READ = 0
    FETCH = 1
    GEMM = 2
    MATH = 3
    STORE = 4
    DMA_WRITE = 5
 ```
 #### Plan 구조
 Scheduler가 CompositeCmd를 받으면 **tile 단위 실행 plan**을 생성한다.
 Plan은 각 tile의 **stage sequence**를 정의한다:
 ```python
@dataclass
 class Stage:
    stage_type: StageType
    component: str       # topology 노드 ID (e.g. "sip0.cube0.pe0.pe_dma")
    params: dict         # stage별 파라미터 (dynamic)
@dataclass(frozen=True)
 class TilePlan:
    tile_id: int
    stages: tuple[Stage, ...]  # 순서대로 실행할 stage 목록 (immutable)
 ```
 Plan에 따라 stage sequence가 달라진다:
 ```python
 # 일반 GEMM: HBM → TCM → Register → Compute → Register → TCM → HBM
 stages = (DMA_READ, FETCH, GEMM, STORE, DMA_WRITE)
 # TCM 데이터로 바로 GEMM (DMA read 생략):
 stages = (FETCH, GEMM, STORE, DMA_WRITE)
 # MATH element-wise:
 stages = (DMA_READ, FETCH, MATH, STORE, DMA_WRITE)
 # GEMM + accumulation (중간 K-tile, writeback 생략):
 stages = (DMA_READ, FETCH, GEMM, STORE)  # store to TCM only
 ```
 **컴포넌트는 다음 컴포넌트를 하드코딩하지 않는다.**
 Token의 plan에서 다음 stage를 읽고, out_port로 직접 전달한다.
 네트워크 패킷이 라우팅 헤더를 가지고 있는 것과 같은 패턴이다.
 #### Pipeline Context
 ```python
@dataclass
 class PipelineContext:
    id: str
    total_tiles: int
    completed_tiles: int = 0
    done_event: simpy.Event = None  # 모든 tile 완료 시 succeed
    def complete_tile(self) -> None:
        self.completed_tiles += 1
        if self.completed_tiles == self.total_tiles:
            self.done_event.succeed()
 ```
 **Completion은 exactly-once contract**: 각 tile의 마지막 stage는 정확히 한 번만
 `complete_tile()`을 호출해야 한다. 중복 호출은 버그이며, `done_event`는
 단 한 번만 succeed되어야 한다 (SimPy Event 제약).
 #### Scheduler 역할 (축소됨)
 Scheduler는 CompositeCmd를 받으면 plan과 PipelineContext를 생성한 뒤,
 이를 scheduler 내부의 `_pending_feeds` FIFO에 enqueue하고 즉시 리턴한다.
 실제 tile 투입은 **단일 feeder process** (`_feed_loop`)가 담당한다.
 이 feeder는 `_pending_feeds`를 FIFO 순서로 소비하며,
 **composite command 간 tile feed interleaving은 허용하지 않는다.**
 즉, 한 command의 모든 tile이 첫 stage queue에 투입된 후에만
 다음 command의 feed가 시작된다.
 Scheduler당 `_feed_loop`는 **정확히 하나만** 존재하며,
 composite command의 tile feed는 이 단일 process를 통해서만 수행된다.
 Command issue order는 **PE_SCHEDULER가 PeInternalTxn을 수신한 순서**를 의미한다.
 이 구조는 command issue order를 유지하면서도, 첫 stage queue full 시
 feeder process만 block되고 scheduler worker의 inbox 처리 자체는 멈추지 않도록 한다.
 ```python
 class PeSchedulerV2(PeEngineBase):
    _pipelines: dict[str, PipelineContext]
    _pending_feeds: simpy.Store   # FIFO of (plan, ctx)
    def start(self, env):
        super().start(env)
        self._pending_feeds = simpy.Store(env)
        env.process(self._feed_loop(env))
    def _dispatch_composite(self, env, pe_txn, cmd):
        plan = generate_plan(cmd)
        ctx = PipelineContext(
            id=next_id(),
            total_tiles=len(plan.tiles),
            done_event=pe_txn.done,
        )
        self._pipelines[ctx.id] = ctx
        # feeder queue에 등록만 하고 즉시 리턴
        yield self._pending_feeds.put((plan, ctx))
    def _feed_loop(self, env):
        """단일 feeder process: composite command를 FIFO 순서로 feed.
        Composite command 간 tile feed interleaving은 허용하지 않는다.
        한 command의 모든 tile이 첫 stage queue에 투입된 후에만
        다음 command의 feed가 시작된다.
        첫 stage queue full 시 이 feeder만 block되며,
        scheduler worker의 inbox 처리는 멈추지 않는다.
        """
        while True:
            plan, ctx = yield self._pending_feeds.get()
            for tile in plan.tiles:
                token = TileToken(
                    tile_id=tile.tile_id,
                    pipeline_ctx=ctx,
                    plan=tile,
                    stage_idx=0,
                    params=tile.stages[0].params,
                )
                yield self.out_ports[tile.stages[0].component].put(token)
                # queue capacity = HW queue depth → full이면 feeder만 block
 ```
 본 ADR에서 scheduler는 여러 composite command를 수용할 수 있으나,
 tile submission order는 command 단위 FIFO를 따른다.
 Command 내부에서는 tile-level pipeline overlap을 허용하지만,
 command 간 tile feed interleaving은 허용하지 않는다.
 ### D3. 데이터 전달 vs 완료 신호 — HW 모델링 기준
 | 통신 유형 | 방식 | HW 대응 |
 |----------|------|---------|
 | tile token (작업 지시) | message via out_port | command queue에 enqueue |
 | stage 완료 → 다음 stage | 컴포넌트가 직접 out_port.put | done-triggered local enqueue |
 | pipeline 완료 → scheduler | PipelineContext.complete_tile() | completion interrupt |
 **Tile token**: out_port.put() 사용. SimPy Store capacity = HW queue depth.
 **Intra-PE chaining latency**: 본 ADR 범위에서는 intra-PE stage trigger에
 explicit latency model을 두지 않는다. 컴포넌트 간 체이닝은 PE 내부 wire에 해당하며,
 scheduler 왕복이 없으므로 artificial hop cost가 발생하지 않는다.
 **Pipeline 완료**: 마지막 stage의 컴포넌트가 `pipeline_ctx.complete_tile()` 호출.
 모든 tile 완료 시 PipelineContext가 done_event.succeed().
 ### D4. 비동기 파이프라인 — 자연스러운 overlap
 Scheduler는 CompositeCmd를 **비동기로** 처리한다.
 다만 tile feed는 command마다 독립 process를 만들지 않고,
 scheduler 내부의 **단일 feeder process**가 FIFO 순서로 수행한다.
 따라서 scheduler는 다음 command를 계속 받을 수 있지만,
 첫-stage tile 투입 순서는 command 단위로 보장된다.
 **SimPy Store capacity = HW queue depth**이므로:
 - queue가 차면 put()이 자연스럽게 block (backpressure)
 - DMA가 tile 0을 처리하는 동안 GEMM은 이미 완료된 tile의 fetch를 시작
 - 두 번째 CompositeCmd가 들어오면 DMA queue에 바로 이어서 투입
 ```
 First-stage feed order (feeder → DMA queue):
  [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN] | [cmd2:t0][cmd2:t1]...
                                            ↑ cmd1 feed 완료 후 cmd2 시작
 Runtime pipeline (downstream overlap):
  PE_DMA:    [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN][cmd2:t0][cmd2:t1]...
  PE_FETCH:          [cmd1:t0][cmd1:t1]...
  PE_GEMM:                   [cmd1:t0][cmd1:t1]...
                              ↑ 같은 cmd 내부에서 pipeline overlap
 ```
 이때 overlap은 서로 다른 command의 tile feed interleaving에서 오는 것이 아니라,
 먼저 투입된 command의 tile들이 downstream stage로 진행되는 동안 feeder가
 다음 tile들을 계속 투입하면서 자연스럽게 발생한다.
 예를 들어 cmd1의 모든 tile이 첫 stage queue에 투입되기 전에는
 cmd2의 tile feed는 시작되지 않는다. 그러나 cmd1.tile0이 이미 GEMM으로
 진행한 상태에서 cmd1.tile1, cmd1.tile2가 DMA/FETCH에 남아 있을 수 있으므로,
 **같은 command 내부에서는 pipeline overlap이 자연스럽게 발생**한다.
 #### 컴포넌트 체이닝 패턴
 모든 컴포넌트가 동일한 패턴을 따른다:
 ```python
 def _pipeline_worker(self, env):
    while True:
        token = yield self._inbox.get()
        # 자기 stage 처리
        yield from self._process(env, token)
        # 다음 stage로 체이닝 (plan에서 읽음)
        next_idx = token.stage_idx + 1
        if next_idx < len(token.plan.stages):
            next_stage = token.plan.stages[next_idx]
            token.stage_idx = next_idx
            token.params = next_stage.params
            yield self.out_ports[next_stage.component].put(token)
        else:
            # 마지막 stage — pipeline completion
            token.pipeline_ctx.complete_tile()
 ```
 ### D5. PE_FETCH_STORE — TCM ↔ Register File 전담
 기존에 GemmBlock과 MathBlock이 각각 TCM read/write를 구현했으나,
 이를 **PE_FETCH_STORE 컴포넌트**로 분리한다.
 ```python
 # PE_FETCH_STORE._process()
 def _process(self, env, token):
    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
    yield tcm_done
    # 체이닝은 base class가 처리 (D4 패턴)
 ```
 장점:
 - GEMM/MATH는 **순수 compute만** — TCM 접근 로직 없음
 - fetch/store BW 경합이 자연스럽게 모델링됨 (PE_TCM의 resource로 serialization)
 - prefetch 전략 등 fetch unit 단독 교체로 실험 가능
 ### D6. 각 Compute 컴포넌트의 단순화
 GEMM/MATH는 register 데이터가 이미 준비된 상태에서 compute만 수행.
 **체이닝은 공통 패턴(D4)을 따르므로, _process()만 구현하면 된다:**
 ```python
 # PE_GEMM._process()
 def _process(self, env, token):
    yield env.timeout(self._mac_latency(token.params))
 # PE_MATH._process()
 def _process(self, env, token):
    yield env.timeout(self._simd_latency(token.params))
 # PE_FETCH_STORE._process()
 def _process(self, env, token):
    yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
    yield tcm_done
 # PE_DMA._process()
 def _process(self, env, token):
    yield from self._do_fabric_dma(token.params)
 ```
 타이밍 모델만 교체하면 cycle-accurate든 analytical든 자유롭게 변경 가능.
 체이닝 로직은 base class에 있으므로 각 컴포넌트는 순수 stage 로직만 구현.
 ### D7. Topology 변경
 PE template에 PE_FETCH_STORE 추가:
 ```yaml
 pe_template:
  components:
    pe_cpu:         { kind: pe_cpu,         impl: pe_cpu_v1, ... }
    pe_scheduler:   { kind: pe_scheduler,   impl: pe_scheduler_v2, ... }
    pe_dma:         { kind: pe_dma,         impl: pe_dma_v1, ... }
    pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, ... }
    pe_gemm:        { kind: pe_gemm,        impl: pe_gemm_v1, ... }
    pe_math:        { kind: pe_math,        impl: pe_math_v1, ... }
    pe_mmu:         { kind: pe_mmu,         impl: pe_mmu_v1, ... }
    pe_tcm:         { kind: pe_tcm,         impl: pe_tcm_v1, ... }
  links:
    # 기존 links...
    fetch_store_to_tcm_bw_gbs: 512.0
    fetch_store_to_tcm_mm: 0.0
 ```
 PE 내부 edge 연결:
 ```
 PE_SCHEDULER → PE_DMA (초기 dispatch)
 PE_SCHEDULER → PE_FETCH_STORE (초기 dispatch)
 PE_SCHEDULER → PE_GEMM (초기 dispatch)
 PE_SCHEDULER → PE_MATH (초기 dispatch)
 PE_DMA → PE_FETCH_STORE (체이닝)
 PE_FETCH_STORE → PE_GEMM (체이닝)
 PE_FETCH_STORE → PE_MATH (체이닝)
 PE_GEMM → PE_FETCH_STORE (store 체이닝)
 PE_MATH → PE_FETCH_STORE (store 체이닝)
 PE_FETCH_STORE → PE_DMA (writeback 체이닝)
 PE_FETCH_STORE → PE_TCM (BW 요청)
 ```
 Topology edge는 **control/dispatch visibility + runtime chaining** 양쪽을 포함한다.
 Scheduler → 하위 컴포넌트 edge는 초기 dispatch 경로이며,
 컴포넌트 간 edge는 token self-routing에 의한 runtime chaining 경로이다.
 ### D8. 기존 코드 마이그레이션 — builtin 통합
 기존 builtin v1 컴포넌트와 pe_accel을 **새 builtin으로 교체**한다.
 #### 마이그레이션 전략
 1. 기존 `components/builtin/` → `components/builtin_legacy/`로 백업 (수정 없이 보관)
 2. 기존 `components/custom/pe_accel/` → 동일하게 백업
 3. 새 `components/builtin/`에 ADR-0021 아키텍처로 재구현
 4. topology.yaml은 **하나만 유지** (pe_fetch_store 포함)
 5. components.yaml은 새 builtin을 가리킴
 ```yaml
 # components.yaml — 새 builtin
 pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
 pe_gemm_v1:      kernbench.components.builtin.pe_gemm:PeGemmComponent
 pe_math_v1:      kernbench.components.builtin.pe_math:PeMathComponent
 pe_dma_v1:       kernbench.components.builtin.pe_dma:PeDmaComponent
 pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
 pe_tcm_v1:       kernbench.components.builtin.pe_tcm:PeTcmComponent
 ```
 impl 이름(pe_gemm_v1 등)은 유지하되, **구현이 ADR-0021 아키텍처로 교체**된다.
 기존 벤치마크와 테스트의 topology.yaml 참조는 변경 없이 동작한다.
 #### 레이턴시 모델 계승
 새 builtin 컴포넌트의 레이턴시 모델링(MAC cycle 계산, SIMD latency,
 TCM BW serialization, DMA fabric latency 등)은 **pe_accel 현재 버전의 구현을 바탕으로** 한다.
 tiling.py의 tile schedule 생성 로직도 그대로 가져온다.
 아키텍처(컴포넌트 분리, self-routing)만 변경하고, 타이밍 정확도는 유지한다.
 #### 테스트 전략 및 계획
 **1. 기존 테스트 통과** (regression):
 마이그레이션 완료 후 기존 테스트(366개)가 전부 통과해야 한다.
 **2. 레이턴시 regression**:
 pe_accel과 동일한 입력에 대해 새 builtin이 동일 레이턴시를 산출하는지 검증.
 **3. Phase 1 → Phase 2 end-to-end**:
 SimPy 시뮬레이션(Phase 1)에서 op_log 생성 → DataExecutor(Phase 2)로
 실제 numpy 연산 → 결과 정합성 검증까지 통합 테스트.
 - GEMM: tl.composite(gemm) → op_log → Phase 2 matmul → allclose 검증
 - MATH: tl.exp / tl.add 등 → op_log → Phase 2 numpy op → allclose 검증
 - 체이닝: GEMM 출력 → MATH 입력 → 최종 결과 end-to-end 검증
 **4. TileToken self-routing**:
 - tile이 plan의 stage sequence를 따라 체이닝되는지 검증
 - 마지막 stage에서 PipelineContext.complete_tile() exactly-once 검증
 - queue backpressure: DMA queue capacity 초과 시 feeder만 block 검증
 **5. 비동기 pipeline overlap**:
 - 동일 command 내 tile 간 stage overlap 발생 검증 (tile0 GEMM 중 tile1 DMA)
 - 다중 command: cmd1 feed 완료 후 cmd2 feed 시작 (FIFO 순서) 검증
 ### D9. TileToken 메시지 정의
 컴포넌트 간 tile 작업 전달에 사용하는 메시지.
 Token이 plan과 stage index를 가지고 있어 self-routing이 가능하다.
 ```python
@dataclass
 class TileToken:
    tile_id: int
    pipeline_ctx: PipelineContext    # completion 추적
    plan: TilePlan                   # 이 tile의 전체 stage sequence (immutable)
    stage_idx: int                   # 현재 stage index in plan.stages
    params: dict                     # current stage 파라미터 캐시 (canonical: plan.stages[stage_idx].params)
    data_op: bool = True             # op_log 기록 대상 (ADR-0020)
 ```
 TileToken은 한 시점에 **하나의 컴포넌트에 의해서만 소유**되며,
 동시에 여러 컴포넌트에 의해 참조되지 않는다 (single-owner).
 Token lifecycle:
 1. Scheduler가 stage_idx=0으로 생성, 첫 stage 컴포넌트에 put
 2. 컴포넌트가 _process() 실행 후 stage_idx 증가, 다음 컴포넌트에 put
 3. 마지막 stage 컴포넌트가 pipeline_ctx.complete_tile() 호출
 4. 모든 tile 완료 시 PipelineContext가 done_event.succeed()
 기존 PeInternalTxn과의 관계:
 - PeInternalTxn: PE_CPU → PE_SCHEDULER 간 command 전달 (기존 유지)
 - TileToken: PE_SCHEDULER → 하위 컴포넌트 간 tile 단위 작업 전달 (신규, self-routing)
 ---
 ## Non-goals
 - **PE_CPU 변경**: PE_CPU → PE_SCHEDULER 인터페이스는 변경하지 않음
  (PeInternalTxn 기반, ADR-0014 유지)
 - **다중 pipeline 간 자원 경합 모델**: 현재 범위에서는 단일 pipeline의
  정확한 모델링에 집중. 다중 pipeline 간 TCM bank conflict 등은 future work.
 - **builtin_legacy 유지보수**: 백업 목적이며, 버그 수정이나 기능 추가 대상이 아님.
 ## Open Questions
 - **Register File 용량 모델**: fetch unit이 register에 로드할 때 용량 제한을
  모델링할지. 용량은 바이트 단위(register_file_bytes)로 표현하며,
  동시에 보유 가능한 tile 수는 tile 크기에 따라 결정된다.
  용량 초과 시 fetch가 stall되어 자연스러운 backpressure가 발생한다.
 - **Prefetch 전략**: 본 ADR에서는 composite command 간 tile feed interleaving을
  허용하지 않는다. 따라서 overlap은 command 간 선행 투입이 아니라,
  같은 command 내부 tile들의 pipeline progression에서 자연스럽게 발생한다.
  추가적인 prefetch가 필요하면 command 간 투입이 아니라, 같은 command 내부에서의
  tile ordering 또는 fetch/store unit policy 차원에서 검토한다.
 - **PE_DMA coalescing**: tile 단위 DMA는 fragmentation 발생 가능.
  DMA 내부에서 merge/coalesce하되 scheduler는 관여하지 않는 방향.
 - **동기 실행 모드**: 본 ADR에서는 비동기 pipeline을 기본/유일 execution model로
  채택한다. 디버그 또는 validation 목적의 sync mode가 필요하면 future ADR에서 검토.
 - **다중 pipeline 간 TCM bank conflict**: 현재 단일 pipeline 기준.
  다중 pipeline이 동시에 TCM에 접근할 때의 bank conflict 모델은 future work.
 ---
 ## Consequences
 ### 긍정적
 - 각 블록이 독립 컴포넌트 — 개별 교체 가능 (ADR-0015 준수)
 - topology에서 PE 내부 구조 가시화
 - 컴포넌트가 다음 컴포넌트를 모름 — plan 기반 라우팅으로 유연성 확보
 - DMA와 compute의 자연스러운 파이프라인 overlap (SimPy Store backpressure)
 - HW 모델링 정확도 향상 (done signal = Event, data transfer = message)
 - fetch/store 분리로 TCM BW 경합 정확히 모델링
 ### 부정적
 - PE 내부 컴포넌트 수 증가 (5 → 6) — topology 노드/edge 증가
 - 컴포넌트 분리로 인해 intra-PE token forwarding이 이전 대비 더 명시적으로 드러남
 - 기존 builtin/pe_accel과의 breaking change — 마이그레이션 필요
 ---
 ## 영향받는 파일
 | 파일 | 변경 |
 |------|------|
 | `topology.yaml` | pe_fetch_store 컴포넌트 추가, 체이닝 edge 추가 |
 | `components.yaml` | 새 builtin 컴포넌트 등록 |
 | `src/kernbench/topology/builder.py` | PE 내부 edge에 fetch_store + 체이닝 edge 추가 |
 | `src/kernbench/common/pe_commands.py` | TileToken 정의 추가 |
 | `src/kernbench/components/builtin/pe_scheduler.py` | 재구현 (feeder + plan 기반 dispatch) |
 | `src/kernbench/components/builtin/pe_gemm.py` | 재구현 (TileToken, _process 패턴) |
 | `src/kernbench/components/builtin/pe_math.py` | 재구현 (TileToken, _process 패턴) |
 | `src/kernbench/components/builtin/pe_dma.py` | 재구현 (TileToken, _process 패턴) |
 | `src/kernbench/components/builtin/pe_fetch_store.py` | 신규 |
 | `src/kernbench/components/builtin/pe_tcm.py` | 재구현 (TcmRequest 서비스) |
 | `src/kernbench/components/builtin/types.py` | 신규: TilePlan, Stage, StageType, PipelineContext, TileToken |
 | `src/kernbench/components/builtin/tiling.py` | pe_accel에서 이식: plan 생성 로직 |
 백업:
 | 파일 | 변경 |
 |------|------|
 | `src/kernbench/components/builtin_legacy/` | 기존 builtin 전체 백업 (수정 없이 보관) |
 | `src/kernbench/components/custom/pe_accel/` | 기존 pe_accel 백업 (수정 없이 보관) |
@@ -0,0 +1,90 @@
 # ADR-0022: 2D Grid program_id Semantics
 - **Status**: Accepted
 - **Date**: 2026-04-09
 - **Context**: Triton-style kernel addressing for multi-cube PE topology
 ## Problem
 Triton kernels use `tl.program_id(axis)` to identify their position in a launch grid.
 Our hardware has a 2-level hierarchy: **cubes** contain **PEs**.
 The previous implementation ignored the `axis` parameter and always returned a flat PE index,
 making it impossible for kernels to distinguish their cube-local position from their cube identity.
 ## Decision
 Map `tl.program_id` and `tl.num_programs` to the 2D hardware grid:
 | Call | Returns | Description |
 |------|---------|-------------|
 | `tl.program_id(axis=0)` | `local_pe_id` | PE index within cube |
 | `tl.program_id(axis=1)` | `cube_id` | Cube index |
 | `tl.num_programs(axis=0)` | `num_pes_per_cube` | PEs per cube |
 | `tl.num_programs(axis=1)` | `num_cubes` | Total cubes |
 Global PID is derived as:
 ```python
 global_pid = tl.program_id(axis=1) * tl.num_programs(axis=0) + tl.program_id(axis=0)
 ```
 ### Axis mapping rationale
 - **axis=0 = PE (innermost)**: PEs within a cube share HBM and communicate via local NOC mesh. This is the fast, tightly-coupled dimension — analogous to threads within a block.
 - **axis=1 = Cube (outer)**: Cross-cube communication goes through UCIe with higher latency. This is the coarser scheduling dimension — analogous to blocks in a grid.
 ## Implementation
 ### TLContext (`triton_emu/tl_context.py`)
 Added `cube_id` and `num_cubes` constructor parameters. `program_id()` and `num_programs()` dispatch on `axis`:
 ```python
 def program_id(self, axis: int = 0) -> int:
    if axis == 1:
        return self._cube_id
    return self._pe_id
 def num_programs(self, axis: int = 0) -> int:
    if axis == 1:
        return self._num_cubes
    return self._num_programs
 ```
 ### PE_CPU (`components/builtin/pe_cpu.py`)
 - Extracts `num_cubes` from `ctx.spec["system"]["sips"]["cubes_per_sip"]`
 - Passes `cube_id` (already available as `self._cube_idx`) and `num_cubes` to TLContext
 ### KernelRunner (`triton_emu/kernel_runner.py`)
 - Receives `num_cubes` from PE_CPU
 - Passes `cube_id` and `num_cubes` to TLContext in greenlet mode
 ## Backward Compatibility
 - Existing code using `tl.program_id(0)` or `tl.program_id()` is unchanged — returns the same PE index as before.
 - `cube_id` and `num_cubes` default to `0` and `1`, so callers that don't provide them (e.g. unit tests) continue to work.
 ## Usage Example
 ```python
 def sharded_gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl):
    local_pid = tl.program_id(axis=0)      # PE within cube
    cube_id   = tl.program_id(axis=1)      # which cube
    global_pid = cube_id * tl.num_programs(axis=0) + local_pid
    # Column-wise sharding across global PID
    n_per_pid = N // (tl.num_programs(axis=1) * tl.num_programs(axis=0))
    col_start = global_pid * n_per_pid
    a = tl.load(a_ptr, shape=(M, K), dtype="f16")
    b = tl.ref(b_ptr + col_start * K * 2, shape=(K, n_per_pid), dtype="f16")
    h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr + col_start * M * 2)
    tl.wait(h)
 ```
 ## Consequences
 - Benchmarks can now express cube-aware sharding and addressing without hardcoding topology dimensions.
 - Future axis=2 (SIP-level) can be added following the same pattern if needed.
@@ -0,0 +1,866 @@
 # ADR-0023: PE-level IPCQ — Inter-PE Collective Communication
 ## Status
 Proposed
 ## Context
 ### Goal
 Add the infrastructure that lets CCL (Collective Communication Library)
 kernels run **inside** a PE. The host just launches a kernel on each
 SIP; the actual synchronization and data movement happen **inside the
 PE kernel via an IPCQ (Inter-Process Communication Queue)**.
 This mirrors how NCCL performs NVLink communication inside a GPU
 kernel, or how Cerebras / Tenstorrent expose core-local communication
 queues. Host-level collectives (`dist.all_reduce`) are deferred to
 **future work**; this ADR focuses solely on the kernel-side collective
 infrastructure.
 ### Current state
 - ADR-0021 PE pipeline refactor: each PE is decomposed into components
  (PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH,
  PE_TCM, PE_MMU).
 - No direct PE-to-PE channel exists today. All data movement goes
  through PE_DMA → cube_noc / UCIe / PCIE → HBM.
 - A pre-ADR host CCL skeleton exists (`dist.init_process_group(backend="ahbm")`,
  `_run_ccl_bench` running per-rank greenlets concurrently). The
  collective itself is a stub.
 ### Problems to solve
 1. PE-to-PE direct data movement (writing into a peer's memory).
 2. Synchronization — the sender must check that the receiver has space
   in its buffer (backpressure).
 3. Resource contention between compute traffic and communication
   traffic (Head-of-Line blocking).
 4. The host must be able to construct logical neighbor topologies
   (ring / mesh / tree) per algorithm.
 ---
 ## Decision
 ### D1. Add a new `PE_IPCQ` component
 A new component `PE_IPCQ` is added inside each PE. It follows the same
 pattern as PE_GEMM / PE_MATH — modeling a sub-block of the PE as a
 distinct component.
 ```
 PE
 ├── PE_CPU
 ├── PE_SCHEDULER
 ├── PE_DMA
 ├── PE_IPCQ          ← new
 ├── PE_FETCH_STORE
 ├── PE_GEMM
 ├── PE_MATH
 ├── PE_TCM
 ├── PE_MMU
 ```
 **Role separation** (control plane vs. data plane):
 - **PE_IPCQ (control plane)**: ring-buffer address arithmetic, head /
  tail pointer management, peer pointer caches, backpressure, 4-direction
  neighbor mapping.
 - **PE_DMA (data plane)**: actually moves data through cube_noc / UCIe
  / PCIE into the peer's memory.
 PE_IPCQ does **not** move data itself — it delegates to PE_DMA.
 ### D2. Ring buffer model
 Each PE owns 4 directions (N/S/E/W) × {tx, rx} = 8 ring buffers.
 ```python
@dataclass
 class IpcqQueuePair:
    direction: Direction          # N/S/E/W
    peer: IpcqEndpoint            # set by host at init time (D2.5)
    tx_buffer_base: int           # outgoing data base addr (in our memory)
    rx_buffer_base: int           # incoming data base addr (in our memory)
    slot_size: int                # 1 tile per slot
    n_slots: int                  # ring depth
    my_head: int                  # next slot we will write/send into
    my_tail: int                  # next slot we will read/recv from
    peer_head_cache: int          # peer's last-seen head (updated via D9 piggyback)
    peer_tail_cache: int          # peer's last-seen tail (updated via D9 fast-path credit)
 ```
 **Canonical field names**: throughout this ADR the four names above
 (`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`) are used
 consistently. Synonyms (`peer_head_local`, `peer_head`, `peer_tail`,
 etc.) are not used.
 | Field | Owner | Updated when |
 |-------|-------|--------------|
 | `my_head` | local PE_IPCQ | immediately after `tl.send` (send tracking) |
 | `my_tail` | local PE_IPCQ | immediately after `tl.recv` (recv tracking) |
 | `peer_head_cache` | local PE_IPCQ | on `IpcqMetaArrival` (D9 piggyback) |
 | `peer_tail_cache` | local PE_IPCQ | on `IpcqCreditMetadata` (D9 fast path) |
 **Slot unit**: fixed-size, one slot holds one full tile (no descriptor
 indirection). Full data embedded in the slot. See D5.
 ### D2.5. `IpcqEndpoint` schema
 `IpcqQueuePair.peer` carries everything the sender needs to compute the
 peer's rx slot address:
 ```python
@dataclass(frozen=True)
 class IpcqEndpoint:
    sip: int
    cube: int
    pe: int
    buffer_kind: str             # "tcm" | "hbm" | "sram"
    rx_base_pa: int              # peer rx_buffer base PA (PhysAddr.encode())
    rx_base_va: int              # peer rx_buffer base VA (optional, MMU mode)
    n_slots: int                 # peer ring depth (for wrap-around)
    slot_size: int               # peer slot size (for offset)
 ```
 Address computation:
 ```python
 slot_idx = self.my_head % peer.n_slots
 dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
 ```
 PE_IPCQ passes `dst_pa` to PE_DMA inside an `IpcqDmaToken`. PE_DMA
 (vc_comm) routes the data to `dst_pa` through the fabric.
 **Endpoint construction order**: at backend init (D10), the IPCQ
 buffers for **every PE** are allocated first (so each rank knows the
 others' PA), then the per-rank neighbor tables are built and pushed to
 PE_IPCQ via `IpcqInitMsg`.
 ### D3. Four-direction mapping ≡ logical ProcessGroup
 The PE views four directions (N/S/E/W) as logical ports. Real peer
 addresses are configured by the host CCL init, per the chosen
 algorithm. The PE kernel never knows the topology, only directions.
 ```python
 # 1D ring
 for rank in range(world_size):
    ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size])
    ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size])
 # 2D mesh (indices wrap around on both axes, i.e. a torus-style mesh)
 for r in range(R):
    for c in range(C):
        ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c))
        ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c))
        ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C))
        ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C))
 ```
 The PE code does not need to know where `tl.send(dir="E", ...)` actually
 ends up.
 ### D4. PE kernel API
 ```python
 # Send (blocking; may stall on backpressure)
 tl.send(dir: str, src=TensorHandle)
 tl.send(dir: str, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)
 # Recv (blocking)
 recv = tl.recv(dir: str, shape=..., dtype=...)
 recv = tl.recv(shape=..., dtype=...)        # round-robin across 4 directions
 # Recv (non-blocking)
 fut  = tl.recv_async(dir: str, shape=..., dtype=...)
 recv = tl.wait(fut)
 ```
 `tl.recv()` (no direction) keeps a `last_polled_dir` cursor and on each
 call rotates through directions, returning the first available slot.
 Empty in all 4 directions → wait.
 **Fairness is weak**: the rotating start mitigates simple bias, but if
 one direction always wins the race the others can starve. Algorithms
 that need strict fairness must call `tl.recv(dir=...)` explicitly.
 ### D5. Single-hop DMA write + full-data slot model
 Data moves from sender memory into the receiver's ring slot in **one
 DMA transfer**. Key properties:
 - **Single-hop**: the sender already knows the peer rx slot address and
  fires one fabric DMA into it.
 - **No CPU memcpy**: the CPU never copies data.
 - **No intermediate staging**: neither side keeps a separate staging
  buffer (sender uses the source addr directly; receiver gets the data
  in its ring slot directly).
 (Strictly speaking the fabric DMA write does happen, so this is not
 literally "no data movement" — it's the same property NCCL labels
 "zero-copy", meaning no CPU memcpy and no staging copy.)
 ```
 PE A: tl.send(E, src_addr, nbytes)
  1. IPCQ computes the peer rx slot address:
       dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size
  2. Backpressure: proceed only if my_head - peer_tail_cache < peer.n_slots;
                   otherwise the peer ring is full → sleep / poll (D7)
  3. Submit DMA on PE_DMA(vc_comm): src_addr → peer dst_addr, nbytes
  4. my_head += 1
 PE B: data = tl.recv(W)
  1. Look at rx_buffer[my_tail % n_slots]
  2. Wait for the data to arrive (D7 backpressure mode)
  3. Return the slot address to the kernel (or fetch into register file)
  4. my_tail += 1
  5. Issue a credit return over the fast path (D9): after the
     bottleneck-BW latency, peer A's peer_tail_cache is updated.
 ```
 The slot holds the full tile. The receiver only reads its own
 rx_buffer; it never reads back into A's memory. The sender knows the
 peer rx slot address and DMAs directly into it (single-hop).
 The PE's own PE_TCM read/write does not go through DMA (PE_TCM is local
 to the PE).
 ### D6. Buffer placement — three-way benchmark
 The host CCL init picks the IPCQ ring-buffer location:
 ```python
 ipcq_init(
    backend="ahbm",
    buffer_kind="tcm" | "hbm" | "sram",
    n_slots=8,
    slot_size=4096,
 )
 ```
 | Location | Trait | Trade-off |
 |----------|-------|-----------|
 | **PE_TCM** | Attached to the PE; fast | Small; competes with PE-internal resources |
 | **PE-local HBM** | Large; via DMA | Higher latency |
 | **Cube SRAM** | Mid-size; cube-shared | Cube-internal contention |
 All three locations run the same kernel code; only the init differs.
 ### D7. Backpressure — two-mode benchmark
 How the sender or receiver waits when peer slots are full / data not
 yet arrived:
 | Mode | Behavior | Model |
 |------|----------|-------|
 | **poll** | Periodically re-check the cached peer pointer | Spin loop |
 | **sleep** | Yield a SimPy event; wake on a peer-trigger | Interrupt-like |
 ```python
 ipcq_init(backpressure="poll" | "sleep", ...)
 ```
 Both modes are implemented so latency / throughput trade-offs can be
 benchmarked.
 ### D8. PE_DMA virtual channels
 Extend PE_DMA from a single queue into a **two-channel virtual-channel**
 model.
 ```
 PE_DMA
 ├── vc_compute: tile load / store / writeback for GEMM and Math
 └── vc_comm:    IPCQ send data
 ```
 Each VC has an independent state machine:
 - One channel stalling does not block the other.
 - The same physical link (cube_noc, UCIe, …) is shared, but link BW is
  split between channels.
 **Chunk-level interleave**:
 - Large GEMM tile DMAs do not lock the link end-to-end.
 - Progress happens in chunks (e.g. 256 B); each chunk shares link BW
  with the other VC's pending chunks.
 - Chunk size is an init parameter (smaller = fairer, larger = more
  efficient).
 Net effect:
 - HoL blocking is eliminated (an IPCQ send can interleave with a long
  compute DMA).
 - Compute / comm overlap is natural (NVIDIA copy-engine + compute-SM
  pattern).
 - Matches the NoC-virtual-channel pattern used in real HW.
 **First-implementation accuracy limit (intentional)**: this ADR's
 first cut uses **deterministic chunk-level interleave + weighted
 round-robin arbitration** (default 50 / 50, exposed in `ccl.yaml`).
 This is a first-order approximation and is simpler than real HW
 dynamic-contention / credit-based arbiters. Functional correctness is
 unaffected, but heavy-contention scenarios may report slightly
 optimistic latency vs. real HW. A separate ADR can add a NoC arbiter
 component later if more precision is needed.
 #### Token routing
 - Compute tokens (`TileToken`) — go through the existing
  PE_FETCH_STORE → PE_DMA chain.
 - Communication tokens (`IpcqDmaToken`, new) — PE_IPCQ → PE_DMA
  self-routing.
 - PE_DMA picks the channel by token type.
 ```python
 class PeDmaComponent:
    def _process(self, env, token):
        if isinstance(token, IpcqDmaToken):
            yield from self._vc_comm_process(env, token)
        else:
            yield from self._vc_compute_process(env, token)
 ```
 ### D9. Pointer synchronization — DMA payload piggyback
 Real HW (NVLink, UCIe, etc.) piggybacks metadata onto DMA payloads so
 pointers update along with the data. This simulation adopts the same
 model: **no separate control channel** — metadata travels with the
 data.
 The big benefits:
 - **Automatic ordering**: data and metadata move on the same token, so
  data is visible **no later than** the `peer_head_cache` update (both
  become visible in the same SimPy step). No race.
 - **HW fidelity**: matches NVLink / UCIe piggybacked headers.
 - **Component simplification**: no separate `IpcqPtrUpdate` event type.
 #### Send flow (head update via piggyback)
 ```
 PE A: tl.send(E, src_addr, nbytes)
  1. PE_IPCQ checks backpressure (using peer_tail_cache)
  2. PE_IPCQ creates an IpcqDmaToken:
       - data body (src_addr → peer dst_addr)
       - piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction)
  3. Hand the token to PE_DMA(vc_comm)
  4. PE A increments my_head (send tracking)
 [fabric DMA: latency elapses]
 PE B's PE_DMA receives the token
  5. Writes data into dst_addr (B's rx slot) via MemoryStore.write
  6. Forwards token metadata to PE B's PE_IPCQ (PE-internal wire, ~1 cycle)
 PE B's PE_IPCQ receives the metadata
  7. Updates peer_head_cache (= A's head)
  8. Wakes any pending recv on that direction
 ```
 **Steps 5 and 6 must execute in the same SimPy step** — DMA completion
 makes data and metadata atomically visible.
 #### Recv flow (credit return — fast path with bottleneck-BW latency)
 When the receiver frees a slot, the sender must learn about it
 (backpressure release). Unlike data, the credit return does **not**
 travel through general vc_comm fabric — it uses a **separate fast
 path**, an abstraction of the NVLink / UCIe credit-return wire.
 **Latency** is computed from the **bottleneck BW on the path**, not a
 magic constant:
 ```
 credit_size_bytes = 16  (ccl.yaml: ipcq_credit_size_bytes)
 path = router.find_path(self_pe, peer_pe)
 latency = compute_drain_ns(path, credit_size_bytes)
        = credit_size_bytes / bottleneck_bw_on_path
 ```
 That gives us:
 - **Topology-proportional approximation**: an in-cube credit return is
  automatically faster than a cross-SIP credit return.
 - **No magic constants**: no arbitrary `ipcq_ctrl_latency_ns`.
 - **No deadlock risk**: unlike piggyback, B can issue credit even when
  it has no data to send back.
 - **Reuses existing utility**: `ComponentContext.compute_drain_ns`.
 #### Component coupling — SimPy Store channel
 PE B's PE_IPCQ does not call PE A's PE_IPCQ directly. Instead, at init
 time, **a SimPy Store is wired between the two** (a per-direction
 fast-path channel) and credit metadata is `put` into that store.
 ```python
 class PeIpcqComponent:
    def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns):
        yield env.timeout(latency_ns)
        yield peer_credit_store.put(IpcqCreditMetadata(seq=my_tail, ...))
 ```
 Backend init wires both directions of the fast-path channel as part of
 fan-out (see `IpcqInitMsg` in D12).
 #### Credit-return fast path limitations
 - `credit_size_bytes` is an estimate (typically 16–64 bytes).
 - The fast path is **excluded from vc_comm BW contention** (separate
  wire). Real HW credit-return wires are very lightweight, so this is a
  reasonable first approximation.
 - A follow-up ADR can: model the credit fast path as a separate link
  (BW limit + contention), or switch to piggyback (`credit_return_mode:
  piggyback`).
 #### PE_DMA's added responsibility
 When `vc_comm` receives a token, PE_DMA processes it as the following
 **atomic** sequence. **No SimPy yield is allowed between the two steps**
 (invariant I6):
 ```python
 def _on_vc_comm_recv(self, env, token):
    # ── ATOMIC: no yield between these two operations ──
    # 1. Copy the payload from the source into the receiver's rx slot
    data = self._memory_store.read(token.src_space, token.src_addr,
                                   shape=..., dtype=...)
    self._memory_store.write(token.dst_endpoint.buffer_kind,
                             token.dst_addr, data)
    # 2. Forward metadata to the local PE_IPCQ
    yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token))
    # ───────────────────────────────────────────────────
 ```
 The final `put` is yieldable but uses an unbounded internal store, so
 it completes in a single step. That `put` is the closing call of the
 atomic block; no other `yield` (or `yield from`) may be inserted
 between the `MemoryStore.write` and that `put`.
 ### D9.5. ADR-0020 (2-pass) integration
 `tl.send` / `tl.recv` integrates with ADR-0020's two-pass model. Phase
 1 simulates timing **and** moves data via MemoryStore; Phase 2 enables
 op-log-based correctness verification.
 #### Phase 1 (timing + data)
 D9 models head and tail updates with two different mechanisms:
 - **Send-side (head update)** — DMA payload piggyback. Data write and
  metadata forward happen in the same SimPy step → automatic atomic
  visibility.
 - **Recv-side (tail credit return)** — fast-path SimPy Store channel
  with bottleneck-BW latency, then `peer_tail_cache` update.
 Together they preserve ring-buffer pointer consistency.
 The op-log records `op_kind="ipcq"` entries for sends (with
 `src/dst/space/addr/nbytes/dir/dtype/shape/sender_seq`) and recvs (with
 `recv_mode/src/dst/space/addr/nbytes/dir/dtype/shape/consumer_seq`).
 Two recv modes:
 - **`return_slot`** (default): the slot address is returned to the
  kernel. Zero-copy.
 - **`copy_to_dst`**: when the kernel passes `dst_addr` + `dst_space`,
  PE_IPCQ copies the slot data into the user dst.
 #### Phase 2 (op_log replay)
 When `DataExecutor` encounters an `op_kind="ipcq"` record:
 - **send**: idempotent `src → dst` ndarray write.
 - **recv (`return_slot`)**: no-op (the slot already holds the data).
 - **recv (`copy_to_dst`)**: idempotent `slot → dst_addr` copy.
 IPCQ ops are pure data movement — Phase 2 has nothing extra to compute.
 The downstream GEMM / Math ops in `DataExecutor` will consume the data
 and naturally validate correctness.
 ### D10. Host CCL init keeps the PyTorch shape
 The host code looks just like real PyTorch DDP. `init_process_group`
 creates the backend object; it does **not** receive IPCQ knobs
 (neighbor topology, buffer_kind, backpressure …).
 ```python
 # benches/ccl_allreduce.py — same shape as real PyTorch
 def worker(rank, world_size, torch):
    dist = torch.distributed
    dist.init_process_group(backend="ahbm")  # reads ccl.yaml + topology
    tensor = torch.zeros((1, world_size * N_ELEM), dtype="f16", dp=...)
    tensor.copy_(torch.from_numpy(init))
    dist.all_reduce(tensor, op="sum")
 ```
 The IPCQ configuration is decided by the backend at
 `init_process_group` time: it loads `ccl.yaml`, picks the algorithm,
 and pushes IPCQ neighbor tables to every participating PE_IPCQ. The
 host code never has to know about IPCQ.
 A bench runs one algorithm, chosen via `ccl.yaml`'s `defaults.algorithm`.
 Switching algorithms is purely a `ccl.yaml` change — no host edits
 required.
 #### Init flow (eager)
 1. `init_process_group(backend="ahbm")` is called.
 2. Backend loads `ccl.yaml` → resolves `defaults.algorithm`.
 3. Pulls topology + buffer_kind + backpressure + slot config from
   `algorithms[<algo>]`.
 4. **Immediately** installs neighbor tables on every PE_IPCQ
   (sideband or fabric `IpcqInitMsg`).
 5. Subsequent `torch.launch(kernel_name, ...)` calls behave normally —
   PE_IPCQ is already prepared whether the kernel is a CCL kernel or
   not.
 ### D11. CCL config file (`ccl.yaml`)
 IPCQ config and algorithm metadata live in a separate YAML file,
 following the same pattern as `components.yaml` and `topology.yaml`.
 A single benchmark execution runs one algorithm
 (`defaults.algorithm`). Switching algorithms means editing
 `defaults.algorithm` only.
 ```yaml
 defaults:
  algorithm: ring_allreduce_tcm
  buffer_kind: tcm                # tcm | hbm | sram
  backpressure: sleep             # poll | sleep
  n_slots: 8
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16
 algorithms:
  ring_allreduce_tcm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d             # builtin name or "custom"
    buffer_kind: tcm
    n_elem: 8                     # optional, per-algorithm tile width
  tree_allreduce_7:
    module: kernbench.ccl.algorithms.tree_allreduce
    topology: tree_binary
    buffer_kind: tcm
    world_size: 7                 # algorithm-level override
    n_elem: 16
  custom_mesh:
    module: kernbench.ccl.algorithms.custom_mesh
    topology: custom              # the module supplies its own neighbors()
 ```
 `world_size` is **not set in `defaults`**. The backend resolves it via:
 `algorithm-level override > defaults override > topology spec`. The
 last fallback (`sips × cubes_per_sip × pes_per_cube`) mirrors real DDP
 where `WORLD_SIZE` comes from env vars rather than config files.
 #### Algorithm module structure
 Each algorithm module exports two hooks — `kernel` (required) and
 `neighbors` (optional) — plus a `kernel_args` helper that the
 backend uses to populate positional kernel arguments at `all_reduce`
 time:
 ```python
 # src/kernbench/ccl/algorithms/ring_allreduce.py
 def kernel_args(world_size: int, n_elem: int) -> tuple:
    return (n_elem, world_size)
 def kernel(t_ptr, n_elem, world_size, tl):
    """Required — the PE kernel.
    IPCQ is already installed by the backend before this is called.
    The kernel only uses the four-direction send / recv API.
    """
    ...
 def neighbors(rank, world_size, neighbor_map):
    """Optional — override the builtin topology's neighbor map.
    Returns a new dict, the modified-in-place dict, or None to keep the
    builtin map.
    """
    return None
 ```
 #### `neighbors` override patterns
 - **Pattern A — tweak a builtin**: drop a direction for some ranks, etc.
 - **Pattern B — replace entirely**: ignore `neighbor_map` and return a
  brand-new dict.
 - **Pattern C — keep builtin**: omit `neighbors` or return None.
 #### Builtin topologies
 | topology | direction set |
 |----------|---------------|
 | `ring_1d` | E, W |
 | `ring_1d_unidir` | E only |
 | `mesh_2d` | N, S, E, W |
 | `tree_binary` | parent, child_left, child_right |
 | `none` | (empty) — algorithm must supply `neighbors()` |
 #### Adding a new algorithm
 1. Write `kernel` and `kernel_args` in
   `src/kernbench/ccl/algorithms/<algo>.py`.
 2. Add an entry in `ccl.yaml`'s `algorithms` section.
 3. (Optional) provide `neighbors()` for custom topology.
 4. Set `defaults.algorithm` to the new algorithm.
 The host bench (`benches/ccl_allreduce.py`) does not change.
 ### D12. Message / token schema
 The new message types added by this ADR. They live in
 `src/kernbench/common/pe_commands.py` and
 `src/kernbench/runtime_api/kernel.py`.
 #### `IpcqInitMsg` (sideband, fan-out at init)
 The backend pushes neighbor tables to every PE_IPCQ. Structure mirrors
 `MmuMapMsg` (`target_sips`, `target_cubes`, `target_pe`, `entries`).
 Each `IpcqInitEntry` has `direction`, `peer: IpcqEndpoint`,
 `my_rx_base_pa/va`, `n_slots`, `slot_size`, plus a `peer_credit_store`
 field — a `simpy.Store` instance pre-wired so the local PE_IPCQ (the
 data receiver, acting as credit sender) can push `IpcqCreditMetadata`
 directly into the peer PE_IPCQ's input queue (the data sender's).
 #### `IpcqSendCmd` (PE_CPU → PE_IPCQ)
 Carries `direction`, source addr/space, nbytes, shape, dtype, and a
 handle id. `data_op=True` so it lands in the op_log.
 #### `IpcqRecvCmd` (PE_CPU → PE_IPCQ)
 Carries `direction` (or None for round-robin), `recv_mode`
 (`return_slot` / `copy_to_dst`), optional `dst_addr/dst_space`, shape,
 dtype, blocking flag.
 #### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm channel)
 Per D9 piggyback: the token carries the data (`src/dst/space/nbytes`)
 plus the head metadata (`sender_seq`, `src_sip/cube/pe`,
 `src_direction`). PE_DMA picks the channel by token type
 (`IpcqDmaToken → vc_comm`, `TileToken → vc_compute`).
 The receiver's PE_DMA, on token arrival, performs the I6 atomic
 sequence: write data into MemoryStore, then forward `IpcqMetaArrival`
 to the local PE_IPCQ.
 #### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path)
 Carries `consumer_seq` (= my_tail), source PE coords, and source
 direction. Travels through the dedicated SimPy Store channel rather
 than `vc_comm`. Latency = `credit_size_bytes / bottleneck_bw_on_path`.
 There is **no `IpcqPtrUpdate` event** — head updates flow via D9
 piggyback, tail updates via the D9 fast-path channel.
 ### D13. Test strategy
 Following the ADR-0021 D8 pattern.
 #### T1. Unit tests (component-level)
 - **PE_IPCQ** (`tests/test_pe_ipcq.py`): send without backpressure
  immediately forwards a token; full peer slot triggers backpressure
  (poll / sleep modes); recv waits, wakes on `IpcqMetaArrival`;
  round-robin recv weak fairness; bad direction → `IpcqInvalidDirection`.
 - **PE_DMA virtual channels** (`tests/test_pe_dma_vc.py`): `vc_compute`
  / `vc_comm` independent progress, chunk interleave, BW split.
 - **Builtin topology** (`tests/test_ccl_topologies.py`): ring_1d /
  mesh_2d / tree_binary correctness, mesh_2d non-square →
  `ValueError`, custom resolver returns the module's `neighbors`.
 #### T2. Integration tests (E2E send/recv)
 - **`tests/test_ipcq_e2e.py`**: 2-rank ring, 4-rank ring (bidirectional
  no-deadlock), 4×4 mesh.
 - **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`): greenlet mode
  records `ipcq` ops in op_log; DataExecutor produces correct
  `out.data`.
 #### T3. Backend init (`tests/test_ccl_backend_ipcq.py`)
 `ccl.yaml` load, builtin topology → `IpcqInitMsg` fan-out, endpoint PA
 consistency, per-`buffer_kind` allocation.
 #### T4. Regression
 All existing tests pass; ADR-0020 op_log / DataExecutor unaffected for
 non-CCL benches.
 #### T5. Performance / overhead
 Single send/recv pair latency = (DMA latency) + (IPCQ overhead).
 Should be close to a regular PE_DMA write of the same nbytes (IPCQ
 overhead < 100 ns).
 ### D14. Invariants and failure modes
 #### Invariants
 I1. **Slot lifecycle exactly-once**: one send → exactly one recv.
 I2. **Pointer monotonicity**: `my_head` / `my_tail` are monotonically
   non-decreasing; `sender_seq` is strictly increasing.
 I3. **Endpoint consistency**: if rank A's `direction=E` peer is rank
   B, then rank B's reverse-direction peer must be rank A. Verified at
   init.
 I4. **`buffer_kind` consistency**: all PEs in a process group share
   the same `buffer_kind` (no mixed mode in the first cut).
 I5. **op_log ordering**: send → DMA complete → recv possible. The
   t_start order in op_log respects this causality.
 I6. **Atomic data + metadata visibility (MUST)**: at the receiver
   side, data write (`MemoryStore.write`) and metadata forward
   (`peer_head_cache` update) **must execute in the same SimPy step**.
   No yield is allowed between the two operations in PE_DMA's vc_comm
   handler. Code review must reject any inserted `yield` (or `yield
   from`) — it would open a window in which `peer_head_cache` is
   visible without the slot data, or the data without the pointer.
 I7. **MemoryStore slot existence ↔ pointer**: as a consequence of I6,
   the step in which `peer_head_cache > my_tail` first becomes true is
   the same step in which the slot data becomes observable.
 #### Failure modes (runtime errors)
 F1. **Bad direction**: `tl.send(dir="X")` for an uninstalled direction
   → `IpcqInvalidDirection`, simulation aborts.
 F2. **Type mismatch**: dtype/shape/nbytes disagreement between matched
   send and recv. Not validated by default; opt-in strict mode catches
   it (`strict_validation: true` in the PE_IPCQ node's attrs).
 F3. **Deadlock detection (timeout-based)**: the simulator empties its
   schedule while a send/recv is still pending → engine raises
   `IpcqDeadlock` and embeds a pointer dump.
 F4. **Backend init failure**: missing `defaults.algorithm`, missing
   `algorithms[name]`, module import failure, topology validation
   failure (I3, I4) — all raised at `init_process_group` time.
 F5. **Slot full + infinite backpressure**: the peer never recvs.
   Surfaces as F3 timeout.
 #### Diagnostics
 - **CCL trace**: `KERNBENCH_CCL_TRACE=1` logs each send/recv as
  `(rank, t, dir, nbytes)`.
 - **Pointer dump**: `kernbench.ccl.diagnostics.pointer_dump(engine)`
  prints every PE_IPCQ ring buffer's `my_head`, `my_tail`,
  `peer_head_cache`, `peer_tail_cache`.
 - **Deadlock dump**: on hang the engine includes the pointer dump in
  the `IpcqDeadlock` exception message.
 ### D15. Algorithm-author cheat sheet
 Full step-by-step lives in
 [`docs/ccl-author-guide.en.md`](../ccl-author-guide.en.md). The
 shortest version:
 | Things you touch | Things you don't |
 |------------------|-------------------|
 | `src/kernbench/ccl/algorithms/<your_algo>.py` (`kernel`, `kernel_args`, optional `neighbors`) | `benches/ccl_allreduce.py` host code |
 | One entry in `ccl.yaml` + optionally `defaults.algorithm` | `src/kernbench/ccl/` framework |
 | (Optional) `tests/test_<your_algo>.py` mock test | PE_IPCQ component, AhbmCCLBackend |
 5-step flow: write the kernel → register in `ccl.yaml` → optional
 `neighbors` override → optional mock unit test → SimPy validation via
 `kernbench run --bench ccl_allreduce --verify-data`.
 Common mistakes: using a direction that wasn't installed, sends
 without matching recvs (deadlock), dtype/shape disagreement, assuming
 fairness from `tl.recv()` round-robin, confusing
 `tl.num_programs(axis)` with the CCL group size.
 ---
 ## Non-goals
 - **Host collective**: a model where `dist.all_reduce` itself moves
  data on the host side is out of scope. This ADR only covers
  communication that happens inside the PE kernel.
 - **All-reduce algorithms**: ring / tree / etc. live in algorithm
  modules and can be added without amending this ADR.
 - **Reliability / error handling**: link faults, send/recv failure
  recovery, etc. are out of scope.
 - **NoC arbiter precision**: dynamic VC contention is left for a future
  ADR (see D8).
 ---
 ## Open questions
 - **VC arbitration accuracy** — the first cut uses deterministic
  chunk interleave + weighted round-robin; heavy contention may report
  optimistic latency. A NoC arbiter component can be added later.
 - **Credit return BW model** — the fast path is currently outside the
  fabric BW contention model. Can be modeled as a separate link or
  switched to piggyback (`credit_return_mode: piggyback`).
 - **Ring buffer slot allocation metadata** — whether the host pushes
  IPCQ buffer metadata via sideband or via a fabric message similar to
  `MmuMapMsg` is open.
 - **VC BW split default** — 50/50 vs. weighted (e.g. 80/20). Exposed in
  `ccl.yaml`; default value TBD.
 - **Direction count** — 4 (N/S/E/W) is fixed in the first cut; 6
  (adding Up/Down for 3D) or a variable number of directions is future work.
 - **Multi-tile aggregation primitives** — whether
  `tl.recv_all` or similar is needed for fan-in.
 - **Round-robin recv fairness** — current weak fairness can starve;
  strict fairness counter is future work.
 - **Deadlock detection precision** — currently timeout-based; a
  realtime wait-for graph would enable deterministic detection.
 ---
 ## Consequences
 ### Positive
 - PE-to-PE direct communication enables CCL kernels to be written.
 - Host stays minimal (just `launch`), synchronization happens inside
  the PE → strong compute / comm overlap.
 - VCs eliminate HoL blocking → collective latency is not blocked by
  compute traffic.
 - Buffer placement and backpressure mode are init-time parameters →
  easy to benchmark.
 - Four-direction logical neighbors → host is free to map
  ring/mesh/tree algorithms.
 ### Negative
 - One new component (PE_IPCQ) and a redesigned PE_DMA (VCs).
 - IPCQ memory cost = 8 rings × `slot_size` × `n_slots` per PE.
 - VC arbitration is a first-order approximation; heavy contention
  scenarios may report slightly optimistic latency vs real HW (D8).
 - Chunk-level interleave makes PE_DMA implementation more complex.
 ---
 ## Affected files
 | File | Change |
 |------|--------|
 | `topology.yaml` | Add `pe_ipcq` to `pe_template`, plus the IPCQ ↔ DMA / CPU / TCM edges. |
 | `components.yaml` | Register `pe_ipcq_v1`. |
 | `src/kernbench/topology/builder.py` | Wire the IPCQ chain into PE-internal edges. |
 | `src/kernbench/components/builtin/pe_ipcq.py` | New. |
 | `src/kernbench/components/builtin/pe_dma.py` | Add VCs, handle `IpcqDmaToken`. |
 | `src/kernbench/common/pe_commands.py` | `IpcqSendCmd`, `IpcqRecvCmd`, `IpcqDmaToken`. |
 | `src/kernbench/triton_emu/tl_context.py` | `tl.send` / `tl.recv` API. |
 | `src/kernbench/runtime_api/distributed.py` | Eager IPCQ install in `AhbmCCLBackend.__init__`. |
 | `src/kernbench/runtime_api/kernel.py` | `IpcqInitMsg` definition. |
 | `src/kernbench/ccl/__init__.py` | New CCL package. |
 | `src/kernbench/ccl/topologies.py` | Builtin topology generators + `resolve_topology()`. |
 | `src/kernbench/ccl/helpers.py` | Algorithm-author helpers (`chunked`, `ring_step`, `tree_step`). |
 | `src/kernbench/ccl/testing.py` | Mock CCL runtime (`run_kernel_in_mock`). |
 | `src/kernbench/ccl/algorithms/*.py` | Algorithm modules (kernel + `kernel_args` + optional `neighbors`). |
 | `ccl.yaml` | Algorithm metadata + IPCQ defaults. |
 | `tests/test_pe_ipcq.py` | PE_IPCQ unit tests. |
 | `tests/test_pe_dma_vc.py` | PE_DMA VC tests. |
 | `tests/test_ipcq_e2e.py` | end-to-end send/recv tests. |
 | `tests/test_ccl_topologies.py` | Builtin topology generator tests. |
 | `tests/test_ccl_allreduce_matrix.py` | Unified bench × algorithm matrix. |
@@ -0,0 +1,990 @@
 # ADR-0024: SIP-level TP Launcher — rank = SIP (host-driven dispatch)
 ## Status
 Proposed (Revision 8 — Hierarchical content split out to ADR-0029)
 ## Context
 ### 목표
 `torch.distributed` collective 호출의 참여 단위(rank)를 **SIP**(device)
 경계에 맞춘다. 실제 PyTorch DDP/TP 스크립트와 **호스트 레벨에서 구분 없이**
 읽히는 bench 코드를 목표로 한다.
 real PyTorch와 비교:
 | 차원 | real PyTorch | KernBench (이 ADR 이후) |
 |---|---|---|
 | 프로세스 모델 | N개 프로세스, 각 1 GPU | 1 프로세스, N greenlet, 각 1 SIP |
 | `get_rank()` | `RANK` env var | greenlet-local 레지스트리 |
 | `get_world_size()` | `WORLD_SIZE` env var | topology의 SIP 수 |
 | `torch.cuda.set_device(r)` (real) / `torch.ahbm.set_device(r)` (KernBench) | rank → GPU | rank → SIP |
 | `mp.spawn` | OS 프로세스 fork | greenlet fan-out |
 ### 설계 원칙 — 공개 API의 추상화, 내부는 기존 path 활용
 **공개 API (bench worker) 수준의 추상화**:
 ```
 rank = SIP
 DPPolicy = intra-device (cube × PE) 분산만
 dist.all_reduce, torch.ahbm.set_device, mp.spawn 등 PyTorch-style 표면
 ```
 **Framework 내부 구현**:
 ```
 build_install_plans (host): topology + mapper + algorithm → SipInstallPlan
  ↓
 backend (host): plan의 per-PE spec을 engine.submit으로 IpcqInitMsg 디스패치
  ↓
 engine: 기존 PE-scoped routing (MmuMapMsg 등과 동일 경로)
  ↓
 PE_IPCQ: 자체 message loop에서 IpcqInitMsg 처리 (기존 capability)
 ```
 **핵심**: 새 message 타입이나 IO_CPU 확장 없음. 기존 engine routing과 기존
 `IpcqInitMsg` 타입을 그대로 사용. 기존의 "sideband direct call" 우회만
 제거하여 convention 일원화.
 ### 현재 상태
 - `DistributedContext` facade 존재
 - `init_process_group("ahbm")` → `AhbmCCLBackend`가 `ctx.install_ipcq` 호출
  → `ccl/install.py`가 **sideband direct call** (`pe_ipcq._install_neighbors`)로
  PE_IPCQ에 neighbor table 설치
 - `get_rank()` 항상 `0` (single-driver)
 - `get_world_size()` fallback: 총 PE 수 (rank = PE)
 - `benches/ccl_allreduce.py`: `worker(rank=0, world_size=total_PEs)` 1회 호출
 ### 풀어야 할 문제
 1. **공개 API에서 rank = SIP** — bench worker가 PE 개념을 알지 않도록.
 2. **Multi-worker 실행** — N개 rank가 독립 worker 코드 실행. 1 프로세스 제약
   하에서 greenlet + barrier 동기화.
 3. **Cross-rank collective submit 동기화** — 첫 rank가 혼자 wait하면 peer 부재로
   SimPy deadlock. 모든 rank submit 후 drain 보장.
 4. **기존 sideband install 제거** — IpcqInitMsg를 engine.submit으로 일원화.
   MmuMapMsg 등 다른 control-plane 메시지와 동일 패턴.
 5. **Algorithm / mapper / validator 분리** — 알고리즘 모듈은 kernel 코드만
   담고, topology / mapping / validation은 registry + 선언.
 ### Non-problem (이 ADR 밖)
 - IPCQ direction addressing fix → **ADR-0025**
 - `DPPolicy.sip`/`num_sips` 제거 → **ADR-0026**
 - Megatron-style TP → **ADR-0027**
 - DTensor → **ADR-0028 (future)**
 - **IO_CPU를 SIP-level control-plane 단일 endpoint로 승격**: 이 ADR에서는
  invariant으로 채택하지 않음. 현재 KernBench에 해당 원칙이 없고, 단독으로
  도입하기엔 정당화가 약함. 미래에 control-plane latency 모델링 정밀도 요구가
  생기면 별도 ADR.
 ### TODO (이 ADR 구현 이후)
 - Tensor Parallelism (ADR-0027)
 - Hierarchical all-reduce 알고리즘 설계 (ADR-0029) — 본 ADR의 mapper /
  validator registry 인프라를 활용하는 첫 사례
 ---
 ## Decision
 ### D1. rank = SIP (world_size 해석)
 ```python
 def _resolve_world_size(self) -> int:
    if "world_size" in self._merged:
        return int(self._merged["world_size"])
    defaults = self._cfg_all.get("defaults", {})
    if "world_size" in defaults:
        return int(defaults["world_size"])
    spec = self.ctx.spec or {}
    return int(spec.get("system", {}).get("sips", {}).get("count", 1))
 ```
 우선순위: 알고리즘 override > defaults override > SIP count. `ccl.yaml`
 override는 legacy "rank = PE" 테스트 경로로 유지.
 ### D2. Install 경로 — engine.submit 일원화
 `ccl/install.py`의 sideband direct call을 제거하고, `IpcqInitMsg`를
 `engine.submit`으로 보낸다. MmuMapMsg / MemoryWriteMsg 등이 이미 동일 패턴.
 ```python
 # Backend (AhbmCCLBackend.__init__ 또는 init_process_group 시점)
 from kernbench.ccl.install_plan import build_install_plans
 plans = build_install_plans(
    world_size=self._world_size,
    algorithm=self._merged["algorithm"],
    algorithm_config=self._merged,
    spec=self.ctx.spec,
 )
 self._plans = plans
 # Each PE_IPCQ가 자기 neighbor table을 받도록 engine 경유 submit
 handles = []
 for plan in plans:
    for pe_install in plan.pe_installs:
        h = self.ctx.submit(IpcqInitMsg(
            correlation_id=self.ctx.correlation_id,
            request_id=f"ipcq_init_s{plan.sip}c{pe_install.cube}p{pe_install.pe}",
            target_sips=(plan.sip,),
            target_cubes=(pe_install.cube,),
            target_pe=pe_install.pe,
            entries=pe_install.neighbors,
            buffer_kind=plan.buffer_kind,
            n_slots=plan.n_slots,
            slot_size=plan.slot_size,
            # ... (기존 IpcqInitMsg 필드)
        ))
        handles.append(h)
 # Eager install — init_process_group이 반환하기 전에 완료 보장
 for h in handles:
    self.ctx.wait(h)
 ```
 **PE_IPCQ 컴포넌트**는 이미 `IpcqInitMsg`를 main loop에서 처리 (`pe_ipcq.py`
 라인 145-147). 변경 불필요. 유일한 차이는 "message가 sideband Python call이
 아니라 engine queue를 거쳐 도착한다"는 점.
 **Correctness invariant (equivalence)**: `init_process_group()`은 모든
 install handle을 `wait()`한 후 반환하므로 launch-before-install 문제는
 구조적으로 없다. 남는 correctness 질문은 단 하나:
 > Engine-routed `IpcqInitMsg` 처리가 기존 sideband
 > `pe_ipcq._install_neighbors(msg)` 호출과 **동일한 최종 PE_IPCQ 상태**를
 > 생성하는가.
 검증 포인트 (T3 참고):
 1. **State equivalence**: `_install_neighbors()` 내부 상태 전이가 engine
   dispatch path에서도 동일하게 일어나 최종 PE_IPCQ state
   (`_queue_pairs`, `_installed`, `_credit_inbox` 등)가 일치.
 2. **Sideband-only side effect 부재**: Sideband path에서만 있던 부수 효과가
   없음 (예: engine.submit이 설정하는 request_id / correlation tracking 등이
   install semantics를 왜곡하지 않음).
 3. **Ordering independence**: 서로 다른 PE들의 install message가 engine
   큐에서 임의 순서로 처리되어도 최종 상태가 동일. 즉 install은 **PE별
   독립 연산**이어야 하고, cross-PE 순서 의존성이 있으면 안 됨.
 4. **Idempotency**: 동일 PE에 대해 `IpcqInitMsg`가 두 번 도착하면? 현재
   설계 전제는 "per-PE 단 한 번 install". 중복 install 시 동작은 정의되지
   않음. 보수적 정책:
   - 최초 install 시 `_installed = True`로 전이
   - 이후 중복 install msg는 **에러** (raise) 또는 **silent idempotent**
     (no-op) 둘 중 하나로 명시
   - Recommend: **raise** (명시적 에러 → 버그 조기 검출). T3에 duplicate
     install 케이스 추가.
 5. **Partial install visibility**: 일부 PE만 install 완료된 중간 상태가
   외부에 observable한가? 현재 구조에서는 `init_process_group()`의 eager
   wait-all이 barrier 역할을 하므로 partial state는 bench 코드에 노출되지
   않음. 단, debugging / introspection API는 중간 상태를 볼 수 있음 (문제
   아님, 문서화만).
 **Timing 영향**: Engine-routed install은 `init_process_group()`이 SimPy 시간을
 소비하게 만든다. 기존 sideband install은 사실상 zero-cost. ADR 계약:
 > Benchmarks must not rely on zero-cost initialization.
 > `init_process_group()` consumes simulated time proportional to the number
 > of participating PEs × per-PE install latency. First collective call
 > starts at a well-defined but non-zero sim time.
 ### D3. Launch 경로 — non-CCL 커널과 동일 primitive
 **CCL 커널은 non-CCL 커널과 동일한 `KernelLaunchMsg` submission path를 쓴다.**
 Engine 내부의 IO_CPU/M_CPU transit 같은 것은 **기존 구현 세부이지 CCL-specific
 장치가 아님**. Backend는 plan의 `participating_pes` 목록을 돌면서 `KernelLaunchMsg`를
 submit할 뿐이다. 새 메시지 타입 없음, 새 라우팅 경로 없음.
 ```python
 # AhbmCCLBackend.all_reduce
 def all_reduce(self, tensor, op="sum"):
    if op != "sum":
        raise NotImplementedError(...)
    if tensor._handle is None or not tensor._handle.shards:
        raise RuntimeError(...)
    # Validator — global handle 기준 (D8)
    validator_name = self._merged.get("validator")
    if validator_name:
        resolve_validator(validator_name)(tensor._handle, self._world_size, self.ctx.spec)
    rank = self.ctx.distributed.get_rank()
    plan = self._plans[rank]
    tensor_view = _tensor_slice_for_sip(tensor._handle, plan.sip)
    # Plan에서 kernel args 계산 (host-side)
    import importlib
    mod = importlib.import_module(plan.kernel_module)
    n_elem = tensor_view.shards[0].nbytes // tensor.itemsize
    kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
                             **plan.kernel_config)
    def _submit():
        out = []
        for (cube, pe) in plan.participating_pes:
            h = self.ctx.submit(KernelLaunchMsg(
                correlation_id=self.ctx.correlation_id,
                request_id=f"allreduce_r{rank}_c{cube}p{pe}",
                kernel_ref=KernelRef(name=plan.algorithm_name, kind="builtin"),
                args=(_tensor_arg_for_pe(tensor_view, cube, pe), *kargs),
                target_sips=(plan.sip,),
                target_cubes=(cube,),
                target_pe=pe,
            ))
            out.append(h)
        return out
    self._barrier.submit_and_drain(self.ctx, rank, _submit)
 ```
 ### D4. Algorithm ABI — 얇게 + 명시적 arg 계약
 각 알고리즘 모듈은 **kernel + kernel_args만 필수**.
 ```python
 # src/kernbench/ccl/algorithms/ring_allreduce.py
 def kernel(t_ptr, n_elem, world_size, tl):
    """PE-side kernel code.
    Signature convention: first positional arg is the tensor pointer
    (per-PE slice), subsequent positional args are whatever
    kernel_args() returns. `tl` is injected by the TLContext runtime.
    """
 def kernel_args(*, n_elem: int, world_size: int, **kw) -> tuple:
    """Return the tuple of non-tensor positional args.
    Signature contract:
    - Called keyword-only with n_elem and world_size plus kernel_config.
    - Returns a tuple (possibly empty) of scalar / metadata args.
    - The backend constructs the final KernelLaunchMsg.args as:
          (per_pe_tensor_arg, *kernel_args(...))
      where per_pe_tensor_arg is a TensorArg containing only the shards
      local to the receiving PE (derived from tensor_view).
    """
    return (n_elem, world_size)
 ```
 **Arg assembly in backend (reference)**:
 ```python
 # AhbmCCLBackend.all_reduce (D3에서 발췌)
 kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
                         **plan.kernel_config)
 for (cube, pe) in plan.participating_pes:
    pe_tensor_arg = _tensor_arg_for_pe(tensor_view, cube, pe)
    self.ctx.submit(KernelLaunchMsg(
        args=(pe_tensor_arg, *kargs),       # tensor first, then kernel_args return
        target_sips=(plan.sip,),
        target_cubes=(cube,),
        target_pe=pe,
        ...
    ))
 ```
 **ccl.yaml**에서 선언적 metadata:
 ```yaml
 algorithms:
  ring_allreduce_tcm:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d             # kernbench/ccl/topologies.py
    mapper: leader_only           # kernbench/ccl/mappers.py (신규)
    validator: single_shard_per_rank   # kernbench/ccl/validators.py (신규)
    buffer_kind: tcm
    n_elem: 8
 ```
 - `topology` (필수)
 - `mapper` (선택, default `"leader_only"`)
 - `validator` (선택)
 알고리즘 모듈 자체에는 mapper/validator/participating_pes/neighbor
 생성기가 **들어가지 않음**.
 ### D5. Mapper + validator — registry key **또는** import path
 Host-side framework가 built-in registry 제공. 커스텀 확장은 dot-import path.
 ```python
 # src/kernbench/ccl/mappers.py (new)
 Mapper = Callable[[dict, int], list[tuple[int, int]]]
 def leader_only(spec, rank):
    """Single leader PE per SIP. Ring/tree/mesh용."""
    return [(0, 0)]
 def all_pes(spec, rank):
    """Every PE in the SIP. 알고리즘이 intra-SIP 전체 PE를 참여시킬 때 사용
    (e.g. intra-SIP reduction, intra-SIP broadcast, hierarchical collective
    의 낮은 레벨 등)."""
    cm = spec["sip"]["cube_mesh"]
    pl = spec["cube"]["pe_layout"]
    n_cubes = cm["w"] * cm["h"]
    n_pes = pl["pe_per_corner"] * len(pl["corners"])
    return [(c, p) for c in range(n_cubes) for p in range(n_pes)]
 MAPPER_REGISTRY = {"leader_only": leader_only, "all_pes": all_pes}
 def resolve_mapper(key_or_path: str) -> Mapper:
    if key_or_path in MAPPER_REGISTRY:
        return MAPPER_REGISTRY[key_or_path]
    if "." in key_or_path:
        import importlib
        mod_path, fn_name = key_or_path.rsplit(".", 1)
        return getattr(importlib.import_module(mod_path), fn_name)
    raise ValueError(f"unknown mapper: {key_or_path!r}")
 ```
 Validator도 동일 패턴 (`src/kernbench/ccl/validators.py`). 입력은 **global
 TensorHandle** (D8 참고).
 ### D6. Host-side install plan builder
 ```python
 # src/kernbench/ccl/install_plan.py (new; 기존 install.py의 재구성)
 from dataclasses import dataclass
 from typing import Any, Mapping
@dataclass(frozen=True)
 class NeighborTableEntry:
    direction: str
    peer_direction: str       # ADR-0025
    peer_sip: int
    peer_cube: int
    peer_pe: int
    rx_base_pa: int
    # ... 기타 IPCQ 설정 ...
@dataclass(frozen=True)
 class PeInstallSpec:
    cube: int
    pe: int
    neighbors: tuple[NeighborTableEntry, ...]
@dataclass(frozen=True)
 class SipInstallPlan:
    algorithm_name: str                  # human-readable ("ring_allreduce_tcm")
    sip: int
    rank: int
    world_size: int
    pe_installs: tuple[PeInstallSpec, ...]     # per-PE neighbor tables
    buffer_kind: str
    n_slots: int
    slot_size: int
    kernel_module: str
    participating_pes: tuple[tuple[int, int], ...]
    kernel_config: Mapping[str, Any]
 def build_install_plans(
    world_size: int,
    algorithm: str,
    algorithm_config: dict,
    spec: dict,
 ) -> list[SipInstallPlan]:
    """Compose topology + mapper + algorithm into per-SIP plan list."""
    topo_fn = _resolve_topology(algorithm_config["topology"])
    mapper = resolve_mapper(algorithm_config.get("mapper", "leader_only"))
    # kernel_config: launch 시 kernel_args에 전달할 algorithm-specific params
    kernel_config = {
        k: v for k, v in algorithm_config.items()
        if k in {"n_elem", "reduce_op", "chunk_size"} or k.startswith("kernel_")
    }
    plans = []
    for rank in range(world_size):
        sip = rank  # identity mapping (non-identity는 open question)
        pes = mapper(spec, rank)
        pe_installs = _build_pe_installs(
            rank=rank, world_size=world_size, sip=sip,
            pes=pes, topo_fn=topo_fn, algorithm_config=algorithm_config, spec=spec,
        )
        plans.append(SipInstallPlan(
            algorithm_name=algorithm,
            sip=sip, rank=rank, world_size=world_size,
            pe_installs=pe_installs,
            buffer_kind=algorithm_config["buffer_kind"],
            n_slots=algorithm_config["n_slots"],
            slot_size=algorithm_config["slot_size"],
            kernel_module=algorithm_config["module"],
            participating_pes=tuple(pes),
            kernel_config=kernel_config,
        ))
    return plans
 ```
 `_build_pe_installs`는 기존 `ccl/install.py`의 neighbor 계산 로직을 재활용
 (ADR-0025의 `reverse_direction` 개선 반영).
 **Multi-PE 매퍼와 neighbor 생성 책임**: mapper가 SIP 내 여러 PE를 반환하는
 경우 (`all_pes` 등), PE-level neighbor 그래프는 `_build_pe_installs` 내부에
 형성된다. 즉 topology 모듈은 rank-level 관계만 제공하고, PE-level 연결은
 builder에서 풀어낸다. 복잡한 multi-level 패턴을 쓰는 알고리즘은 이 책임
 분산이 관리 부담이 될 수 있음 — 관련 논의는 ADR-0029 참고.
 ### D7. Epoch-based collective barrier
 Cross-rank submit 동기화. 각 collective 호출은 독립 epoch. 같은 rank의
 중복 join은 즉시 에러.
 ```python
 # src/kernbench/runtime_api/distributed.py
@dataclass
 class _EpochState:
    participants: set[int] = field(default_factory=set)
    pending: list = field(default_factory=list)
    drained: bool = False
    returned: int = 0
 class _CollectiveBarrier:
    """Epoch-based barrier.
    Contract:
    - Each call joins the earliest non-drained epoch.
    - Each rank may join a given epoch at most once. Duplicate join raises.
    - Last arriver (participants == world_size) performs drain and advances
      _next_epoch. Earlier arrivers yield and re-check drained on resume.
    - Epoch state is GC'd when returned == world_size (success path).
    - On failure paths, residual state is acceptable; reset() clears it.
    """
    def __init__(self, world_size: int):
        self._world_size = world_size
        self._next_epoch = 0
        self._state: dict[int, _EpochState] = {}
    def submit_and_drain(self, ctx, rank: int, submit_fn) -> None:
        epoch = self._next_epoch
        state = self._state.setdefault(epoch, _EpochState())
        if rank in state.participants:
            raise RuntimeError(
                f"rank {rank} attempted duplicate join to epoch {epoch}"
            )
        state.participants.add(rank)
        handles = submit_fn()
        state.pending.extend(handles)
        is_last = len(state.participants) >= self._world_size
        if is_last:
            for h in state.pending:
                ctx.wait(h)
            state.drained = True
            self._next_epoch = epoch + 1
        else:
            from greenlet import getcurrent
            g = getcurrent()
            if g.parent is None:
                raise RuntimeError("barrier requires a bound worker greenlet")
            while not state.drained:
                g.parent.switch()
        state.returned += 1
        if state.returned >= self._world_size:
            self._state.pop(epoch, None)
    def reset(self) -> None:
        """Explicit cleanup on spawn exception unwinding."""
        self._state.clear()
        self._next_epoch = 0
 ```
 ### D8. Per-rank tensor view + validator contract
 **Validator** (host-side, pre-slice, global handle 기준):
 ```python
 # src/kernbench/ccl/validators.py
 Validator = Callable[[TensorHandle, int, dict], None]
 def single_shard_per_rank(handle, world_size, spec):
    """Ring 계열: 정확히 world_size개 shard, SIP당 1개."""
    if len(handle.shards) != world_size:
        raise ValueError(...)
    per_sip = {}
    for s in handle.shards:
        per_sip[s.sip] = per_sip.get(s.sip, 0) + 1
    if any(c != 1 for c in per_sip.values()):
        raise ValueError(...)
 def multi_pe_sip_local(handle, world_size, spec):
    """Multi-PE per SIP layout: 각 SIP에 intra-SIP PE 수만큼 shard 존재.
    Intra-SIP 전체 PE를 참여시키는 알고리즘이 사용."""
    cm = spec["sip"]["cube_mesh"]
    pl = spec["cube"]["pe_layout"]
    per_sip = cm["w"] * cm["h"] * pl["pe_per_corner"] * len(pl["corners"])
    if len(handle.shards) != world_size * per_sip:
        raise ValueError(...)
 VALIDATOR_REGISTRY = {...}
 def resolve_validator(key_or_path): ...
 ```
 Validator는 world 전체의 shard layout 불변량을 본다. Per-rank view는
 backend가 validator 호출 **후** `_tensor_slice_for_sip`로 생성.
 **Per-rank tensor view** — SIP-local slice:
 ```python
 def _tensor_slice_for_sip(handle, sip) -> TensorArg:
    sip_shards = [s for s in handle.shards if s.sip == sip]
    if not sip_shards:
        raise RuntimeError(f"tensor has no shards on SIP {sip}")
    # Deterministic ordering contract: (cube, pe, offset_bytes) ascending.
    # Multi-PE mappers (hierarchical 등) rely on this ordering to align
    # per-PE tensor arg construction with participating_pes enumeration.
    sip_shards.sort(key=lambda s: (s.cube, s.pe, s.offset_bytes))
    min_offset = min(s.offset_bytes for s in sip_shards)
    local_va_base = handle.va_base + min_offset if handle.va_base else 0
    return TensorArg(
        shards=tuple(TensorArgShard(...) for s in sip_shards),
        va_base=local_va_base,
    )
 ```
 **Ordering invariant**: slice의 shard는 `(cube, pe, offset_bytes)` 오름차순.
 Backend가 `participating_pes`를 iterate하며 `_tensor_arg_for_pe(view, cube, pe)`를
 구성할 때, 결정론적 ordering을 전제할 수 있다. 특히 `all_pes` mapper +
 hierarchical 알고리즘이 per-PE slice 조합을 순서 의존적으로 해석하는 경우에
 중요.
 ### D9. Greenlet-local rank registry (+ debug warning)
 ```python
 class DistributedContext:
    def __init__(self):
        self._backend = None
        self._rank_by_greenlet: dict = {}
    def _bind_rank(self, g, rank: int) -> None:
        self._rank_by_greenlet[g] = int(rank)
    def get_rank(self) -> int:
        self._ensure_initialized()
        from greenlet import getcurrent
        g = getcurrent()
        if g not in self._rank_by_greenlet:
            if os.environ.get("KERNBENCH_DEBUG"):
                warnings.warn(
                    "get_rank() called outside a bound greenlet — returning 0. "
                    "Likely a bug unless running single-driver."
                )
            return 0
        return int(self._rank_by_greenlet[g])
 ```
 ### D10. `torch.ahbm.set_device(rank)` — SIP 바인딩
 KernBench 백엔드 이름은 `ahbm` (ADR-0023 D10). Real PyTorch는
 `torch.cuda.set_device(r)`이지만 우리는 CUDA가 아니므로 honestly-named
 namespace를 사용한다.
 ```python
 class _AhbmNamespace:
    """torch.ahbm — per-greenlet SIP device binding.
    Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. Since
    KernBench's backend is 'ahbm' (not CUDA), we expose the equivalent
    API under ``torch.ahbm`` to avoid pretending to be a CUDA runtime.
    """
    def __init__(self):
        self._device_by_greenlet: dict = {}
    def set_device(self, device: int) -> None:
        from greenlet import getcurrent
        self._device_by_greenlet[getcurrent()] = int(device)
    def current_device(self) -> int | None:
        from greenlet import getcurrent
        return self._device_by_greenlet.get(getcurrent())
 # Attached to RuntimeContext as `self.ahbm = _AhbmNamespace()`.
 # Bench code: `torch.ahbm.set_device(rank)` mirrors `torch.cuda.set_device`.
 ```
 **PyTorch 2.x style 병행 지원**: 최신 PyTorch는 device-agnostic한
 `torch.accelerator` 네임스페이스를 지향 (`torch.accelerator.set_device_index(r)`,
 `torch.accelerator.current_device_index()`). Device vendor에 종속되지 않는
 코드를 쓰려는 사용자를 위해 KernBench도 이 표면을 병행 지원한다.
 ```python
 class _AcceleratorNamespace:
    """torch.accelerator — device-agnostic API (PyTorch 2.x style).
    Aliases torch.ahbm for bench code that prefers device-neutral idiom:
        torch.accelerator.set_device_index(rank)
        torch.accelerator.current_device_index()
    """
    def __init__(self, ahbm: _AhbmNamespace):
        self._ahbm = ahbm
    def set_device_index(self, device: int) -> None:
        self._ahbm.set_device(device)
    def current_device_index(self) -> int | None:
        return self._ahbm.current_device()
 # RuntimeContext
 self.ahbm = _AhbmNamespace()
 self.accelerator = _AcceleratorNamespace(self.ahbm)   # alias
 ```
 Bench 작성자는 다음 중 하나를 선택 — 둘 다 내부적으로 같은 레지스트리를 공유한다 (`torch.accelerator`는 `torch.ahbm`에 위임):
 ```python
 torch.ahbm.set_device(rank)                   # KernBench-native, explicit backend
 torch.accelerator.set_device_index(rank)      # PyTorch 2.x device-agnostic
 ```
 ### D11. Tensor placement = structural (sip, cube, pe) 좌표
 `resolve_dp_policy`가 `target_sip`을 직접 받아 구조적 좌표로 placement 생성.
 세부는 ADR-0026.
 ```python
 # RuntimeContext._create_tensor
 current_sip = self.ahbm.current_device()          # (D10 naming)
 if current_sip is None:
    current_sip = 0  # single-driver fallback (D9와 일관)
 placement = resolve_dp_policy(
    dp, shape=shape_2d, itemsize=itemsize,
    num_pe=eff_num_pe, num_cubes=eff_num_cubes,
    target_sip=current_sip,
 )
 ```
 Post-hoc `pe_index` shifting 제거 — ShardSpec이 `(sip, cube, pe)` 구조적
 좌표 보유.
 ### D12. `torch.multiprocessing.spawn`-compat surface
 Bench 작성자 표면은 real PyTorch `mp.spawn`과 동일:
 ```python
 # src/kernbench/runtime_api/multiprocessing.py (new)
 def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method="spawn"):
    """Drop-in for torch.multiprocessing.spawn.
    Internal: greenlet fan-out + epoch-barrier sync + exception propagation.
    """
    ...
 # torch namespace에 부착
 torch.multiprocessing = SimpleNamespace(spawn=spawn)
 ```
 Bench:
 ```python
 import torch.multiprocessing as mp
 mp.spawn(worker, nprocs=world_size, args=(world_size, torch))
 ```
 ### D13. Scheduler + exception handling
 ```python
 def spawn(fn, args, nprocs, ...):
    dist = torch.distributed
    gs: list[greenlet] = []
    errors: dict[int, Exception] = {}
    for rank in range(nprocs):
        def _entry(r=rank):
            try:
                fn(r, *args)
            except Exception as e:
                errors[r] = e
                raise
        g = greenlet(_entry)
        dist._bind_rank(g, rank)
        gs.append(g)
    try:
        while True:
            alive = [g for g in gs if not g.dead]
            if not alive:
                break
            for g in alive:
                if not g.dead:
                    g.switch()
    except Exception as outer:
        for other in gs:
            if not other.dead:
                try:
                    other.throw(SystemExit)
                except Exception:
                    pass
        # Epoch barrier state 명시적 cleanup
        backend = getattr(dist, "_backend", None)
        if backend is not None and hasattr(backend, "_barrier"):
            backend._barrier.reset()
        raise SpawnException(errors) from outer
 ```
 **Scheduler contract**:
 - Deterministic round-robin over insertion order (rank 0, 1, ..., N-1).
 - 동기화 지점은 epoch barrier (D7)만. Scheduler 순서에 의존하는 correctness 없음.
 - 예외 발생 시 다른 greenlet 강제 종료 + `SpawnException` 전파.
 **Starvation guideline**:
 - 일반적으로 collective barrier가 workers를 동기화. 큰 편차 없음.
 - 극단적 non-collective 루프 대비 cooperative yield 제공:
  `torch.distributed.cooperative_yield()`.
 ### D14. Backward compatibility
 1. **Single-driver 호출**: `get_rank()` 0 반환 (D9).
 2. **`ccl.yaml` world_size override**: D1 fallback 우회 — legacy "rank = PE"
   테스트 경로로 사용 가능.
 3. **`DPPolicy.sip="column_wise"` 명시**: ADR-0026 scope.
 4. **`install_ipcq()` compatibility wrapper**:
 기존 `ccl/install.py`의 `install_ipcq()` API는 곧바로 제거하지 않는다.
 Thin compatibility wrapper로 남겨 기존 직접 호출자가 점진적으로 migration할
 수 있게 한다.
 ```python
 # src/kernbench/ccl/install.py (after this ADR)
 def install_ipcq(engine, spec, merged, *, algo_module=None, rank_to_pe=None):
    """DEPRECATED: legacy host-side PE installer.
    Internally delegates to build_install_plans + engine-routed IpcqInitMsg.
    Use dist.init_process_group() instead.
    """
    from kernbench.ccl.install_plan import build_install_plans
    import warnings
    warnings.warn(
        "install_ipcq() is deprecated; use dist.init_process_group()",
        DeprecationWarning, stacklevel=2,
    )
    plans = build_install_plans(
        world_size=merged.get("world_size", 1),
        algorithm=merged["algorithm"],
        algorithm_config=merged,
        spec=spec,
    )
    handles = []
    for plan in plans:
        for pe_install in plan.pe_installs:
            h = engine.submit(IpcqInitMsg(
                target_sips=(plan.sip,),
                target_cubes=(pe_install.cube,),
                target_pe=pe_install.pe,
                entries=pe_install.neighbors,
                buffer_kind=plan.buffer_kind,
                n_slots=plan.n_slots,
                slot_size=plan.slot_size,
            ))
            handles.append(h)
    for h in handles:
        engine.wait(h)
    return {"world_size": merged.get("world_size", 1), "plans": plans}
 ```
 Migration 스케줄:
 - Phase 1: wrapper로 유지 + DeprecationWarning
 - Phase 2: 직접 호출자 grep-audit → 각각 `dist.init_process_group()` 또는
  `build_install_plans()` 직접 사용으로 이관
 - Phase 3: wrapper 제거 (별도 cleanup ADR 또는 PR)
 ---
 ## Dependencies
 - **ADR-0023** (IPCQ): `IpcqInitMsg` 메시지 타입과 PE_IPCQ 핸들링을 그대로
  활용. Engine-routed submit으로 전환하는 것이 유일한 변경.
 - **ADR-0025** (IPCQ direction fix): `_build_pe_installs`의 neighbor 계산이
  2-rank ring 등에서 정확히 동작하려면 필요.
 - **ADR-0003 / 0016** (IO_CPU): IO_CPU는 기존 transit 역할 그대로. 본 ADR에서
  IO_CPU 역할 변경 없음.
 ---
 ## Non-goals
 - **IPCQ protocol 수정**: ADR-0023 유지.
 - **DPPolicy 필드 정리**: ADR-0026.
 - **Megatron-style TP**: ADR-0027.
 - **Multi-node (프로세스 간)**: 단일 프로세스.
 - **IO_CPU SIP control-plane 단일 endpoint 원칙 채택**: 본 ADR 범위 밖. 현재
  KernBench에 이 원칙이 없고, 도입은 별도 ADR.
 - **Hierarchical all-reduce 알고리즘 설계**: ADR-0029. 본 ADR은 그 알고리즘이
  쓸 framework 인프라 (`all_pes` mapper, `multi_pe_sip_local` validator,
  registry 확장점)만 제공.
 ---
 ## Open questions
 ### 🔴 Critical — 구현 blocker 가능성 (integration 전 반드시 검증)
 - **`IpcqInitMsg`의 engine routing — primary implementation risk**: 현재
  sideband만 쓰여서 engine routing path가 실사용 검증되지 않은 상태. **본
  ADR 전체가 "engine routing이 동작한다"는 가정 위에 서 있다**. 이것이
  실제로 안 되면 D2, D14, T3 등이 전부 영향 받음. 반드시 **ADR 구현 착수
  전 스파이크 검증**:
  - `engine.submit(IpcqInitMsg(target_sips=..., target_cubes=..., target_pe=...))`
    가 PE_IPCQ로 정확히 배달되는지 (기존 `MmuMapMsg` / `MemoryWriteMsg` 라우팅
    패턴과 비교)
  - 미지원 시 minor hook: engine의 message-type → component-kind 매핑 테이블에
    `IpcqInitMsg → "pe_ipcq"` 등록 (localized change, topology builder /
    message schema 영향 없음)
  - 결과에 따라 D2 채택 여부가 달라질 수 있음 — 만약 routing 불가 시 sideband
    path 유지로 fallback 후 본 ADR 범위 재조정
 - **Engine-routed install vs sideband equivalence** (D2 검증점 1-5): T3의
  equivalence test가 실제 동작하는지 스파이크. 특히 ordering independence와
  idempotency는 기존 테스트에 없는 속성이라 신규 검증 필요.
 - **`install_ipcq()` 직접 호출자 audit** (구현 전 필수): deprecated wrapper
  전략은 적절하지만 실제 migration 리스크는 호출자 목록에 따라 다름. 착수 전
  grep audit:
  - Pattern: `install_ipcq(` (cwd 전체)
  - Scope: `src/`, `tests/`, `benches/`, `scripts/`, `src/kernbench/cli/`
  - 각 호출자의 예상 migration path (→ `dist.init_process_group` vs
    `build_install_plans` 직접)를 정리한 후 wrapper 도입
 ### 🟡 Nice-to-have — scope 경계 관련
 - **Install timing 허용치**: SimPy 시간 상 install이 몇 ns~us 소모. 기존
  sideband는 0ns. 기존 테스트가 t=0 시작을 전제로 하는지 확인 (audit 결과에
  따라 테스트 교정 필요).
 - **`IpcqInitMsg` 배치 가능성**: MmuMapMsg처럼 `target_pe="all"` 브로드캐스트
  는 IPCQ에서는 부적합 (PE마다 neighbor가 다름). 현재는 per-PE 개별 submit.
  Per-PE payload를 담는 batched IpcqInitMsg 타입은 future optimization.
 - **`_rank_to_sip` 매핑**: 현재 identity. Non-trivial mapping 요구 시 별도.
 - **Cooperative yield API 위치**: `torch.distributed.cooperative_yield()`로
  노출 예정. 실제 필요성은 Phase 2 이후 벤치 추가 시 판단.
 (PE-level topology 일원화 관련 중장기 방향은 **ADR-0029** 참고 — 복잡한
 multi-level 알고리즘이 driving force가 되는 framework 진화 방향.)
 ---
 ## Test strategy
 ### T1. Launcher infrastructure
 `tests/test_ccl_ddp_launcher.py`:
 - `test_world_size_equals_sip_count` — D1
 - `test_ahbm_set_device_binds_tensor_to_single_sip` — D10/D11
 - `test_get_rank_is_greenlet_local` — D9
 - `test_run_spawns_one_worker_per_rank` — D12/D13
 - `test_get_rank_debug_warning` — D9 warning path
 ### T2. Install plan builder
 `tests/test_ccl_install_plan.py` (new):
 - `build_install_plans` — ring_1d × leader_only 조합 (단일 PE per rank)
 - `build_install_plans` — ring_1d × all_pes 조합 (multi-PE per rank; mapper
  framework 동작 확인, 알고리즘-무관)
 - Mapper / validator registry resolution (built-in key vs import path vs
  unknown)
 - Import path fallback (`"pkg.mod.fn"` 형식) 동작 검증
 ### T3. Engine-routed IpcqInitMsg (equivalence — 핵심 검증)
 `tests/test_ipcq_init_routing.py` (new):
 - **Routing**: `engine.submit(IpcqInitMsg)` → 지정 PE_IPCQ가 실제 설치 수행
 - **Equivalence**: 동일한 IpcqInitMsg를 (a) sideband `_install_neighbors`
  직접 호출, (b) engine.submit 두 경로로 보낸 뒤 PE_IPCQ 최종 state
  (`_queue_pairs`, `_installed` 등) 동일성 비교
 - **Ordering independence**: 서로 다른 PE의 install msg를 engine 큐에 임의
  순서로 넣어도 최종 state가 동일
 - **Idempotency (duplicate install)**: 동일 PE에 두 번 install msg → 두
  번째는 에러 raise (policy: explicit error; D2 검증점 4 참고)
 - **Multi-PE 병렬 install**: per-PE submit이 interference 없이 완료
 - **Install 후 send 성공**: 설치 직후 `IpcqSendCmd` 실행해서 neighbor table
  state가 실제로 유효한지 확인
 ### T4. Barrier correctness
 `tests/test_collective_barrier.py` (new):
 - Single collective 정상
 - 다중 collective 연속 호출 (epoch 격리)
 - 동일 rank의 duplicate join → RuntimeError
 - Rank 1이 all_reduce 전 종료 → SpawnException + barrier.reset()
 - Conditional branch 시 모든 rank 도달하면 정상
 ### T5. E2E
 `tests/test_ccl_allreduce_matrix.py`:
 - `ring_tcm` / `ring_hbm` / `ring_sram` @ ws=SIP_count
 ### T6. 회귀
 기존 `test_ccl_framework`, `test_ccl_install`, `test_ccl_topologies`,
 `test_ccl_mock_runtime`, `test_pe_ipcq`, `test_ipcq_e2e`, 기타 non-CCL
 모두 통과.
 ---
 ## Consequences
 ### Positive
 - **새 message 타입 0개**: 기존 `IpcqInitMsg` + `KernelLaunchMsg`만으로 구현.
 - **IO_CPU / engine 변경 없음**: 기존 routing 그대로 (단, engine이 `IpcqInitMsg`를
   라우팅하지 못하면 minor hook이 필요할 수 있음 — Negative 항목 참고).
 - **Sideband install convention 제거**: MmuMapMsg 등과 동일 패턴으로 일원화.
 - **Plan state stale 문제 소멸**: Plan은 host 단일 소유.
 - **Bench = real PyTorch DDP** (공개 API 관점).
 - **Algorithm ABI 경량**: `kernel` + `kernel_args`만 필수.
 - **Epoch-based barrier**: interleaved collective 안전.
 - **Control/data plane 분리**: data plane(PE_IPCQ)은 ADR-0023 유지, control
  plane은 host-driven.
 - **장기 확장성**: Megatron TP, DTensor 도입을 위한 기반 마련.
 ### Negative
 - 신규 모듈: `install_plan.py`, `mappers.py`, `validators.py`,
  `multiprocessing.py`.
 - Engine이 `IpcqInitMsg`를 엔진-path로 라우팅할 수 있는지 구현 시 확인 필요
  (minor hook 가능성).
 - Install이 SimPy 시간을 소모 (positive로도 볼 수 있으나, 기존 sideband 시점
  0ns 전제인 테스트가 있으면 교정 필요).
 ### Neutral
 - IPCQ PE-level protocol (ADR-0023) 불변.
 - `DPPolicy` 필드 변경은 ADR-0026.
 - IO_CPU 역할 불변 (기존 transit 그대로).
 ---
 ## Affected files
 | File | Change |
 |------|--------|
 | `src/kernbench/runtime_api/distributed.py` | D1/D2/D7/D9: world_size fallback, rank_to_sip, plan 소유, engine-routed install/launch, epoch barrier |
 | `src/kernbench/runtime_api/context.py` | D10/D11: `_AhbmNamespace`, `ctx.ahbm`, `_create_tensor`가 `target_sip` 전달 |
 | `src/kernbench/runtime_api/multiprocessing.py` (new) | D12/D13: `spawn` + scheduler + exception |
 | `src/kernbench/ccl/install_plan.py` (new) | D6: `build_install_plans`, `SipInstallPlan`, `PeInstallSpec`, `NeighborTableEntry` |
 | `src/kernbench/ccl/mappers.py` (new) | D5: `leader_only`, `all_pes`, registry + resolver |
 | `src/kernbench/ccl/validators.py` (new) | D5: validator registry + resolver |
 | `src/kernbench/ccl/install.py` | Thin deprecated compat wrapper (D14) |
 | `src/kernbench/ccl/algorithms/ring_allreduce.py` | D4: `kernel` + `kernel_args` 유지 (큰 변화 없음) |
 | `src/kernbench/ccl/algorithms/mesh_allreduce.py` | D4 동일 |
 | `src/kernbench/ccl/algorithms/tree_allreduce.py` | D4 동일 |
 | `ccl.yaml` | 각 알고리즘에 `mapper` / `validator` 선언 추가 |
 | `src/kernbench/sim_engine/engine.py` | (If needed) `IpcqInitMsg` → PE_IPCQ 라우팅 확인 hook |
 | `benches/ccl_allreduce.py` | 새 launcher 기반 rewrite |
 | `tests/test_ccl_ddp_launcher.py` (new) | T1 |
 | `tests/test_ccl_install_plan.py` (new) | T2 |
 | `tests/test_ipcq_init_routing.py` (new) | T3 |
 | `tests/test_collective_barrier.py` (new) | T4 |
 | `tests/test_ccl_allreduce_matrix.py` | T5: ws=SIP_count 단순화 |
@@ -0,0 +1,365 @@
 # ADR-0025: IPCQ Direction Addressing — address-based matching
 ## Status
 Proposed (Revision 2 — Address-based matching; peer_direction field dropped)
 ## Context
 ### 목표
 ADR-0023의 IPCQ protocol에서 **"어느 direction pair를 통한 전송인가"의 식별**을
 topology / dict-order에 의존하지 않고 **주소 기반**으로 일관되게 한다.
 2-rank bidirectional ring (또는 여러 direction이 동일 peer를 가리키는
 topology 일반)에서 정확히 동작하도록 한다.
 ### 현재 상태 (ADR-0023 D9 구현)
 `src/kernbench/components/builtin/pe_ipcq.py` — `_handle_meta_arrival`:
 ```python
 def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
    token = msg.token
    sender_key = (token.src_sip, token.src_cube, token.src_pe)
    for d, qp in self._queue_pairs.items():
        p = qp["peer"]
        if (p.sip, p.cube, p.pe) == sender_key:
            qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
            # ... wake recv waiters ...
            return
 ```
 `_credit_worker`도 동일한 "sender-coord-first-match" 패턴.
 `src/kernbench/ccl/install.py` — `reverse_direction`:
 ```python
 def reverse_direction(my_rank: int, peer_rank: int) -> str | None:
    for d, target in neighbor_table[peer_rank].items():
        if target == my_rank:
            return d
    return None
 ```
 ### 드러난 버그 — 2-rank bidirectional ring
 `ring_1d(rank, world_size=2)` → `{"E": 1, "W": 1}` (rank 0). 양쪽 방향이 같은 peer.
 **버그 1 (install)**:
 - `reverse_direction(0, 1)` → dict order로 "E" 반환 (틀림, "W"가 맞음 — opposite
  direction convention)
 - rank 0의 E entry가 `peer.rx_base_pa = rx_base(sip1, cube0, pe0, d="E")`로 설정
 - tl.send(E) → data가 sip1의 E-rx buffer로 landing (should be W-rx)
 **버그 2 (runtime)**:
 - 설령 install이 올바른 주소로 설정해도, receiver의 `_handle_meta_arrival`이
  sender 좌표만으로 direction 매칭 → 첫 direction (E) 승
 - peer_head_cache[E] 증가, peer_head_cache[W]는 불변
 - Kernel의 tl.recv(W)는 peer_head_cache[W] 대기 → 영원히 블록 → IpcqDeadlock
 ### 근본 원인
 두 축에서 동일 문제:
 1. **Install-time pairing**: "내 direction과 peer의 어느 direction이 짝인가"
   결정이 dict-iteration-order에 의존 → 여러 direction이 같은 peer를 가리킬 때
   fragile
 2. **Runtime identification**: "어느 qp를 업데이트해야 하는가" 결정이 sender
   좌표만으로 이루어짐 → direction 중복 시 ambiguous
 ### 해결 방향 — address-based matching
 각 PE의 rx buffer는 **direction별로 고유한 주소 range**에 위치 (rx_base_pa +
 direction_idx × bytes_per_direction). 따라서:
 - **Runtime**: sender coord 대신 **dst_addr 범위**로 매칭 → unambiguous
 - **Install**: opposite-direction 우선 선택 heuristic (ring / mesh의 자연스러운
  대칭성)
 - `peer_direction` 같은 이중 메타데이터 불필요 — **주소가 single source of
  truth**
 이 설계는 **PhysAddr 전환 (ADR-0030)과 독립적**으로 작동. 현재 synthetic
 주소든 PhysAddr든 direction별 range 유일성만 지켜지면 동일하게 적용 가능.
 ---
 ## Decision
 ### D1. Install — `reverse_direction` opposite-preference
 `src/kernbench/ccl/install.py`:
 ```python
 _OPPOSITE_DIR = {"E": "W", "W": "E", "N": "S", "S": "N"}
 def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
    """Find peer's direction that reciprocates my_dir→peer_rank.
    Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
    pointing back to us. This matters in 2-rank bidirectional rings
    where both E and W on one side point to the same peer — without
    the preference, the first-match-wins iteration would route data
    into the wrong rx slot. Falls back to any direction pointing back
    for topologies without an opposite convention (tree_binary's
    parent/child).
    """
    nt = neighbor_table[peer_rank]
    opp = _OPPOSITE_DIR.get(my_dir)
    if opp is not None and nt.get(opp) == my_rank:
        return opp
    for d, target in nt.items():
        if target == my_rank:
            return d
    return None
 ```
 호출부:
 ```python
 for d, peer_rank in nbrs.items():
    peer_dir = reverse_direction(r, peer_rank, d)  # my_dir 전달
    if peer_dir is None:
        continue
    ...
 ```
 ### D2. Runtime — `_handle_meta_arrival` dst_addr 매칭
 `src/kernbench/components/builtin/pe_ipcq.py`:
 ```python
 def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
    """Match incoming token to the receiver-side direction by dst_addr range.
    Each direction has a unique rx buffer address range
    (my_rx_base_pa + n_slots * slot_size). The token's dst_addr (set by
    the sender's IPCQ when computing peer's slot address) falls within
    exactly one such range. This address-based matching is unambiguous
    even when multiple directions have the same peer (2-rank ring).
    """
    token = msg.token
    dst_addr = token.dst_addr
    for d, qp in self._queue_pairs.items():
        base = qp["my_rx_base_pa"]
        size = qp["n_slots"] * qp["slot_size"]
        if base <= dst_addr < base + size:
            qp["peer_head_cache"] = max(qp["peer_head_cache"],
                                         token.sender_seq + 1)
            self._arrived_tokens.setdefault(d, []).append(token)
            waiters = self._recv_waiters.get(d, [])
            self._recv_waiters[d] = []
            for ev in waiters:
                if not ev.triggered:
                    ev.succeed()
            any_waiters = self._any_recv_waiters
            self._any_recv_waiters = []
            for ev in any_waiters:
                if not ev.triggered:
                    ev.succeed()
            return
    # Unknown dst_addr — diagnostic log (should not happen under correct install)
 ```
 Sender 좌표 검사는 **제거**. `dst_addr`가 이미 direction을 결정.
 ### D3. Credit — `dst_rx_base_pa` 필드 추가
 `src/kernbench/common/ipcq_types.py`:
 ```python
@dataclass(frozen=True)
 class IpcqCreditMetadata:
    consumer_seq: int
    dst_rx_base_pa: int       # NEW: 원 sender의 peer.rx_base_pa와 매칭용
    # 기존 필드 (diagnostic / log 용도로 유지)
    src_sip: int
    src_cube: int
    src_pe: int
    src_direction: str
 ```
 Credit 생성 시 (`_delayed_credit_send`): 자기 direction의 `my_rx_base_pa`를
 `dst_rx_base_pa`에 실어 보낸다. 이 값은 원 sender가 송신 시 목적지로 사용한
 `peer.rx_base_pa`와 동일하므로, 수신 측에서 해당 qp를 유일하게 식별할 수 있다.
 수신 측 (`_credit_worker`):
 ```python
 def _credit_worker(self, env):
    while True:
        credit = yield self._credit_inbox.get()
        for d, qp in self._queue_pairs.items():
            # peer의 rx_base_pa와 credit의 dst_rx_base_pa가 일치하는 qp 찾기
            if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
                qp["peer_tail_cache"] = max(qp["peer_tail_cache"],
                                              credit.consumer_seq)
                waiters = self._send_waiters.get(d, [])
                self._send_waiters[d] = []
                for ev in waiters:
                    if not ev.triggered:
                        ev.succeed()
                break
 ```
 Sender 좌표 검사 제거. `dst_rx_base_pa` 매칭으로 unambiguous.
 ### D4. `IpcqInitEntry`에 `peer_direction` 필드를 **추가하지 않음**
 ADR-0025 rev 1에서 제안했던 `IpcqInitEntry.peer_direction`은 **불필요**.
 이유:
 - Meta arrival은 dst_addr로 매칭 (D2)
 - Credit은 dst_rx_base_pa로 매칭 (D3)
 - qp에 peer_direction 저장 필요 없음
 - Install은 rx_base_pa 계산 시 내부적으로만 peer_dir 사용 (`reverse_direction`)
 IpcqInitEntry schema 변경 없음. Rev 1 대비 **단순화**.
 ### D5. `IpcqDmaToken.src_direction` 유지 (diagnostic only)
 기존 `src_direction` 필드는 제거하지 않는다. 다음 용도로 유지:
 - Logging / trace: `KERNBENCH_CCL_TRACE=1` 출력의 `(rank, t, dir, nbytes)`
 - Diagnostics: pointer_dump 등에서 direction 표시
 - 미래 확장 여지
 Runtime matching은 `dst_addr`만 사용.
 ### D6. Invariants (ADR-0023 I3 강화)
 **I3 (엄격)**: 각 방향 pair `(my_direction, peer_direction)`에 대해 my
 rx_base와 peer rx_base는 **별개의 direction slot**을 가리켜야 함. Install은
 이를 보장해야 한다 (reverse_direction opposite-preference).
 **I3.1 (신규)**: 모든 qp에 대해 `qp["my_rx_base_pa"]`와 `qp["peer"].rx_base_pa`는
 서로 disjoint한 주소 range를 점유한다 (다른 direction의 buffer는 절대 겹치지
 않음). 이것이 D2/D3의 주소-기반 매칭의 전제.
 Install time에 검증 가능:
 ```python
 # ccl/install_plan.py: build_install_plans 끝에 assertion
 all_rx_ranges = set()
 for plan in plans:
    for pe_install in plan.pe_installs:
        for entry in pe_install.neighbors:
            r = (entry.my_rx_base_pa,
                 entry.my_rx_base_pa + plan.n_slots * plan.slot_size)
            overlap = any(_ranges_overlap(r, e) for e in all_rx_ranges)
            assert not overlap
            all_rx_ranges.add(r)
 ```
 ---
 ## Dependencies
 - **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023의 runtime 매칭 로직 수정
  (D2, D3) + install heuristic 개선 (D1). IPCQ 프로토콜의 semantic layer
  변경은 없음.
 - **ADR-0024** (launcher): 2-rank bidirectional ring이 실제 쓰이는 경우가
  ADR-0024의 ws=SIP_count 모델. 본 ADR이 그 케이스를 작동시킴.
 - **ADR-0030** (PhysAddr transition, stub): **독립적** — ADR-0025의
  주소-기반 매칭은 현재 synthetic 주소든 PhysAddr이든 동일하게 작동.
 ---
 ## Non-goals
 - **IPCQ 주소 체계를 PhysAddr로 전환**: ADR-0030 scope. 본 ADR은 주소가 어떻게
  인코딩되는가와 무관.
 - **Multi-hop routing**: ADR-0023 D5의 single-hop DMA write 전제 유지.
 - **Unidir ring 특수화**: `ring_1d_unidir`는 direction 하나만 있으므로 본 버그
  무관.
 ---
 ## Open questions
 - **주소 매칭 성능**: `_handle_meta_arrival`과 `_credit_worker`가 qp를 선형
  순회 (max 4 direction). 성능 영향 무시 가능 수준. 문제 시 dict lookup으로
  전환 가능 (`_qp_by_rx_base`).
 - **`IpcqDmaToken.src_direction` 필요성 재평가**: diagnostic 용도로만 남긴
  필드를 계속 유지할지, 또는 logging 외부로 분리할지. 현재는 유지.
 - **Install-time invariant 검증 cost**: D6의 I3.1 검증은 O((N_PE × N_direction)²).
   대형 topology에서 느려질 수 있음 → interval tree 등 자료구조로 개선 가능.
   단순 구현 먼저.
 ---
 ## Test strategy
 ### T1. Unit — `reverse_direction` opposite-preference
 `tests/test_ccl_install.py` (확장):
 - Ring ws=2: `reverse_direction(0, 1, "E")` → "W", `reverse_direction(0, 1, "W")` → "E"
 - Ring ws=4: `reverse_direction(0, 1, "E")` → "W" (자연스러운 opposite)
 - Mesh 2×2: `reverse_direction(r, peer, "N")` → "S", "E" ↔ "W"
 - Tree binary: opposite 없는 direction (parent) → fallback 경로
 - Non-symmetric topology: opposite가 peer에 없고 다른 direction만 있는 경우
 ### T2. Runtime — `_handle_meta_arrival` dst_addr 매칭
 `tests/test_pe_ipcq.py` (확장):
 - 2-rank pair install 후, E direction dst_addr로 meta arrival → E의 `peer_head_cache`
  증가 (W는 불변)
 - W direction dst_addr로 meta arrival → W의 `peer_head_cache` 증가
 - 잘못된 dst_addr (어느 rx range에도 속하지 않음) → 에러 또는 silent drop
  (결정 후 명시)
 ### T3. Credit — `dst_rx_base_pa` 매칭
 `tests/test_pe_ipcq.py` (확장):
 - E direction send 후 peer가 consume → credit에 자기 W의 `my_rx_base_pa`
  담아 송신 → sender의 E direction `peer_tail_cache` 증가
 - W direction도 동일
 ### T4. E2E — 2-rank bidirectional ring
 `tests/test_ipcq_e2e.py`:
 - 2-rank ring_1d로 tl.send(E) + tl.recv(W) pattern이 양방향으로 작동
 - ADR-0024의 `test_ccl_allreduce_matrix.py`에서 ring at ws=2가 통과
 ### T5. Install invariant — rx_base range disjointness
 `tests/test_ccl_install_plan.py` (확장):
 - I3.1 검증: `build_install_plans` 결과에서 모든 qp의 rx_base range가 disjoint
 ### T6. 회귀
 - 기존 ws≥3 ring / mesh / tree 테스트 그대로 통과
 - `test_pe_ipcq`, `test_ipcq_e2e` 기존 케이스 회귀
 ---
 ## Consequences
 ### Positive
 - **단순함**: `peer_direction` 이중 메타데이터 제거. 주소가 single source of truth.
 - **Unambiguous matching**: 모든 topology (direction 중복 포함)에서 동작.
 - **Schema 변경 최소**: `IpcqInitEntry` 불변, `IpcqCreditMetadata`에 1 필드 추가.
 - **PhysAddr 전환 (ADR-0030) 독립**: 주소-기반 매칭은 주소 인코딩 방식과 무관.
 - **Diagnostic 유지**: `IpcqDmaToken.src_direction`은 로깅 용도로 존치.
 ### Negative
 - Runtime 매칭이 주소 비교로 바뀌어서 디버깅 시 "왜 peer_head_cache[E]가 아닌
  W가 업데이트됐나" 같은 질문에 address range를 추적해야 함 (기존엔 direction
  이름으로 충분). 해결: pointer_dump에 "direction ↔ rx_base_pa" 매핑 포함.
 ### Neutral
 - IPCQ protocol의 semantic layer (sender가 dst_addr 계산, receiver가 수신)는
  불변.
 ---
 ## Affected files
 | File | Change |
 |------|--------|
 | `src/kernbench/ccl/install.py` | D1: `reverse_direction`에 `my_dir` 인자 추가, opposite-preference |
 | `src/kernbench/components/builtin/pe_ipcq.py` | D2: `_handle_meta_arrival` dst_addr 매칭 / D3: `_credit_worker` dst_rx_base_pa 매칭 / `_delayed_credit_send`가 `dst_rx_base_pa` 필드 채움 |
 | `src/kernbench/common/ipcq_types.py` | D3: `IpcqCreditMetadata`에 `dst_rx_base_pa` 필드 추가 |
 | `src/kernbench/ccl/install_plan.py` (ADR-0024 신규) | D6: I3.1 invariant 검증 (optional) |
 | `docs/adr/ADR-0023-ipcq-pe-collective.md` | Reference note: runtime 매칭 방식이 ADR-0025에서 바뀜 |
 | `tests/test_ccl_install.py` | T1 |
 | `tests/test_pe_ipcq.py` | T2, T3 |
 | `tests/test_ipcq_e2e.py` | T4 |
 | `tests/test_ccl_install_plan.py` | T5 |
@@ -0,0 +1,476 @@
 # ADR-0026: DPPolicy = Intra-Device Only — sip/num_sips 필드 제거
 ## Status
 Accepted (Revision 5 — Phase 2 landed 2026-04-14, 523 passed + 1 strict xfail)
 ## Context
 ### 목표
 `DPPolicy`를 **한 device(SIP) 내부의 cube × PE 분산**만 표현하는 순수한
 intra-device 추상화로 명확화한다. SIP 간 분산(TP)은 별도 레이어로 분리
 (ADR-0024의 `torch.ahbm.set_device(rank)` 또는 ADR-0027의 Megatron parallel
 layers가 담당).
 ### 현재 상태
 `src/kernbench/policy/placement/dp.py`:
 ```python
@dataclass(frozen=True)
 class DPPolicy:
    sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    num_pes: int | None = None
    num_cubes: int | None = None
    num_sips: int | None = None    # ← 제거 대상
 ```
 `sip` / `num_sips` 필드는 텐서를 SIP 경계 **너머**로 분산하는 경로를 제공함.
 이는:
 - **ADR-0024의 launcher 모델과 충돌**: ADR-0024는 "rank = SIP = 1 worker per SIP"
  모델. 각 worker가 자기 SIP에 텐서를 생성. 텐서가 여러 SIP에 걸치는 경우는
  Megatron-style TP가 개별 primitive로 처리해야 함.
 - **사용자 의도와 불일치**: "DPPolicy는 한 디바이스 내에서 PE들로 분산하는 방법"
  (사용자 진술).
 - **개념 혼동**: `DPPolicy.sip="column_wise"`는 실제로 **TP**. 이름이 DP인데
  하는 일은 TP → 신규 사용자에게 혼란.
 ### 영향받는 call site (rollback 시점 grep 결과)
 **생성 사이트** (`DPPolicy(sip=...` 또는 `num_sips=...`):
 - `tests/test_runtime_api_tensor.py`
 - `benches/ccl_allreduce.py` (ADR-0024 scope 내에서 이미 개편됨)
 - `tests/test_va_offset.py`
 - `benches/va_offset_verify.py`
 - `tests/test_sip_parallel.py`
 **참조 사이트** (`dp.sip`, `policy.sip`, `num_sips` 등):
 - `src/kernbench/runtime_api/context.py` (`_create_tensor`, `launch`)
 - `src/kernbench/components/builtin/pe_cpu.py`
 - `src/kernbench/components/legacy/builtin/pe_cpu.py`
 - `src/kernbench/policy/placement/dp.py` (구현 자체)
 - `tests/test_tensor.py`, `tests/test_ipcq_types.py`
 **핵심 테스트**: `test_sip_parallel.py`는 이름 그대로 "SIP 병렬성을 DPPolicy로
 표현하는" 테스트. 이 ADR 이후 **새 launcher 모델로 재작성** 필요.
 ---
 ## Decision
 ### D1. `DPPolicy`에서 `sip` + `num_sips` 필드 제거
 ```python
@dataclass(frozen=True)
 class DPPolicy:
    """Intra-device (cube × PE) data-parallel policy.
    SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
    (ADR-0024 D10) and, for model-level TP, by Megatron-style parallel
    layers (ADR-0027). DPPolicy does not cross SIP boundaries.
    """
    cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    num_pes: int | None = None
    num_cubes: int | None = None
 ```
 제거되는 필드: `sip`, `num_sips`.
 ### D2. `ShardSpec` — structural (sip, cube, pe) 좌표, `pe_index` 완전 제거
 현재 `ShardSpec.pe_index`는 **global flat index** (`sip × cubes × pes + cube ×
 pes + pe`). 이는 ADR-0024 D11이 "abstraction leakage"로 지적한 형태.
 본 ADR에서 ShardSpec을 **structural 좌표로 재정의**하고, `pe_index`는
 property로도 **남기지 않는다**:
 ```python
 # src/kernbench/policy/placement/dp.py (after)
@dataclass(frozen=True)
 class ShardSpec:
    """Structural shard placement — intra-SIP (cube × PE) coord.
    Global-flat `pe_index` was removed in ADR-0026. Callers must use
    structural coords (sip, cube, pe) directly. If a flat integer key is
    needed (e.g. dict lookup), compute it explicitly at the call site.
    """
    sip: int              # structural — which SIP this shard lives on
    cube: int             # local within SIP
    pe: int               # local within cube
    offset_bytes: int
    nbytes: int
 ```
 **핵심 원칙**:
 - ShardSpec의 정체성은 `(sip, cube, pe)` 3튜플.
 - **`pe_index` property도 없음** — silent semantics drift 차단.
 - Global flat을 기대한 기존 호출자는 `.pe_index` 접근 시 **즉시
  `AttributeError`** → 반드시 구조적 좌표로 migration.
 - Flat integer key가 필요한 국소 문맥 (예: 내부 dict lookup)은 호출자가
  명시적으로 `spec.sip * N_CUBES * N_PE + spec.cube * N_PE + spec.pe`를 계산.
 **Property 제거 정당화**: KernBench는 사내 프로젝트로 call site가 한정되어
 있음. Silent drift 위험 (의미만 바뀌고 타입은 같은 int) 대비 explicit breakage
 (AttributeError)가 훨씬 안전.
 ### D3. `resolve_dp_policy`가 `target_sip`을 받아 structural 좌표 생성
 ADR-0024 D11의 계약 구현. Post-hoc shifting 없음.
 ```python
 # src/kernbench/policy/placement/dp.py (after)
@dataclass(frozen=True)
 class _LocalPeShard:
    """Internal — PE resolver의 반환. Cube 내 local PE 식별자 + payload."""
    local_pe: int                  # cube-local PE index (0..num_pe-1)
    offset_bytes: int
    nbytes: int
 def resolve_dp_policy(
    policy: DPPolicy,
    *,
    shape: tuple[int, int],
    itemsize: int,
    num_pe: int,
    num_cubes: int = 1,
    target_sip: int,       # NEW — 어느 SIP에 배치할지 명시
 ) -> list[ShardSpec]:
    """2-level resolution (cube × PE) on a specified SIP.
    Returns ShardSpecs with structural coords (sip=target_sip, cube, pe).
    No SIP-level split — DPPolicy is intra-device only.
    """
    resolver = _PE_RESOLVERS[policy.pe]
    all_shards: list[ShardSpec] = []
    # Level 1: cube within SIP
    cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
    for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
        # Level 2: PE within cube — resolver returns _LocalPeShard (local_pe)
        local_shards = resolver(shape=cube_shape, itemsize=itemsize,
                                 num_pe=num_pe)
        for ls in local_shards:
            all_shards.append(ShardSpec(
                sip=target_sip,                   # from caller (current_device)
                cube=cube_id,                     # local within SIP
                pe=ls.local_pe,                   # local within cube (explicit name)
                offset_bytes=cube_offset + ls.offset_bytes,
                nbytes=ls.nbytes,
            ))
    return all_shards
 ```
 **내부 resolver** (`column_wise`, `row_wise`, `replicate`)는 `_LocalPeShard`
 리스트 반환 — `local_pe` 필드명으로 **"cube-local PE identifier"임이 명시적**.
 과거 `ShardSpec.pe_index`와 이름이 혼동되던 문제 해소.
 **이름 규약 정리** (전체 ADR):
 - `ShardSpec.pe`: 최종 외부 API — cube-local PE (structural coord)
 - `_LocalPeShard.local_pe`: 내부 resolver 단계의 동일 의미
 - `pe_index`: **제거**. 외부/내부 어디에도 남기지 않는다 (silent drift 차단의
  부가 효과: 이름 재등장 없음).
 ### D4. `_create_tensor` — 구조적 좌표로 직접 placement
 ADR-0024 D11 연속선. Post-hoc shifting 제거, 구조적 좌표를 `resolve_dp_policy`
 호출 시점에 직접 지정.
 ```python
 # context.py _create_tensor (after)
 current_sip = self.ahbm.current_device()
 if current_sip is None:
    # Single-driver fallback (ADR-0024 D9와 일관).
    # Launcher 기반 코드가 set_device()를 빼먹으면 조용히 SIP 0에 박히는
    # 문제가 있음 → debug mode에서 경고.
    if os.environ.get("KERNBENCH_DEBUG"):
        import warnings
        warnings.warn(
            "torch.ahbm.current_device() is None; defaulting to SIP 0. "
            "If this is a multi-rank launcher context, you likely forgot "
            "torch.ahbm.set_device(rank) inside the worker.",
            stacklevel=2,
        )
    current_sip = 0
 placement = resolve_dp_policy(
    dp,
    shape=shape_2d,
    itemsize=itemsize,
    num_pe=eff_num_pe,
    num_cubes=eff_num_cubes,
    target_sip=current_sip,          # ← 구조적 좌표 일차 지정
 )
 # placement의 각 ShardSpec은 이미 (sip=current_sip, cube=local, pe=local) 포함.
 # 과거의 post-hoc shifting 블록은 완전히 제거.
 ```
 **모든** 텐서가 current device SIP에 배치됨. Multi-SIP 텐서를 만들고 싶으면
 ADR-0027의 TP primitive 사용.
 **Single-driver fallback의 trade-off**: set_device 없는 호출에서 SIP 0으로
 default는 기존 single-driver 테스트 호환을 위해 유지. `KERNBENCH_DEBUG=1`
 환경에서는 launcher 컨텍스트의 실수로 set_device 누락 시 조용히 잘못된 SIP에
 배치되는 것을 감지할 수 있도록 warning.
 ### D5. Downstream — allocator lookup은 구조적 tuple key로
 기존 `deploy_tensor` (`src/kernbench/runtime_api/tensor.py`):
 ```python
 for spec in placement:
    alloc = allocators[spec.pe_index]       # ← AttributeError (property 제거됨)
 ```
 `pe_index`가 없어졌으므로 구조적 좌표로 **강제** migration:
 ```python
 for spec in placement:
    alloc = allocators[(spec.sip, spec.cube, spec.pe)]
 ```
 `_ensure_allocators`의 dict population도 tuple key로:
 ```python
 # context.py _ensure_allocators (after)
 for sip_id in sip_range:
    for cube_id in range(cubes_per_sip):
        for pe_id in range(pes_per_cube):
            self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
                rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
            )
 ```
 `_free_tensor`도 동일: 기존 `flat_idx = sip * ... + cube * ... + pe` 계산
 블록 제거, `(shard.sip, shard.cube, shard.pe)` 직접 사용.
 **Tuple vs dataclass `PEIdentity`**: Tuple은 단순하고 별도 정의 없이 hashable key로
 바로 쓸 수 있어 권고. `PEIdentity` 값객체는 명시적 타입이라는 장점은 있지만
 boilerplate가 크고, 현재 용도가 allocator dict의 key 하나뿐이라 오버엔지니어링.
 Tuple 유지.
 ### D6. Migration — 기존 call site
 **(A) `DPPolicy(sip=..., num_sips=..., ...)` 사용하던 코드**:
 - `DPPolicy(sip="column_wise", cube=..., pe=...)` 패턴 → **해당 bench를 ADR-0024
  launcher로 재작성**. worker가 `set_device(rank)`로 SIP 선택, DPPolicy는
  cube/PE만.
 - `DPPolicy(sip="replicate", num_sips=1, ...)` 패턴 → `DPPolicy(cube=..., pe=...)`로
  축소 (필드가 사라지니 자연스럽게).
 **(B) `dp.sip`, `dp.num_sips` 읽던 코드**:
 - 제거. `launch()`의 `_compute_local_shape`에서 `dp.sip` 분기 삭제.
 - `pe_cpu.py`가 `dp.sip`을 참조하던 곳도 정리.
 **(C) `ShardSpec.pe_index`를 사용하던 코드 — 전부 수정 필요**:
 - `.pe_index` 접근은 이제 `AttributeError` 발생 → 모든 call site 수정 필수.
 - Allocator lookup: `allocators[spec.pe_index]` →
  `allocators[(spec.sip, spec.cube, spec.pe)]`
 - Flat integer가 꼭 필요한 국소 문맥: `spec.sip * N_CUBES * N_PE + spec.cube *
  N_PE + spec.pe` 명시적 계산. **국소 변수로만 사용하고 공개 API에 노출하지
  않는다**.
 **구현 착수 전 grep audit 체크리스트**:
 1. **`pe_index` 식별자 참조** (필드/property 접근, 키워드 인자, 필드 선언):
   - `\.pe_index\b` — 필드/property 접근 모두 (regex)
   - `pe_index=` — 생성 시점의 키워드 인자
   - `pe_index:` — dataclass 필드 선언
 2. **Allocator / dict indexing**:
   - `allocators\[` — dict lookup 패턴. `allocators[spec.pe_index]` 같은
     것이 걸리는지
   - `_allocators\[` — 같은 패턴 (prefix _)
 3. **Flat index 수동 계산 블록**:
   - `flat_idx =`
   - `pe_index =` (좌변)
   - `* pes_per_cube +` (전형적 flat 계산 패턴)
   - `* self._num_cubes * self._pes_per_cube` (global flat 계산)
 4. **Serialization / logging**:
   - `asdict(.*shard` — dataclass 직렬화 시 `pe_index` 자동 포함 여부
   - `repr(.*ShardSpec` — 로그 포맷에서 의존하는지
   - JSON/YAML 저장 포맷에서 `pe_index` 키 사용 여부
 5. **Tests asserting integer PE identity**:
   - `assert .*pe_index` — 정수 동일성 주장
   - `spec.pe_index ==` — 비교 (SIP-local 의미로 변하면 테스트가 깨질 수 있음)
 각 match마다 "이 호출자가 global flat / SIP-local / 내부 lookup 중 무엇을
 기대했나"를 판단한 뒤 구조적 좌표로 교체.
 **(D) `test_sip_parallel.py`**:
 - 이름 유지, 내용은 ADR-0024의 multi-greenlet launcher 기반 재작성.
 - "SIP 병렬성 = rank 별 worker × 각자 DPPolicy" 로 검증.
 **(E) `test_va_offset.py`, `benches/va_offset_verify.py`**:
 - `num_sips=1`만 쓰는 경우가 대부분. 단순히 필드 제거.
 - SIP offset 테스트가 핵심이면 `set_device(rank)` + 구조적 좌표 관찰로 이식.
 ### D7. 하위 호환 — 불가 (cleanup ADR)
 이 ADR은 **breaking change**.
 1. `DPPolicy(sip=...)` 또는 `DPPolicy(num_sips=...)` 호출 → `TypeError`
 2. `ShardSpec.pe_index` 접근 → `AttributeError`
 모두 **즉시 명시적 breakage**. Deprecation warning / fallback 경로 없음.
 KernBench는 사내 프로젝트로 call site가 한정되어 있어 한 번에 migration.
 **Silent drift 차단**이 property 완전 제거의 주된 이점: global flat을 기대한
 코드가 SIP-local 결과를 받아 조용히 잘못된 인덱싱을 할 가능성 제거.
 ### D8. 문서 업데이트
 - `ADR-0008` (tensor deploy) — DPPolicy 의미 갱신 note, ShardSpec 구조적 좌표
  전환 명시
 - DPPolicy docstring에 "intra-device only" 명시 (D1 코드 스니펫의 docstring)
 - ShardSpec docstring에 **structural coordinates `(sip, cube, pe)`를 직접
  사용하며, `pe_index`는 더 이상 제공되지 않음**을 명시 (D2)
 - `docs/ccl-author-guide` 등 튜토리얼에서 `sip=...` 예시 제거
 ---
 ## Dependencies
 - **ADR-0024** (launcher): `set_device(rank)` 및 current-device scoping이
  SIP 배치 메커니즘 제공. 본 ADR은 그 위에 서서 DPPolicy를 순수 intra-device로
  좁힘.
 - **ADR-0027** (Megatron TP): 다중 SIP에 걸친 텐서가 필요한 경우의 대안 경로.
  이 ADR 적용 후 multi-SIP use case는 ADR-0027로 이관.
 ---
 ## Non-goals
 - **`DPPolicy.cube` / `pe` 재설계**: 기존 replicate/column_wise/row_wise 의미
  유지.
 - **Tiling 정책 통합**: `tiled_column_major` / `tiled_row_major`는 그대로.
 - **Multi-device 텐서 추상화 신규**: DTensor-like는 ADR-0028.
 ---
 ## Open questions
 - **`_create_tensor`의 current_sip 기본값**: set_device 없는 호출에서 rank=0
  (SIP 0)로 fallback할지, 아니면 error 낼지. 권고는 fallback (기존 single-driver
  테스트와의 호환).
 - **`test_sip_parallel.py` 재작성 범위**: 기존 단위 테스트의 의도를 유지하며
  launcher 기반으로 옮기려면 추가 fixture 필요. 별도 작업으로 scope.
 - **`DPPolicy`의 `num_sips=None` 의미**: 필드가 없어지면 `num_sips` 개념 자체가
  사라짐. Multi-SIP을 표현하고 싶으면 ADR-0027의 TP primitive를 쓰라는 것이
  명시적 답.
 **Resolved (이전 rev에서 open이었던 것들)**:
 - ~~`ShardSpec.pe_index` property 존치 여부~~ → **완전 제거** (D2)
 - ~~`_ensure_allocators` dict key 형식~~ → **tuple `(sip, cube, pe)`** (D5)
 ---
 ## Test strategy
 ### T1. 단위 테스트 갱신
 - `tests/test_tensor.py`, `tests/test_ipcq_types.py`, `tests/test_runtime_api_tensor.py`
  — DPPolicy 생성자 인자 정리, ShardSpec 구조적 좌표 검증
 - `tests/test_va_offset.py` — `num_sips=1` 제거 후 동작 유지
 ### T2. `resolve_dp_policy` 구조적 좌표 반환
 `tests/test_dp_policy.py` (new 또는 확장):
 - `resolve_dp_policy(dp, ..., target_sip=1)` 결과의 모든 ShardSpec이 `sip=1`
 - 각 spec의 `(cube, pe)`가 local (0..num_cubes-1, 0..num_pe-1)
 - 같은 topology에서 `target_sip=0`과 `target_sip=1` 결과가 sip 필드만 다름
 ### T3. `test_sip_parallel.py` 재작성
 SIP 병렬성 검증을 launcher 기반으로:
 ```python
 def test_sip_parallel_via_launcher(topology):
    ...
    def worker(rank, ws, torch):
        torch.ahbm.set_device(rank)
        t = torch.zeros((1, 128), dtype="f16",
                         dp=DPPolicy(cube="column_wise", pe="column_wise"))
        # verify shard.sip == rank (structural coord)
    spawn(worker, nprocs=n_sips, ...)
 ```
 ### T4. Allocator key migration
 `tests/test_allocator_structural_key.py` (new 또는 기존 확장):
 - `PEMemAllocator` dict이 `(sip, cube, pe)` tuple key로 작동
 - `deploy_tensor`가 구조적 좌표로 allocator lookup
 - `_free_tensor`도 동일
 ### T5. E2E 회귀
 ADR-0024의 `test_ccl_allreduce_matrix.py` 그대로 통과.
 ### T6. 오류 검증
 - `DPPolicy(sip="column_wise")` 호출 → `TypeError`. 테스트로 명시.
 - `DPPolicy(num_sips=2)` 호출 → `TypeError`.
 - `spec.pe_index` 접근 → `AttributeError` (property 완전 제거 검증).
 ---
 ## Consequences
 ### Positive
 - **개념 분리 명확**: DPPolicy = intra-device, TP = inter-device.
 - **API 단순화**: DPPolicy 생성자 필드 ~33% 축소.
 - **Structural 좌표 일관성**: ShardSpec이 `(sip, cube, pe)` 튜플로 표현 →
  abstraction leakage 해소 (ADR-0024 D11 계약 충족).
 - **`pe_index` 모호성 제거**: property 자체가 사라져 global flat / SIP-local 해석 혼동이 없음. Flat index가 필요하면 호출 지점에서 명시적으로 계산.
 - **Launcher 모델 일관성**: ADR-0024의 "1 worker per SIP" 모델이 유일한 SIP
  경계 제어 메커니즘.
 ### Negative
 - **Breaking change (explicit)**: `DPPolicy(sip=...)` → `TypeError`,
  `spec.pe_index` → `AttributeError`. 모든 호출자 한 번에 수정 필요.
 - **ShardSpec schema 변경**: `pe_index` 단일 필드 → `sip`/`cube`/`pe` 세 필드.
  Downstream (`deploy_tensor`, `_free_tensor`, `_ensure_allocators`,
  `allocators` dict key 등) 연쇄 수정.
 - **Silent drift 없음**: property 완전 제거로 runtime에서 즉시 실패 →
  migration leakage 원천 차단. (Negative가 아니라 explicit tradeoff)
 - `test_sip_parallel.py` 재작성 비용.
 ### Neutral
 - 기존 `cube` / `pe` 필드 의미 불변.
 ---
 ## Affected files
 | File | Change |
 |------|--------|
 | `src/kernbench/policy/placement/dp.py` | D1: `sip`/`num_sips` 제거 / D2: `ShardSpec`에 `sip`/`cube`/`pe` structural fields 추가, **`pe_index` property 제거** / D3: `resolve_dp_policy`에 `target_sip`, SIP-level 루프 제거 / 내부 resolver가 반환하는 shard 타입 이름도 `local_pe`로 명확화 (이름 충돌 방지) |
 | `src/kernbench/runtime_api/context.py` | D4: `_create_tensor` `target_sip` 전달 / D5: `_ensure_allocators` dict key → `(sip, cube, pe)` tuple / `launch`의 `dp.sip` 분기 제거 |
 | `src/kernbench/runtime_api/tensor.py` | D5: `deploy_tensor`가 구조적 좌표로 allocator lookup |
 | `src/kernbench/components/builtin/pe_cpu.py` | D6: `dp.sip` 참조 제거 |
 | `src/kernbench/components/legacy/builtin/pe_cpu.py` | D6: 동일 |
 | `benches/ccl_allreduce.py` | ADR-0024 scope에서 이미 처리 |
 | `benches/va_offset_verify.py` | D6: `num_sips=1` 제거 |
 | `tests/test_runtime_api_tensor.py` | D6 |
 | `tests/test_va_offset.py` | D6 |
 | `tests/test_tensor.py`, `test_ipcq_types.py` | D6 |
 | `tests/test_sip_parallel.py` | T3: launcher 기반 재작성 |
 | `tests/test_dp_policy.py` (new 또는 확장) | T2 |
 | `tests/test_allocator_structural_key.py` (new) | T4 |
@@ -0,0 +1,171 @@
 # ADR-0028: DTensor Support — 선언적 분산 텐서 (Stub / Future)
 ## Status
 Stub (Future Work)
 ## Context
 ### 목표
 **선언적 분산 텐서 추상화**(PyTorch 2.x `DTensor` 스타일)를 KernBench에
 도입하기 위한 **디자인 공간 preliminary exploration**. 본 ADR은 **구현 계획이
 아닌 future 작업의 파일 플레이스홀더 + 초기 질문 목록**이다.
 ### Megatron-style TP와의 차이 (Why DTensor)
 | 관점 | Megatron (ADR-0027) | DTensor (이 ADR) |
 |---|---|---|
 | 표현 | 명시적 parallel layer | 텐서 + placement spec |
 | 호출 형태 | `ColumnParallelLinear(...)` | `distribute_tensor(x, mesh, [Shard(1)])` |
 | Collective 삽입 | 레이어 내부 명시 | 연산 dispatch가 자동 |
 | Learning curve | 낮음 (명시적) | 중~높음 (선언적 의미 이해) |
 | 유연성 | 레이어 단위로 고정 | 레이어 경계 무관, 어디서나 |
 | KernBench에 선행 필요한 것 | launcher (ADR-0024) + TP (0027) | 그 + operator dispatch overhaul |
 DTensor는 operator-level에서 "텐서의 placement를 보고 자동으로 collective
 삽입". KernBench가 이를 지원하려면 **operator dispatch layer에 placement-aware
 rewriting**이 들어가야 한다. 이는 비-trivial.
 ### 현재 상태
 - KernBench는 operator dispatch 레이어가 없음 (`torch.matmul`은 없음; kernel
  launch로 대체).
 - DPPolicy는 정적 placement metadata를 보유 (ADR-0026 후: intra-device only).
 - ADR-0024 launcher가 rank / device 개념 제공.
 - Megatron-style TP (ADR-0027)가 명시적 대안으로 기능할 것.
 ---
 ## Preliminary decision space
 ### DQ1. PyTorch DTensor API 수용 범위
 - `DeviceMesh`: rank들의 논리적 grid.
 - `Placements`: `Shard(dim)`, `Replicate()`, `Partial(reduce_op)`.
 - `distribute_tensor(tensor, device_mesh, placements)`: local tensor → DTensor.
 - Redistribute: `dt.redistribute(new_placements)`로 collective 자동 삽입.
 - Operator forward: `dt @ dt`, `dt + dt` 등 → 적절한 collective 자동 dispatch.
 KernBench가 어느 수준까지 지원할지 결정 필요. 최소: `distribute_tensor` +
 `redistribute`. 최대: 모든 operator overloading.
 ### DQ2. Operator dispatch 레이어
 KernBench에서 `dt @ dt`를 정의하려면 Tensor의 `__matmul__`이 placement를
 보고 적절한 action 수행:
 - 둘 다 replicated → local matmul
 - A column-sharded, B row-sharded → local matmul + all-reduce (RowParallel)
 - A replicated, B column-sharded → local matmul (ColumnParallel)
 - etc.
 이는 Megatron-style의 **자동화된 버전**. Kernel은 기존 matmul kernel 사용.
 ### DQ3. DeviceMesh와 기존 topology
 KernBench topology는 이미 SIP/cube/PE 계층. DTensor의 DeviceMesh는 추상
 `(tp_size, dp_size, ...)` grid. 매핑:
 - 1D mesh of size = SIP count → rank = SIP
 - 2D mesh (tp × dp) → SIP을 그룹 분할 (pure TP 대신 mixed parallelism)
 초기엔 1D mesh만, DP × TP 2D는 future.
 ### DQ4. Placement의 intra-device (DP) 통합
 KernBench 특이점: 한 rank 내부에서 DPPolicy로 cube/PE에 분산. DTensor는
 device 내부를 보지 않음. 통합:
 - DTensor placement = rank (SIP) 간 분산
 - 각 rank의 local tensor는 여전히 DPPolicy로 cube/PE 배치
 - → DTensor wrapper가 local tensor의 DPPolicy도 보관
 ### DQ5. Collective 자동 삽입 지점
 `redistribute` 또는 operator forward 시. ADR-0024의 submit+yield+wait 패턴을
 자동으로 호출하는 형태. `_launch_submit` 내부화.
 ### DQ6. Autograd
 DTensor는 autograd와 상호작용 (backward에서 reverse collective). KernBench가
 backward 지원하기 전까지는 **forward-only DTensor**.
 ---
 ## Open questions (to resolve before real design)
 1. **우선순위**: Megatron-style(ADR-0027)이 먼저 안착한 후 DTensor를 위에
   얹는가, 아니면 공통 lower-layer를 먼저 설계하는가?
 2. **호환성 목표**: PyTorch DTensor API와 몇 %까지 일치시키는가? 독자 API vs
   거의 동일?
 3. **Operator dispatch**: KernBench `Tensor` 클래스에 `__matmul__` 등 연산자
   overloading을 도입하는가? (현재는 kernel launch만)
 4. **Redistribute 정책**: `Shard(0) → Replicate()` 변환 시 어떤 collective
   사용? `all_gather`가 없으면 구현 전까지 제약.
 5. **Mesh × DPPolicy interaction**: 하나의 DTensor가 2개 layer 분산을 갖는
   경우의 metadata 표현.
 6. **Partial placement의 reduce 시점**: 자동 vs 명시 `redistribute` 호출.
 7. **Bench authoring impact**: 기존 Megatron-style bench가 DTensor 기반으로
   얼마나 쉽게 포팅되는가?
 ---
 ## Non-goals (for future real ADR)
 - 이번 stub에서 API 확정. Future ADR에서 구체화.
 - Implementation timeline. 이번 round에서는 **설계 공간 매핑만**.
 ---
 ## Dependencies (potential)
 - **ADR-0024** (launcher): rank / device 기반
 - **ADR-0026** (DPPolicy cleanup): DTensor placement와의 분리 명확화
 - **ADR-0027** (Megatron TP): 실용 TP 패턴 경험을 DTensor 설계로 환류
 - **Future ADR** (operator dispatch layer): KernBench Tensor에 operator
  overloading 도입
 ---
 ## Expected consequences (hypothetical)
 ### Positive
 - PyTorch training code 이식이 **매우 쉬워짐** (DTensor 코드 그대로).
 - TP + DP + 더 복잡한 parallelism을 **하나의 추상화**로 표현.
 - Collective 삽입이 자동 → bench 작성자 부담 감소.
 ### Negative
 - Operator dispatch layer 신규 구축 → 상당한 엔지니어링.
 - Implicit behavior 증가 → 디버깅 / 성능 분석 복잡.
 - KernBench의 "명시적 kernel launch" 철학과 tension.
 ---
 ## Action
 - **Phase 1 (현재)**: 본 stub 유지. Megatron-style (ADR-0027) 먼저 구현 +
  사용 경험 축적.
 - **Phase 2 (future)**: 사용 경험을 바탕으로 본 ADR을 real design으로 승격.
  위 Open questions에 대한 답을 제시.
 - **Phase 3 (future)**: Implementation.
 현재 구현 작업은 **없음**. 디자인 공간 매핑만.
 ---
 ## Affected files
 본 ADR은 **stub**이므로 production 변경 없음. Future real ADR에서 갱신될
 파일 후보:
 | File | 예상 변경 (future) |
 |------|---|
 | `src/kernbench/dtensor/__init__.py` | 신규 패키지 |
 | `src/kernbench/dtensor/device_mesh.py` | DeviceMesh |
 | `src/kernbench/dtensor/placements.py` | Shard/Replicate/Partial |
 | `src/kernbench/dtensor/api.py` | distribute_tensor, redistribute |
 | `src/kernbench/dtensor/ops/*.py` | Operator dispatch (matmul 등) |
 | `src/kernbench/runtime_api/tensor.py` | Tensor에 `__matmul__` 등 추가 |
@@ -0,0 +1,419 @@
 # ADR-0029: Hierarchical All-Reduce — 3-level intra/inter-SIP 알고리즘
 ## Status
 Proposed
 ## Context
 ### 목표
 "Rank = SIP" 모델 (ADR-0024) 위에서 각 SIP 내부의 모든 PE를 참여시키는
 **3-level 계층 all-reduce** 알고리즘을 정의한다. 각 레벨이 서로 다른 물리
 연결(intra-cube ring, inter-cube NoC, inter-SIP UCIe)을 활용해 대역폭을
 극대화한다.
 ### 왜 hierarchical인가
 단순 ring/mesh/tree all-reduce는 SIP당 1 PE만 참여 (ADR-0024의 `leader_only`
 mapper). 이는 inter-SIP 단계는 잘 모델링하지만:
 - **Intra-SIP PE가 노는 시간이 발생**. Leader PE가 inter-SIP 통신 중이면
  나머지 7 PE / 16 cube는 유휴.
 - **Intra-cube/inter-cube 연결 대역폭 미활용**. Cube NoC는 매우 빠르지만
  단일 leader 사용 시 이 자원이 노출되지 않음.
 - **실제 NCCL 등은 hierarchical**: NVLink(intra-node) + InfiniBand(inter-node)
  의 bandwidth 차이를 활용. KernBench 토폴로지도 동일 구조
  (intra-cube / inter-cube / inter-SIP의 bandwidth·latency 차이).
 ### 현재 상태
 - `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` 이미 존재
  (git log `10b33b4` — "Tensor indexing + hierarchical 3-level all-reduce
  kernel"). PE-level로 world_size = total PE를 가정하는 옛 모델 기반 구현.
 - ADR-0024에 의해 launcher는 rank = SIP로 바뀜.
 - Hierarchical 커널은 **재해석 필요**: 이제 각 worker(1 per SIP)가 자기 SIP의
  모든 PE를 참여시키고, kernel은 intra-cube → inter-cube → inter-SIP 순으로
  3-level reduce + broadcast.
 ### 풀어야 할 문제
 1. **ADR-0024 framework 위에 hierarchical 알고리즘 맞추기**
   - Mapper: `all_pes` (ADR-0024 D5 제공)
   - Validator: `multi_pe_sip_local` (ADR-0024 D8 제공)
   - Kernel: 기존 `hierarchical_allreduce.py` 수정 — rank 계산 방식을 SIP 내
     local (cube, pe)로 바꿈
 2. **PE-level neighbor graph 생성**
   - Intra-cube: `(sip, cube, pe) ↔ (sip, cube, pe±1 mod N_PE)` (ring 내부)
   - Inter-cube: `(sip, cube, 0) ↔ (sip, cube±1 mod N_CUBE, 0)` (cube leader만)
   - Inter-SIP: `(sip, 0, 0) ↔ (sip±1 mod N_SIP, 0, 0)` (SIP leader만)
 3. **Tensor layout**: 각 PE가 1 tile을 소유하고 시작 (`multi_pe_sip_local`
   validator가 이 layout 강제). DPPolicy(cube="column_wise",
   pe="column_wise")로 달성 가능.
 4. **PE-level topology 표현 부족** (ADR-0024 D6의 "책임 분산" 이슈 구체화)
   - Ring/mesh/tree 같은 단순 패턴은 rank-level topology_fn + mapper 조합으로
     충분.
   - Hierarchical은 레벨마다 다른 peer 매핑이라 `_build_pe_installs`에서
     multi-level 해석을 해야 함.
   - 장기적으로는 topology 모듈이 PE-level을 직접 표현하는 편이 명시적.
 ### Non-problem (이 ADR 밖)
 - Launcher / barrier / rank-to-SIP / mapper-validator registry → ADR-0024
 - IPCQ direction addressing → ADR-0025
 - DPPolicy 필드 정리 → ADR-0026
 - Megatron TP → ADR-0027
 ---
 ## Decision
 ### D1. 알고리즘 구조 — 3-level reduce + 역순 broadcast
 ```
 Level 1 (intra-cube, E/W ring):
  각 cube의 N_PE개 PE가 bidirectional ring reduce → cube 내 PE 0에 부분합 집중
 Level 2 (inter-cube within SIP, N/S ring, PE 0만 참여):
  N_CUBE개 cube-leader가 bidirectional ring reduce → SIP 내 (cube 0, PE 0)에
  SIP 전체 부분합 집중
 Level 3 (inter-SIP, N_SIP peers, (cube 0, PE 0)만 참여):
  Ring 또는 pair exchange로 전역 합산 완료
 Broadcast:
  역순 — Level 3 결과를 (cube 0, PE 0)에서 SIP 내 모든 cube-leader로, 다시
  각 cube 내 모든 PE로 전파
 ```
 세부는 기존 `hierarchical_allreduce.py`의 커널 구현과 일치. ADR-0024 이후
 변경점은 **rank 계산 방식**과 **n_elem 해석**뿐:
 - 기존 (rank=PE 모델): `rank = cube_id * pes_per_cube + local_pe`, `pe_addr =
  t_ptr + rank * nbytes`
 - 신규 (rank=SIP 모델): 커널은 SIP-local 좌표 `(cube_id, local_pe)`로만 동작.
  텐서의 per-PE slice는 backend가 per-PE `TensorArg`로 전달 (ADR-0024 D3).
  커널 내부 rank 계산 자체가 불필요해짐 — `tl.program_id(0/1)`로 충분.
 ### D2. Framework integration — ADR-0024 infrastructure 재활용
 `ccl.yaml`:
 ```yaml
 algorithms:
  hierarchical_allreduce:
    module: kernbench.ccl.algorithms.hierarchical_allreduce
    topology: hierarchical_3level        # NEW — D3 참고
    mapper: all_pes                      # ADR-0024 D5 built-in
    validator: multi_pe_sip_local        # ADR-0024 D8 built-in
    buffer_kind: tcm
    n_elem: 128
 ```
 Framework 관점에서 hierarchical은 **특별한 알고리즘이 아니라, 특정
 topology / mapper / validator 조합**. 본 ADR은 그 조합과 topology 패턴을
 정의.
 ### D3. `hierarchical_3level` topology (신규)
 `kernbench/ccl/topologies.py`에 신규 추가:
 ```python
 def hierarchical_3level(rank: int, world_size: int, spec: dict) -> dict:
    """3-level hierarchical neighbor pattern.
    Returns a nested structure describing intra-cube + inter-cube + inter-SIP
    neighbors. Unlike ring_1d / mesh_2d which are rank → {dir: peer_rank},
    hierarchical is PE-level and requires spec for cube_mesh / pe_layout.
    """
 ```
 반환 스키마 (초안):
 ```python
 {
    "intra_cube": {
        # 각 cube 내 ring neighbors: (cube, pe) → {"E": (cube, pe_e), "W": (cube, pe_w)}
        ...
    },
    "inter_cube": {
        # cube-leader 간 ring: (cube, 0) → {"N": (cube_n, 0), "S": (cube_s, 0)}
        ...
    },
    "inter_sip": {
        # SIP-leader 간: rank → {"parent": peer_rank} (또는 ring 방식)
        ...
    },
 }
 ```
 이 구조는 `_build_pe_installs`가 해석하여 각 PE의 neighbor table 엔트리
 (4-direction)에 대응시킨다.
 **Rank-level `topologies.py` 현 API와의 관계**: 기존 단순 패턴은
 `(rank → {dir: peer_rank})` 단일 레벨. Hierarchical은 multi-level이므로
 기존 API와 schema가 다름. `_resolve_topology`는 **알고리즘이 어떤 schema를
 쓰는지 선언**하고, builder가 그에 맞춰 해석하도록 확장 필요 (open question).
 ### D4. PE-level neighbor graph — `_build_pe_installs` 확장
 기존 (ring/mesh/tree): topology_fn이 반환한 `(rank → {dir: peer_rank})`를
 각 참여 PE에 그대로 매핑 (leader_only일 경우 peer PE도 leader).
 신규 (hierarchical): `hierarchical_3level`의 3단 구조를 per-PE neighbor
 table로 펼침:
 ```python
 def _build_pe_installs_hierarchical(rank, world_size, sip, pes, topo, spec):
    """Hierarchical 전용 PE neighbor table 빌더."""
    result = []
    for (cube, pe) in pes:
        entries = []
        # Level 1: intra-cube ring (E/W)
        for d, peer in topo["intra_cube"][(cube, pe)].items():
            entries.append(NeighborTableEntry(direction=d, ...))
        # Level 2: inter-cube ring (N/S) — cube leader (pe == 0)만
        if pe == 0:
            for d, peer in topo["inter_cube"][(cube, 0)].items():
                entries.append(NeighborTableEntry(direction=d, ...))
        # Level 3: inter-SIP — SIP leader (cube == 0 and pe == 0)만
        if cube == 0 and pe == 0:
            for d, peer_rank in topo["inter_sip"][rank].items():
                # peer_rank → peer SIP의 (0, 0)
                entries.append(NeighborTableEntry(
                    direction=d, peer_sip=peer_rank, peer_cube=0, peer_pe=0, ...))
        result.append(PeInstallSpec(cube=cube, pe=pe, neighbors=tuple(entries)))
    return tuple(result)
 ```
 `build_install_plans`에서 algorithm_config의 `topology`에 따라 적절한 builder
 선택 (기존 simple builder vs hierarchical builder).
 ### D5. Kernel 재해석 — SIP-local 좌표로
 `src/kernbench/ccl/algorithms/hierarchical_allreduce.py`를 ADR-0024 D3에
 맞춰 수정:
 ```python
 def kernel_args(*, n_elem: int, world_size: int, pes_per_cube: int,
                cubes_per_sip: int, num_sips: int, **kw) -> tuple:
    """world_size (= num_sips), pes_per_cube, cubes_per_sip를 스칼라로."""
    return (n_elem, pes_per_cube, cubes_per_sip, num_sips)
 def kernel(t_ptr, n_elem, pes_per_cube, cubes_per_sip, num_sips, tl):
    """SIP-local 좌표 기반.
    이전 (rank=PE 모델):
        rank = cube_id * pes_per_cube + local_pe
        pe_addr = t_ptr + rank * nbytes
    현재 (rank=SIP 모델):
        per-PE tensor slice는 backend가 TensorArg로 전달 → t_ptr은 이미 local.
        intra-cube ring은 tl.program_id(0) 사용.
        inter-cube ring은 pe_id == 0 조건으로 제한.
        inter-SIP reduce는 cube_id == 0 and pe_id == 0 조건으로 제한.
    """
    local_pe = tl.program_id(axis=0)
    cube_id = tl.program_id(axis=1)
    # Level 1: intra-cube ring
    for _ in range(intra_rounds(pes_per_cube)):
        tl.send(dir="E", src=acc)
        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
        acc = acc + recv
    # Level 2: inter-cube (cube leader only)
    if local_pe == 0:
        for _ in range(inter_cube_rounds(cubes_per_sip)):
            tl.send(dir="N", src=acc)
            recv = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
            acc = acc + recv
    # Level 3: inter-SIP (SIP leader only)
    if local_pe == 0 and cube_id == 0:
        for _ in range(inter_sip_rounds(num_sips)):
            tl.send(dir="parent", src=acc)
            recv = tl.recv(dir="parent", shape=(n_elem,), dtype="f16")
            acc = acc + recv
    # Broadcast (reverse chain)
    # ...
    tl.store(t_ptr, acc)
 ```
 `kernel_args`는 ADR-0024 D4의 keyword-only signature 계약을 따른다.
 ### D6. Validator — `multi_pe_sip_local`
 ADR-0024 D8의 built-in 그대로 활용. `ccl.yaml`에서 `validator:
 multi_pe_sip_local` 지정 시 backend가 각 SIP에 `cubes × pes_per_cube`개
 shard가 있는지 검증.
 ### D7. Bench — 기본 all-reduce bench 확장
 `benches/ccl_allreduce.py`의 worker는 `ccl.yaml`이 `hierarchical_allreduce`를
 선택하면 자동으로:
 ```python
 # Worker 예
 dp = DPPolicy(cube="column_wise", pe="column_wise")
 tensor = torch.zeros((1, intra_sip_pes * n_elem), dp=dp, name="in")
 # tensor는 각 SIP의 모든 PE에 1 tile씩 분산 (multi_pe_sip_local validator 통과)
 dist.all_reduce(tensor, op="sum")
 ```
 Worker 코드 자체는 알고리즘 종류를 모름 (`ccl.yaml` 선택에 의존). 단,
 **DPPolicy가 hierarchical 요구와 일치해야** 함 — `cube/pe="column_wise"`
 같은 SIP-내 분산을 하는 DPPolicy여야 `multi_pe_sip_local` 검증 통과. 이
 DPPolicy 선택은 bench 설정 또는 sample bench에서 결정.
 ---
 ## Dependencies
 - **ADR-0024**: Launcher, `all_pes` mapper, `multi_pe_sip_local` validator,
  registry + import path. 본 ADR 구현의 전제.
 - **ADR-0025**: IPCQ direction addressing — cube/pe/SIP 간 다중 direction을
  동시 사용하므로 정확한 direction 매칭 필수.
 - **ADR-0023**: IPCQ protocol (neighbor table, send/recv, credit return).
 - **기존 `hierarchical_allreduce.py`**: 본 ADR은 그 커널의 재해석 + 주변
  framework integration.
 ---
 ## Non-goals
 - **ADR-0024 framework 변경**: 재활용만.
 - **Alternative reduce topology (tree-in-tree 등)**: 3-level ring이 첫 구현.
 - **Dynamic level count**: 현재 SIP/cube/PE 3단 고정. 2단 (SIP + PE, cube
  skip) 또는 4단 이상은 future.
 - **Bandwidth-optimal schedule tuning**: reduce round 수 / chunk size 조정
  같은 tuning은 별도.
 - **Pipelined hierarchical**: 여러 chunk를 파이프라인으로 겹쳐서 돌리는
  NCCL-style 최적화는 future.
 ---
 ## Open questions
 ### 🟠 중간 영향 — 구현 시 결정 필요
 - **`topologies.py` 스키마 확장**: 기존 `ring_1d` 등은 단일 레벨 `(rank →
  {dir: peer})`. `hierarchical_3level`은 multi-level. `_resolve_topology`가
  둘을 모두 반환할 수 있도록 schema를 일반화할지, 아니면 hierarchical 전용
  return type을 두고 builder가 분기할지.
  - Option A: 모든 topology를 neighbor-list 형태로 단일화
    (`[{direction, peer_sip, peer_cube, peer_pe}, ...]`)
  - Option B: topology 모듈이 `kind` 필드 제공, builder가 분기
  - 권장: Option A (single source of truth, ADR-0024 Open Q의
    "PE-level topology 일원화" 방향과 일치)
 - **`hierarchical_3level` vs algorithm별 topology 모듈**: 향후 mesh-based
  hierarchical 등 variant이 생기면? `hierarchical_3level` 같은 이름이 이미
  topology-specific. 변형은 새 key 추가 (`hierarchical_mesh_3level` 등) 또는
  알고리즘 모듈에서 topology 생성 override.
 ### 🟡 Nice-to-have
 - **Reduce round 수 최적화**: Bidirectional ring은 `ceil((N-1)/2)` round.
  Non-power-of-2 group size에서 idle PE 발생 가능.
 - **Non-uniform topology 대응**: cube_mesh가 w != h일 때 inter-cube ring
  balance.
 - **Single SIP 케이스**: world_size = 1 (SIP 1개)일 때 Level 3 skip. Degenerate
  case 검증.
 ### 🟢 Framework evolution 시사점 (ADR-0024로부터 이관)
 - **PE-level topology 일원화 (중장기)**: 현 설계는
  - topology (rank graph 또는 level-separated)
  - mapper (per-SIP PE set)
  - `_build_pe_installs` (actual edges)
  의 3단 분산. Hierarchical이 이 분산을 가장 스트레스 받는 케이스. 중장기로는
  `topologies.py`가 PE-level neighbor list를 직접 반환하고 mapper는 단순히
  "어느 PE가 참여하느냐"만 결정, `_build_pe_installs`는 flat
  mapping으로 단순화되는 방향이 자연스러움. **본 ADR에서 Option A를 채택**하면
  이 방향으로 이미 정합.
 ---
 ## Test strategy
 ### T1. Topology generator
 `tests/test_hierarchical_topology.py` (new):
 - `hierarchical_3level(rank, world_size, spec)` → 각 level의 neighbor set이
  예상 구조인지 (intra-cube는 ring, inter-cube는 cube-leader만 참여, inter-SIP은
  SIP-leader만 참여)
 - 2 SIP × 4 cubes × 4 PEs 같은 작은 토폴로지로 수작업 검증 가능
 - Symmetry: PE p의 E neighbor가 가리키는 peer에서 W neighbor가 다시 p를 역포인팅
 ### T2. Install plan — hierarchical × all_pes
 `tests/test_ccl_install_plan.py` (확장):
 - `build_install_plans(algorithm="hierarchical_allreduce", mapper="all_pes",
  validator="multi_pe_sip_local")` 호출 시
  - 각 SIP의 모든 PE가 `participating_pes`에 포함
  - PE 0 (cube leader)만 inter-cube neighbor를 가짐
  - (cube 0, pe 0) (SIP leader)만 inter-SIP neighbor를 가짐
  - Non-leader PE는 intra-cube neighbor만
 ### T3. Kernel unit — mock runtime
 `tests/test_hierarchical_mock_runtime.py` (new):
 - `run_kernel_in_mock` (kernbench.ccl.testing)을 확장해 multi-level 지원
 - 2 SIP × 2 cubes × 4 PEs (총 16 PE) 토폴로지에서 각 PE의 초기 tile을 (flat PE index + 1), 즉 1..16으로 채우고
  hierarchical all-reduce 실행
 - 모든 PE의 최종 결과가 `sum(1..16)`인지
 ### T4. E2E — 실제 SimPy backend
 `tests/test_ccl_allreduce_matrix.py` (확장):
 - `hierarchical @ ws=SIP_count`: multi_pe_sip_local layout + 3-level 알고리즘
  전체 stack 통과 검증
 ### T5. Validator enforcement
 - `multi_pe_sip_local` validator가 wrong layout (예: leader_only 스타일 1
  shard per rank) 입력에 raise
 ### T6. 회귀
 기존 ring/mesh/tree 알고리즘 모두 그대로 통과. 본 ADR은 그들을 건드리지 않음.
 ---
 ## Consequences
 ### Positive
 - **Intra-SIP PE 활용도 증가**: 모든 PE가 intra-cube / inter-cube reduce에
  참여하므로 `leader_only` 대비 전체 PE 가동률 향상. (Level 3 진행 중 non-leader
  PE의 유휴는 pipelining 도입 전까지 남는다 — Non-goals 참고.)
 - **Multi-level bandwidth 활용**: cube NoC, UCIe 모두 작동 → 더 정확한 HW 모델.
 - **ADR-0024 framework 검증**: `all_pes` mapper + `multi_pe_sip_local`
  validator의 첫 non-trivial use case. Framework 설계 타당성 확인.
 - **기존 커널 재활용**: `hierarchical_allreduce.py` 큰 구조 유지, SIP-local
  좌표만 재해석.
 ### Negative
 - **`topologies.py` schema 확장 필요**: Single-level vs multi-level 표현.
  해결안(Option A)은 기존 ring/mesh/tree의 마이그레이션 비용 유발.
 - **Validator / mapper 조합 요구**: 사용자가 DPPolicy를
  `multi_pe_sip_local`에 맞춰 선택해야 함 (bench 설정 복잡도 증가).
 ### Neutral
 - 본 ADR 구현 전까지 `hierarchical_allreduce.py`는 deprecated 상태 유지 또는
  ADR-0024 matrix test에서 제외. 현재 파일을 곧바로 삭제하지는 않음.
 ---
 ## Affected files
 | File | Change |
 |------|--------|
 | `src/kernbench/ccl/topologies.py` | D3: `hierarchical_3level` topology 함수 추가. (Option A 채택 시) 기존 topology 출력 format 통일 |
 | `src/kernbench/ccl/install_plan.py` | D4: hierarchical builder 분기 (또는 단일 builder가 level 개수로 dispatch) |
 | `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` | D5: SIP-local 좌표로 kernel 재작성, `kernel_args` keyword-only signature |
 | `ccl.yaml` | D2: `hierarchical_allreduce` 엔트리 추가 (`mapper: all_pes`, `validator: multi_pe_sip_local`, `topology: hierarchical_3level`) |
 | `tests/test_hierarchical_topology.py` (new) | T1 |
 | `tests/test_ccl_install_plan.py` | T2 확장 |
 | `tests/test_hierarchical_mock_runtime.py` (new) | T3 |
 | `tests/test_ccl_allreduce_matrix.py` | T4: hierarchical row 추가 |
@@ -0,0 +1,347 @@
 # ADR-0030: IPCQ Physical Addressing — PhysAddr integration
 ## Status
 Proposed (Blocked on ADR-0031 — PhysAddr PE-resource extension)
 ## Context
 ### 목표
 IPCQ ring buffer의 주소 체계를 ADR-0023의 **synthetic parallel namespace**
 (`_IPCQ_BASE = 1<<60`)에서 **ADR-0001의 PhysAddr**로 이관한다. Routing /
 allocator / MemoryStore의 정합성을 회복하고, buffer_kind (tcm/hbm/sram)별
 physical backing을 구조적 좌표로 표현한다.
 ### 현재 상태 (ADR-0023 D2.5)
 `src/kernbench/ccl/install.py:52-56`:
 ```python
 _IPCQ_BASE = 1 << 60
 def _ipcq_base_for_pe(sip, cube, pe):
    return _IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
 def rx_base(s, c, p, d):
    return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
 ```
 - **bit 60** 사용 → ADR-0001의 51-bit PhysAddr 공간 밖 (`MAX_51 = (1 << 51) - 1`)
 - `PhysAddr.decode(addr)` → `PhysAddrError("addr must be a 51-bit value")`
 - `IpcqEndpoint.rx_base_pa: int` — 타입이 raw int, 구조 없음
 - `buffer_kind` (tcm/hbm/sram)와 synthetic 주소의 관계가 coupling 없음
 - Allocator (`PEMemAllocator`) 우회 — synthetic unique id per (sip, cube, pe,
  direction). 진짜 physical allocation이 아님
 ADR-0023 D2.5 원문:
 > This bypasses the topology's address resolver / PhysAddr encoding and
 > treats IPCQ buffers as a separate, parallel address namespace. Real PA
 > encoding can be plugged in later without changing the rest of the design.
 "later"가 이 ADR.
 ### 왜 지금 다루는가
 - ADR-0025 (direction addressing)은 주소-기반 매칭으로 전환. 주소가 correctness에
  직접 기여 → 주소 체계가 설계 관점에서 더 중요해짐
 - ADR-0001의 "Routing consumes decoded domains, not raw bit-fields" 계약 위반
  지속 → 기술 부채
 - Routing fabric (cube_noc / UCIe)은 PhysAddr.decode()로 destination을 정함.
  IPCQ의 synthetic 주소가 fabric routing에서 실제로 어떻게 처리되는지 **검증되지
  않음** (별도 경로로 배달되는 것으로 추정)
 - TCM / HBM / SRAM의 실제 memory layout과 IPCQ ring buffer 위치가 **disjoint**
  → allocator가 IPCQ 영역을 모르므로 실수로 겹칠 가능성 (현재는 bit 60로 완전
  분리되어 문제 없지만 설계 원칙상 건강하지 않음)
 ### 풀어야 할 문제
 1. **IPCQ ring buffer의 PhysAddr 표현**: buffer_kind별로 어떤 PhysAddr factory를
   쓸지.
 2. **PhysAddr 공간 부족 가능성**: 51-bit 공간에 IPCQ 버퍼를 담을 여유가 있는지.
 3. **Allocator 통합**: `PEMemAllocator`에 IPCQ buffer 영역 예약 기능 추가, 또는
   기존 pool에서 정상 allocation.
 4. **MemoryStore space naming 정리**: 현재는 `{"tcm", "hbm", "sram"}` 문자열로
   space 구분. IPCQ buffer도 이 space에 속하면 일반 data와 주소 겹침 방지 필요.
 5. **Routing fabric 통합**: PhysAddr 기반 routing이 IPCQ 토큰을 올바른 SIP의
   올바른 메모리로 배달.
 6. **ADR-0025와의 정합**: 주소-기반 매칭이 PhysAddr에서도 동일하게 작동.
 ---
 ## Decision
 ### D1. IPCQ ring buffer = PhysAddr factory 사용
 각 `buffer_kind`가 해당하는 PhysAddr factory를 호출:
 | buffer_kind | PhysAddr factory | 필요한 인자 |
 |---|---|---|
 | `tcm` | `PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)` | PE-local TCM |
 | `hbm` | `PhysAddr.pe_hbm_addr(rack_id, sip_id, cube_id, pe_id, pe_local_hbm_offset, slice_size_bytes)` | PE-local HBM slice |
 | `sram` | `PhysAddr.cube_sram_addr(rack_id, sip_id, cube_id, sram_offset)` | Cube-shared SRAM |
 Install plan builder (`build_install_plans` in ADR-0024)가 각 PE의 rx_base를
 계산할 때:
 ```python
 # ADR-0030 후 install_plan.py (pseudocode)
 def _compute_rx_base(sip, cube, pe, direction_idx, buffer_kind, n_slots, slot_size,
                     allocator_pool, rack_id=0) -> PhysAddr:
    bytes_per_direction = n_slots * slot_size
    offset = direction_idx * bytes_per_direction
    if buffer_kind == "tcm":
        # TCM base (per-PE) + direction offset
        tcm_base = allocator_pool.reserve_pe_tcm_for_ipcq(sip, cube, pe,
                                                          total_bytes=N_DIR * bytes_per_direction)
        return PhysAddr.pe_tcm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
                                      pe_id=pe, tcm_offset=tcm_base + offset)
    elif buffer_kind == "hbm":
        hbm_base = allocator_pool.reserve_pe_hbm_for_ipcq(sip, cube, pe,
                                                          total_bytes=...)
        return PhysAddr.pe_hbm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
                                      pe_id=pe, pe_local_hbm_offset=hbm_base + offset,
                                      slice_size_bytes=slice_size)
    elif buffer_kind == "sram":
        sram_base = allocator_pool.reserve_cube_sram_for_ipcq(sip, cube,
                                                               total_bytes=...)
        return PhysAddr.cube_sram_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
                                         sram_offset=sram_base + offset)
 ```
 `IpcqEndpoint.rx_base_pa`는 `PhysAddr.encode()` 결과 (encoded `int`)를 담도록 변경 (필드 타입 자체는 `int` 유지):
 ```python
@dataclass(frozen=True)
 class IpcqEndpoint:
    sip: int
    cube: int
    pe: int
    buffer_kind: str
    rx_base_pa: int            # PhysAddr.encode() 결과 (51-bit)
    rx_base_va: int
    n_slots: int
    slot_size: int
 ```
 타입은 int 유지 (encoded form), 단 **반드시 PhysAddr.decode()로 복원 가능**한
 값임을 invariant으로 둔다. 디코더 호출자는 `PhysAddr.decode(rx_base_pa)`로
 구조적 좌표 획득.
 ### D2. Allocator 확장 — IPCQ 예약 API
 `PEMemAllocator`에 IPCQ 전용 예약 기능 추가:
 ```python
 class PEMemAllocator:
    def reserve_ipcq_tcm(self, total_bytes: int) -> int:
        """Reserve TCM region for IPCQ ring buffers at this PE.
        Returns tcm_offset (to be used in PhysAddr.pe_tcm_addr)."""
        # TCM에서 `total_bytes` 연속 영역 예약.
        # Tensor allocation과 겹치지 않도록.
    def reserve_ipcq_hbm(self, total_bytes: int) -> int: ...
    # cube-level allocator도 유사
 ```
 Install plan 빌더가 각 PE allocator에서 예약. 예약 결과(offset)를 PhysAddr
 factory에 전달.
 **기존 `_ipcq_base_for_pe` / `_IPCQ_BASE` 제거**.
 ### D3. MemoryStore space 통합
 현재 `MemoryStore`는 `{space_name: {addr: ndarray}}` 구조. IPCQ buffer는 일반
 tensor 데이터와 같은 space (tcm/hbm/sram)를 공유하게 됨. 주소 유일성은 ADR-0001의
 PhysAddr 계층 보장.
 Backward compatibility: 기존 IPCQ address (synthetic)을 쓰는 code path는
 **제거**하고, 모두 PhysAddr.encode() 결과만 사용. 이 자체는 API 변경이 아니라
 값 변경.
 ### D4. Routing fabric 통합
 IPCQ DMA write (`IpcqDmaToken`의 `src_addr → dst_addr`)이 PhysAddr encoding을
 사용하므로 **routing fabric이 `PhysAddr.decode(dst_addr)`로 destination
 SIP/cube/PE를 정확히 찾을 수 있음**. Fabric routing 로직 변경 없음 (기존에도
 PhysAddr.decode를 쓰는 것으로 추정).
 **검증 필요**: 현재 fabric이 bit 60 synthetic 주소를 어떻게 라우팅하는지 확인.
 별도 경로가 있다면 제거, PhysAddr 경로로 통합.
 ### D5. ADR-0025와의 정합
 ADR-0025의 주소-기반 매칭 (dst_addr로 direction 식별)은 PhysAddr.encode()
 결과를 비교하는 것으로 자연스럽게 호환. 변경 없음.
 다만 debug / diagnostic 향상 가능:
 ```python
 # pointer_dump 등에서
 print(f"E: rx_base_pa={PhysAddr.decode(qp.peer.rx_base_pa)}")
 # 출력 예: PhysAddr(sip=1, cube=0, pe=0, kind="pe_resource", unit_type=PE, ...)
 ```
 이전 synthetic 주소는 decode 불가 → diagnostic 질 저하. PhysAddr 전환으로 개선.
 ### D6. ADR-0023 D2.5 amendment
 ADR-0023의 "bypasses PhysAddr encoding" 문구를 **Accepted fallback → now
 replaced by ADR-0030**으로 수정. 본 ADR이 적용되면 ADR-0023 D2.5의 "Real PA
 encoding can be plugged in later" 약속이 이행된 것.
 ---
 ## Migration strategy
 단계적 전환 (한 PR로 하지 않는다):
 ### Phase 1: PhysAddr 공간 재검토
 - 51-bit PhysAddr 공간에 IPCQ ring buffer가 실제로 들어갈 수 있는지 확인.
 - 각 buffer_kind (tcm/hbm/sram)별 factory가 제공하는 `local_offset` 범위가
  IPCQ 요구 (4 direction × n_slots × slot_size)를 수용 가능한지.
 - 부족하면 PhysAddr layout 자체 확장 (ADR-0001 amendment 별도 필요).
 ### Phase 2: Allocator API 확장
 - `PEMemAllocator.reserve_ipcq_*` 메소드 추가.
 - 기존 tensor allocation과 영역 충돌 방지.
 ### Phase 3: Install plan builder 전환
 - `_ipcq_base_for_pe` 제거, PhysAddr factory 호출로 대체.
 - `IpcqEndpoint.rx_base_pa`가 PhysAddr.encode() 결과 (51-bit).
 ### Phase 4: Routing fabric 검증
 - IPCQ DMA token이 fabric 정상 경로로 배달되는지 확인.
 - 별도 fast-path가 있다면 제거, 통합.
 ### Phase 5: MemoryStore space 검증
 - IPCQ buffer 주소가 기존 tensor 주소와 겹치지 않는지.
 - Allocator 레벨에서 이미 예약했으므로 정상적으로 분리되어야 함.
 ### Phase 6: ADR-0023 D2.5 업데이트 + 기존 sideband path 제거 (이 단계로 migration 완료)
 ---
 ## Dependencies
 - **ADR-0031** (PhysAddr PE-resource extension) — **Blocker**: PhysAddr가 PE
  resource (특히 IPCQ ring buffer)를 충분히 표현할 수 있도록 schema 확장이
  선행되어야 함. 본 ADR은 ADR-0031 완료 후에만 실행 가능.
 - **ADR-0001** (PhysAddr layout): 본 ADR의 기반. 51-bit 공간 / factory API의
  ADR-0031 확장본을 사용.
 - **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023 D2.5의 "later" 약속 이행.
  D9 piggyback / credit return 프로토콜 자체는 불변.
 - **ADR-0024** (launcher + install_plan.py): `build_install_plans`가 PhysAddr
  factory를 호출하게 됨.
 - **ADR-0025** (direction addressing): 주소-기반 매칭이 PhysAddr에서도 동일하게
  작동. 변경 없음.
 ---
 ## Non-goals
 - **ADR-0001 PhysAddr layout 자체 변경**: 51-bit 공간과 segment 구조는 유지.
  부족 시 별도 ADR.
 - **IPCQ protocol semantic 변경**: ADR-0023 D9 piggyback 등 프로토콜 로직 유지.
 - **Allocator 전반 재설계**: IPCQ 예약 API 추가만.
 ---
 ## Open questions
 ### 🔴 Critical — Migration 전 반드시 검증
 - **PhysAddr 51-bit 공간에 IPCQ 버퍼가 실제로 들어가는가**: 각 PE의 TCM
  영역에서 `4 direction × n_slots (default 4) × slot_size (default 4KB)` =
  64KB가 PE TCM 공간에 수용 가능. TCM size (e.g., 16MB) 대비 충분. HBM도 여유
  많음. SRAM은 cube 공유라 direction × PE 곱이 있음 — 별도 검증 필요.
 - **Routing fabric의 현재 IPCQ 주소 처리**: 현재 synthetic 주소가 fabric에서
  어떻게 routing되는지 trace 필요. `PhysAddr.decode()`로 판독 불가한 값이
  fabric에서 정상 배달된다면 어떤 경로를 쓰는지 조사.
 ### 🟡 Nice-to-have
 - **IPCQ 전용 kind / sub_offset 인코딩**: `UnitType.PE`의 sub_offset 공간을
  IPCQ와 공유. 충돌 방지를 위해 IPCQ 전용 sub-space 정의할지 여부.
 - **Debug tool**: `pointer_dump`를 PhysAddr 포매팅으로 개선.
 ---
 ## Test strategy
 ### T1. PhysAddr round-trip
 `tests/test_ipcq_physaddr.py` (new):
 - `PhysAddr.pe_tcm_addr(...)` → encode → decode → 동일 필드 복원
 - TCM / HBM / SRAM 각 factory에 대해
 ### T2. Allocator 예약
 `tests/test_ipcq_alloc.py` (new):
 - `PEMemAllocator.reserve_ipcq_tcm` → 반환된 offset이 valid TCM 영역
 - 중복 예약 → 에러 또는 non-overlapping offset
 - Tensor allocation과 충돌 없음
 ### T3. Install plan PhysAddr integration
 `tests/test_ccl_install_plan.py` (확장):
 - `build_install_plans` 결과의 `rx_base_pa`가 PhysAddr.decode() 가능
 - Decoded 좌표가 plan의 (sip, cube, pe)와 일치
 - I3.1 invariant (ADR-0025 D6) — rx_base range disjointness가 PhysAddr에서도 성립
 ### T4. Routing — IPCQ DMA fabric traversal
 `tests/test_ipcq_routing.py` (new):
 - Cross-SIP IPCQ send → fabric이 `PhysAddr.decode(dst_addr)`로 destination SIP
  정확히 판단 → 올바른 MemoryStore에 write
 - UCIe 경로 / cube_noc 경로 모두 검증
 ### T5. 회귀
 - 기존 IPCQ E2E 테스트 (ring, mesh, tree) 모두 통과
 - ADR-0024, ADR-0025 통합 테스트 통과
 ---
 ## Consequences
 ### Positive
 - **ADR-0001 정합성 회복**: routing과 addressing이 단일 체계.
 - **buffer_kind 명확**: TCM/HBM/SRAM이 구조적 좌표로 구분.
 - **Debug 향상**: PhysAddr.decode()로 사람이 읽을 수 있는 좌표.
 - **Allocator 통합**: IPCQ 영역이 정상 예약 → tensor와의 충돌 리스크 사전 차단.
 - **Fabric routing 일원화**: 별도 경로 없이 기존 PhysAddr-based routing 재활용.
 ### Negative
 - **Migration 복잡도**: 6 Phase 단계적 전환 필요. 각 Phase마다 regression 리스크.
 - **PhysAddr 공간 검증 부담**: Phase 1에서 TCM/HBM/SRAM 공간이 IPCQ 요구를
  수용하는지 실측 필요.
 - **Routing fabric 검증**: 현재 fabric이 synthetic 주소를 어떻게 처리하는지
  조사 필요.
 ### Neutral
 - IPCQ protocol semantic (ADR-0023 D9 등) 불변.
 - ADR-0025의 direction addressing 로직 불변.
 ---
 ## Affected files
 | File | Change |
 |------|--------|
 | `src/kernbench/ccl/install.py` | `_IPCQ_BASE`, `_ipcq_base_for_pe` 제거 |
 | `src/kernbench/ccl/install_plan.py` (ADR-0024) | D1: PhysAddr factory 호출로 rx_base 계산 |
 | `src/kernbench/policy/address/allocator.py` (or similar) | D2: IPCQ 예약 API (`reserve_ipcq_tcm` 등) |
 | `src/kernbench/common/ipcq_types.py` | D1: `IpcqEndpoint.rx_base_pa` 문서화 — PhysAddr.encode 결과 |
 | `src/kernbench/sim_engine/memory_store.py` | D3: IPCQ buffer가 기존 space와 공유되는지 검증 |
 | `src/kernbench/sim_engine/engine.py` | D4: IPCQ token routing이 PhysAddr-based fabric 경로 사용 |
 | `src/kernbench/ccl/diagnostics.py` | D5: pointer_dump를 PhysAddr 포매팅으로 개선 |
 | `docs/adr/ADR-0023-ipcq-pe-collective.md` | D6: D2.5 amendment note |
 | `tests/test_ipcq_physaddr.py` (new) | T1 |
 | `tests/test_ipcq_alloc.py` (new) | T2 |
 | `tests/test_ccl_install_plan.py` | T3 확장 |
 | `tests/test_ipcq_routing.py` (new) | T4 |
@@ -0,0 +1,257 @@
 # ADR-0031: PhysAddr PE-Resource Extension
 ## Status
 Stub (Blocker for ADR-0030 — specific range allocations TBD)
 ## Context
 ### 목표
 ADR-0001의 `PhysAddr` schema를 **PE 내부의 다양한 resource**를 체계적으로
 표현할 수 있도록 확장한다. ADR-0030 (IPCQ PhysAddr integration) 및 향후의
 PE-local resource 추가 (scratchpad, register file, status register, 등)의
 기반을 제공한다.
 ### 현재 상태 (ADR-0001)
 51-bit PhysAddr layout:
 ```
 [50:47] rack_id  (4)
 [46:43] sip_id   (4)
 [42:38] sip_seg  (5)   # cube_id
 [37:0]  local_offset (38)
 ```
 `local_offset` (38 bits) 내부:
 - `[37]` selector: 1 = HBM window (128GB), 0 = PE resource window
 - PE resource window는 `unit_type` (3 bits: PE | MCPU | SRAM) +
  `pe_id` (4 bits) + `ext` (1 bit) + `sub_offset` (29 bits)로 구성 (합계 37 bits)
 Factory API:
 - `PhysAddr.hbm_addr(...)` — HBM generic
 - `PhysAddr.pe_hbm_addr(...)` — PE-local HBM slice
 - `PhysAddr.pe_tcm_addr(...)` — PE TCM (via `UnitType.PE` + `sub_offset`)
 - `PhysAddr.cube_sram_addr(...)` — Cube-shared SRAM
 ### 풀어야 할 문제
 1. **PE 내부 resource 구분의 명시적 체계 부재**: 현재 `local_offset` (38 bits)
   이 평면 공간으로 취급되고, PE TCM / IPCQ ring / scratchpad / 향후 register
   file 등이 관습적 offset 범위로만 구분됨. Schema 레벨에서 명확하지 않음.
 2. **IPCQ 주소의 PhysAddr 표현 부재**: ADR-0030이 IPCQ ring buffer를 PhysAddr로
   표현하려면 "이 주소가 IPCQ 영역"을 decode 가능해야 함. 현재는 불가.
 3. **향후 PE resource 확장 경로**: register file, performance counter 등
   추가 시 일관된 위치 할당 규칙 필요.
 ### 설계 방향 — local_offset을 PE 컴포넌트별 range로 분할
 `local_offset` (38 bits = 256GB per PE segment)을 **PE 컴포넌트마다 고정
 range**로 나누어 할당한다. 각 range는 해당 컴포넌트 전용 주소 공간이며,
 `PhysAddr.decode()`가 주소가 어느 range에 속하는지 판별해 해당하는 `kind` /
 `unit_type` / `sub_type` 필드를 채운다.
 개념적 구조 (구체적 bit 할당은 **TBD**):
 ```
 local_offset [37:0]  (38 bits total)
 ├── HBM window           [37] = 1    (기존 128GB)
 ├── PE component ranges  [37] = 0
 │   ├── TCM              [range_1]
 │   ├── IPCQ rings       [range_2]
 │   ├── Scratchpad       [range_3]
 │   ├── Register file    [range_4]
 │   ├── (reserved)       ...
 │   └── Sideband / status [range_N]
 ```
 ### 왜 range-based partition인가
 - **Schema-level 명시성**: 주소 하나 보고 어느 컴포넌트의 자원인지 decode 가능.
  "Routing consumes decoded domains" (ADR-0001 D5) 계약 충족.
 - **Unit type enum 확장보다 유연**: 3-bit `UnitType` 공간을 고갈시키지 않고
  세분화 가능. 미래 추가 컴포넌트도 빈 range 할당.
 - **Allocator 통합 자연**: 각 PE-level allocator가 관리하는 하위 pool을
  address range와 1:1 매칭 (e.g., `reserve_ipcq_tcm()` → IPCQ range 안에서만
  할당).
 - **Decode routing 단순**: `PhysAddr.decode(addr)`가 range table을 참조해
  `kind` + sub-field를 채움. 기존 HBM selector bit 패턴의 일반화.
 ### 왜 지금 다루는가
 - ADR-0030 (IPCQ PhysAddr 통합)이 이 확장에 **의존**. ADR-0030 단독 진행 시
  `sub_offset` 공간을 불투명하게 재사용하게 되어 ADR-0001 계약 미충족.
 - PE 내부 자원이 더 추가될 가능성 — 지금 구조를 정리해두면 일관된 확장 경로 확보.
 ---
 ## Decision (pending specific range allocation)
 ### D1. Range-based local_offset partition — approach
 `local_offset`을 고정 byte range로 분할하고, 각 range를 PE 컴포넌트에 할당한다.
 주소가 어느 range에 속하는가로 `kind` / component type을 결정한다.
 ```python
 # src/kernbench/policy/address/phyaddr.py (conceptual, post-extension)
@dataclass(frozen=True)
 class PeResourceRange:
    name: str                # e.g. "tcm", "ipcq", "scratchpad", "regfile"
    start_offset: int        # local_offset 내 시작
    end_offset: int          # exclusive
    byte_size: int           # end - start
 PE_RESOURCE_MAP: tuple[PeResourceRange, ...] = (
    # TBD — 구체적 range 할당은 사용자가 별도 업데이트
 )
 ```
 `PhysAddr.decode(addr)`의 PE resource 경로는:
 ```python
 def decode_pe_resource(local_offset: int) -> dict:
    for r in PE_RESOURCE_MAP:
        if r.start_offset <= local_offset < r.end_offset:
            return {
                "kind": "pe_resource",
                "component": r.name,                 # NEW: "tcm"/"ipcq"/...
                "component_offset": local_offset - r.start_offset,  # within range
            }
    raise PhysAddrError(f"local_offset {local_offset} not in any PE range")
 ```
 ### D2. Specific range allocations — **TBD**
 > 사용자가 구체적 byte 할당을 별도로 정의한 뒤 본 ADR에 업데이트.
 >
 > 필요 정보:
 > - 각 컴포넌트 (TCM, IPCQ, scratchpad, regfile, ...)의 이름 / byte size
 > - `local_offset` 내 시작 offset (align 고려)
 > - 현재 하드웨어 사양 / 시뮬레이션 요구 반영
 이 섹션이 채워진 뒤 ADR status: **Stub → Proposed → Accepted** 승격.
 ### D3. Factory API — per-component 함수
 기존 `PhysAddr.pe_tcm_addr(...)` 패턴을 일반화:
 ```python
 # 기존 (이미 존재)
 PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)
 # 신규 (ADR-0031 후 추가)
 PhysAddr.pe_ipcq_addr(rack_id, sip_id, cube_id, pe_id, ipcq_offset)
 PhysAddr.pe_scratchpad_addr(...)
 PhysAddr.pe_regfile_addr(...)
 # ...
 ```
 각 factory는 해당 컴포넌트의 range 내에서 `component_offset`만 받아 최종
 PhysAddr encoding. 호출자는 어느 range인지 몰라도 됨.
 ### D4. Backward compatibility
 - 기존 `pe_tcm_addr()` signature / semantic 유지.
 - 내부 인코딩만 신규 range table을 참조하도록 변경.
 - 기존 `UnitType.PE` decoding 경로는 `PE_RESOURCE_MAP`의 "tcm" range에
  대응하도록 매핑 → 기존 코드에 transparent.
 - 기존 코드가 `PhysAddr.decode(addr).unit_type == UnitType.PE`를 체크하는
  경우는 여전히 유효 (TCM 주소는 계속 PE unit_type).
 ---
 ## Open questions
 ### 🔴 Pending user input (ADR 승격 blocker)
 - **D2의 specific range allocation**: 사용자가 구체적 byte 할당 테이블을
  제공해야 Stub → Proposed 승격 가능. 필요 정보:
  - 컴포넌트 목록 (TCM, IPCQ, scratchpad, regfile 등)
  - 각 컴포넌트의 byte size / 시작 offset
  - Alignment 요구사항 (4KB / page-aligned 등)
 ### 🟡 설계 세부 — range allocation 결정 과정에서 함께 결정
 - **총 local_offset space 배분**: HBM window (bit 37 = 1, 128GB)을 유지할지,
  아니면 PE resource space를 확장하기 위해 HBM window 축소할지.
 - **Range padding / reserved space**: 미래 컴포넌트 추가를 위한 "reserved"
  range 몇 개를 미리 확보할지.
 - **Address alignment**: 각 range의 시작 offset이 특정 alignment (page /
  cache line) 만족해야 하는지.
 - **Diagnostic / debug 포맷**: `PhysAddr.decode()` 출력에서 component 이름 +
  component_offset을 사람이 읽기 좋게 표시 (e.g., "IPCQ ring sip=0 cube=0 pe=3
  offset=0x1234").
 - **기존 `UnitType` enum의 role**: Range-based 접근 후에도 `unit_type` 필드
  유지할지 (decode 결과에 `component` 추가), 또는 enum 대체할지.
 ### 🟢 ADR-0030 연동 질문
 - **IPCQ range 내 direction/slot 표현**: PhysAddr는 `component_offset` 단위
  까지만 표현. "direction=E, slot=2"는 IPCQ range 내 offset 계산으로 도출
  (`direction_idx * slot_region_size + slot_idx * slot_size`) — 이 공식은
  ADR-0030 scope에서 구체화.
 - **Allocator pool 구조**: `PEMemAllocator`가 여러 range (TCM, IPCQ,
  scratchpad)를 개별 pool로 관리할지, 단일 pool에서 kind별 reserved만 관리
  할지. Range-based schema면 개별 pool이 자연스러움.
 ---
 ## Non-goals (this ADR)
 - **51-bit 전체 layout 재작성**: 본 ADR은 `local_offset` (38 bits) 내부의
  subdivision만 다룬다. Rack / SIP / cube segment 같은 상위 bit 구조는
  불변.
 - **`UnitType` enum 재설계**: range-based 접근으로 대체 가능하지만, 기존 enum
  (PE / MCPU / SRAM)은 backward compat 위해 유지.
 - **Dynamic range allocation**: runtime에 range 크기 바꾸는 기능 불필요. 모든
  range는 컴파일 / 설정 시점에 고정.
 - **Multi-process / multi-rack partitioning**: PE 내부 resource만 다룸.
 ---
 ## Action
 ### Phase 1 — User 입력: specific range allocation (**Blocker**)
 - 사용자가 정의한 PE 컴포넌트별 byte range를 D2에 기입:
  - `PE_RESOURCE_MAP` 테이블 내용 (name, start_offset, byte_size per 컴포넌트)
  - 각 컴포넌트의 hardware spec 근거 note
 ### Phase 2 — ADR Stub → Proposed 승격
 - D2 채워지면 status 변경.
 - Open questions의 "🔴 Pending user input" 블록 제거.
 - ADR-0001에 amendment note 초안 작성.
 ### Phase 3 — 구현
 - `PhysAddr` range-based decode 구현.
 - 신규 factory 함수 (`pe_ipcq_addr`, `pe_scratchpad_addr` 등 컴포넌트별)
  추가.
 - 기존 `pe_tcm_addr` 내부 인코딩만 신규 range table 참조하도록 수정
  (signature 불변).
 - 기존 코드 경로 회귀 확인.
 ### Phase 4 — ADR-0030 unblock
 - ADR-0030 "Blocked" 상태 해제.
 - Install_plan builder가 `pe_ipcq_addr(...)` 등 확장된 factory 호출하도록
  수정.
 ---
 ## Dependencies
 - **ADR-0001** (PhysAddr layout): 본 ADR은 ADR-0001의 확장.
 - **ADR-0023** (IPCQ protocol): IPCQ ring buffer의 주소 체계를 PhysAddr로
  통합할 수 있게 하는 기반.
 - **ADR-0030** (IPCQ PhysAddr integration): 본 ADR에 blocked.
 ---
 ## Affected files (future, after promotion to Proposed)
 | File | Change |
 |------|--------|
 | `src/kernbench/policy/address/phyaddr.py` | Range table (`PE_RESOURCE_MAP`), range-based decode, 신규 component-specific factory들 (`pe_ipcq_addr` 등), 기존 `pe_tcm_addr` 내부 인코딩 갱신 |
 | `src/kernbench/policy/address/allocator.py` | Range-aware pool 분리 (TCM pool / IPCQ pool / scratchpad pool 등 per-PE) |
 | `docs/adr/ADR-0001-physaddr-layout.md` | Amendment note: range-based PE resource partition |
 | `tests/test_phyaddr.py` | Range table 검증, 각 factory의 encode/decode round-trip, 기존 `pe_tcm_addr` 회귀 |
@@ -0,0 +1,592 @@
 # CCL Algorithm Author Guide (English)
 This document is a step-by-step guide for engineers writing CCL
 (Collective Communication Library) algorithms in kernbench. The
 internal system design and component structure live in
 [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md).
 The goal here is to clearly separate **what an algorithm author has to
 touch** from **what they can leave alone**, and to get a first
 algorithm running through the shortest possible path.
 ---
 ## 0. Five-minute tour
 | Things you touch | Location |
 |------------------|----------|
 | Algorithm module (kernel + optional `neighbors()`) | `src/kernbench/ccl/algorithms/<algo>.py` |
 | Algorithm registration | `ccl.yaml` |
 | Host bench (rank count, init, launch, verify) | `benches/<your_bench>.py` |
 | (Optional) unit test | `tests/test_<algo>.py` |
 | Things you do NOT touch | Location |
 |--------------------------|----------|
 | TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
 | Framework (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
 | PE_IPCQ / PE_DMA components | `src/kernbench/components/builtin/` |
 | Backend implementation (`install_ipcq`) | `src/kernbench/runtime_api/distributed.py` and `kernbench/ccl/install.py` |
 Workflow:
 1. Write a `kernel` function in the algorithm module.
 2. Register an entry in `ccl.yaml`.
 3. Write a host bench using `torch.distributed.init_process_group` /
   `torch.distributed.all_reduce` (the unified `benches/ccl_allreduce.py`
   handles the common case).
 4. (Optional) Run the mock runtime for fast unit tests (a few ms).
 5. `kernbench run --bench <name> --verify-data` for full SimPy verification.
 ---
 ## 1. Hello World — the simplest send/recv
 Each PE sends its tile to its E neighbor once and receives a tile from
 its W neighbor once. The reference code lives in
 [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py).
 ### Step 1: write the kernel
 New file `src/kernbench/ccl/algorithms/hello_send.py`:
 ```python
 """Hello world: send your tile to the next rank, receive from the previous one."""
 def kernel(t_ptr, n_elem, tl):
    # Global rank is computed from program_id(0/1) (ADR-0022).
    local_pe = tl.program_id(axis=0)
    cube_id = tl.program_id(axis=1)
    pes_per_cube = tl.num_programs(axis=0)
    rank = cube_id * pes_per_cube + local_pe
    nbytes = n_elem * 2  # f16
    pe_addr = t_ptr + rank * nbytes
    # Load our slice and send it east.
    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
    tl.send(dir="E", src=src)
    # Receive from west and store directly back into our slice.
    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
    tl.store(pe_addr, recv)
 def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Positional kernel args used by the ahbm backend (after t_ptr)."""
    return (n_elem,)
 ```
 Key points:
 - **Global rank is computed from `program_id(axis=0)` and
  `program_id(axis=1)`** (`rank = cube_id * pes_per_cube + local_pe`).
  TL has no contractually-supported `tl.rank` / `tl.world_size`. If the
  host needs to pass `world_size` or anything else as an algorithm
  parameter, it goes through ordinary `torch.launch` arguments.
 - **`tl.send` takes a `TensorHandle`.** PE_IPCQ reads
  `addr`/`space`/`shape`/`dtype`/`nbytes` from the handle to issue an
  `IpcqDmaToken` to PE_DMA.
 - **`tl.recv` requires `shape` and `dtype`.** The returned TensorHandle
  points at the IPCQ ring slot and can be used directly as a `dst`
  handle (e.g. `tl.store(pe_addr, recv)`). Phase 2's `dma_write` replay
  handles the (slot → hbm) copy, so user code never has to touch
  `recv.data`.
 ### Step 2: register in `ccl.yaml`
 ```yaml
 algorithms:
  hello_send:
    module: kernbench.ccl.algorithms.hello_send
    topology: ring_1d
    buffer_kind: tcm
    world_size: 8
 ```
 `world_size` here is optional. If absent, `AhbmCCLBackend` derives it
 from the topology spec (`sips × cubes_per_sip × pes_per_cube`).
 ### Step 3: write a host bench (optional — the unified bench may suffice)
 For most CCL benchmarks the existing `benches/ccl_allreduce.py` is
 sufficient: it reads `ccl.yaml`, picks the algorithm, sets up the
 process group, and runs the collective. If your algorithm needs custom
 host logic, write a new bench file along the same lines.
 The host code looks like a real PyTorch DDP worker:
 ```python
 """benches/ccl_hello.py"""
 from __future__ import annotations
 import numpy as np
 from kernbench.policy.placement.dp import DPPolicy
 N_ELEM = 8
 def worker(rank: int, world_size: int, torch) -> None:
    """Per-rank business logic — mirrors a real PyTorch DDP worker."""
    dp = DPPolicy(
        cube="replicate", pe="column_wise",
        num_cubes=1, num_pes=world_size,
    )
    tensor = torch.zeros(
        (1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in",
    )
    # Per-rank initialization via the real PyTorch idiom.
    init = np.zeros((1, world_size * N_ELEM), dtype=np.float16)
    for r in range(world_size):
        init[0, r * N_ELEM : (r + 1) * N_ELEM] = float(r + 1)
    tensor.copy_(torch.from_numpy(init))
    # The collective itself.
    torch.distributed.all_reduce(tensor, op="sum")
    # Verify on rank 0 (real PyTorch DDP idiom).
    if rank == 0:
        result = tensor.numpy()
        for r in range(world_size):
            expected = float(((r - 1) % world_size) + 1)
            slice_r = result[0, r * N_ELEM : (r + 1) * N_ELEM]
            print(
                f"  rank {r}: got {float(slice_r.mean()):.1f}, "
                f"expected {expected:.1f}"
            )
 def run(torch) -> None:
    """CLI entry point. Initializes dist, dispatches to worker."""
    dist = torch.distributed
    dist.init_process_group(backend="ahbm")
    worker(
        rank=dist.get_rank(),
        world_size=dist.get_world_size(),
        torch=torch,
    )
 ```
 ### Step 4: unit test (optional but strongly recommended)
 `tests/test_hello_send.py`:
 ```python
 import numpy as np
 from kernbench.ccl.algorithms.hello_send import kernel
 from kernbench.ccl.testing import run_kernel_in_mock
 def test_hello_send_4_ranks():
    n_elem = 8
    inputs = [
        np.full((n_elem,), float(r + 1), dtype=np.float16)
        for r in range(4)
    ]
    outputs = run_kernel_in_mock(
        kernel_fn=kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem,),
    )
    # rank r should now hold rank (r-1) % 4's data.
    for r in range(4):
        assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
 ```
 `run_kernel_in_mock` runs every rank concurrently in pure Python (no
 SimPy), so a unit test like this finishes in **milliseconds**. It only
 verifies algorithmic correctness — no latency, no DMA, no fabric.
 ### Step 5: SimPy validation
 ```bash
 kernbench run --topology topology.yaml --bench ccl_hello --verify-data
 ```
 Phase 1 runs the SimPy simulation + MemoryStore data movement; Phase 2
 replays the op_log for correctness. For every rank, the bench's `print`
 line should show a `got` value equal to its `expected` value.
 ---
 ## 2. Ring all-reduce — the second algorithm
 Slightly more complex. Each PE runs `world_size - 1` rounds, sending
 its current tile east and accumulating the tile received from the west.
 After all rounds, every PE holds the global sum.
 The reference implementation lives in
 [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py).
 The core flow:
 ```python
 """Ring all-reduce."""
 def kernel(t_ptr, n_elem, world_size, tl):
    local_pe = tl.program_id(axis=0)
    cube_id = tl.program_id(axis=1)
    pes_per_cube = tl.num_programs(axis=0)
    rank = cube_id * pes_per_cube + local_pe
    nbytes = n_elem * 2
    pe_addr = t_ptr + rank * nbytes
    # The handle points at HBM[pe_addr]. In greenlet mode .data is
    # populated, but the kernel never has to touch .data directly.
    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
    current = acc  # source for the first send
    for _step in range(world_size - 1):
        tl.send(dir="E", src=current)
        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
        # TensorHandle operator overload → MathCmd → PE_MATH dispatch.
        # Phase 1 only models timing; Phase 2 DataExecutor replays the
        # actual numpy accumulation.
        acc = acc + recv
        current = recv  # forward the received slot to the next round
    # Store the final accumulator back to HBM. Source is acc (a PE-local
    # scratch addr); dst is HBM. The op_log dma_write entry records both
    # ends so Phase 2 copies the math result into HBM at verify time.
    tl.store(pe_addr, acc)
 def kernel_args(world_size: int, n_elem: int) -> tuple:
    return (n_elem, world_size)
 ```
 Four key points:
 1. **Accumulation goes through TensorHandle operators.** `acc + recv`
   emits a `MathCmd` and dispatches it through PE_MATH — i.e. the
   real hardware path, so the latency model stays accurate. Per
   ADR-0020 D3, Phase 1 only simulates timing; Phase 2's `DataExecutor`
   replays the op_log and runs the actual numpy accumulation.
 2. **Use `current = recv` to forward.** Each round must update the send
   source to the just-received slot handle so the same data circulates
   exactly once around the ring. Setting `current = acc` would resend
   the cumulative sum, inflating the result.
 3. **`tl.store(pe_addr, acc)` exactly once at the end.** Do not use a
   store→reload pattern in the middle. `acc` lives in PE-local scratch;
   the op_log records `(src=scratch, dst=hbm)` and Phase 2 first runs
   math (filling scratch) then copies via the dma_write snapshot.
 4. **`world_size` is passed by the host explicitly.** TL only knows the
   topology slot count (e.g. `num_programs(axis=0)` is "PEs per cube"),
   not the participating CCL group size. The host bench knows
   `world_size` and forwards it as an explicit kernel argument.
 For registration in `ccl.yaml` and wiring through the unified bench,
 look at the existing `ring_allreduce_tcm/_hbm/_sram` entries plus
 [`benches/ccl_allreduce.py`](../benches/ccl_allreduce.py). Mock unit
 tests live in
 [`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
 and follow the `kernel_args=(n_elem, world_size)` convention.
 ---
 ## 3. `neighbors()` override — custom topology
 Most algorithms are happy with the builtin topologies (`ring_1d`,
 `mesh_2d`, `tree_binary`, `ring_1d_unidir`, `none`). If you want to
 modify a builtin or define a brand-new connectivity pattern, define a
 `neighbors()` function in your algorithm module.
 ### Signature
 ```python
 def neighbors(
    rank: int, world_size: int, neighbor_map: dict[str, int],
 ) -> dict[str, int] | None:
    """Override the neighbor map produced by the builtin topology.
    Args:
        neighbor_map: the mapping the ccl.yaml ``topology`` field built.
                      For ring_1d this is {"E": (rank+1)%ws, "W": (rank-1)%ws}.
                      The dict is mutable — modify in place if you want.
    Returns:
        dict: the new neighbor map (or the modified-in-place dict).
        None: do not override; use neighbor_map as-is.
    """
    return None
 ```
 ### Pattern A: tweak a builtin
 ```python
 def neighbors(rank, world_size, neighbor_map):
    # Only even ranks use W; remove W from odd ranks.
    if rank % 2 == 1:
        neighbor_map.pop("W", None)
    return neighbor_map
 ```
 ### Pattern B: replace entirely (skip-connection ring)
 ```python
 def neighbors(rank, world_size, neighbor_map):
    return {"E": (rank + 2) % world_size}
 ```
 ### Pattern C: keep builtin
 Either omit `neighbors` entirely or return None:
 ```python
 def neighbors(rank, world_size, neighbor_map):
    return None  # explicit "use the builtin"
 ```
 ---
 ## 4. PE kernel API reference (ADR-0023 D4)
 ### IPCQ API
 | API | Description | Blocking? |
 |-----|-------------|-----------|
 | `tl.send(dir, src=TensorHandle)` | Send to a peer in the given direction. | Yes (waits if peer slots are full) |
 | `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | Same, keyword form. | Yes |
 | `tl.recv(dir, shape=..., dtype=...)` | Blocking recv from one direction. | Yes |
 | `tl.recv(shape=..., dtype=...)` | Round-robin recv across all four directions. | Yes |
 | `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | Non-blocking recv. | No |
 | `tl.wait(future)` | Wait for a non-blocking recv future → returns the resolved TensorHandle. | Yes |
 ### Existing TL API (ADR-0020/0022, unchanged)
 | API | Description |
 |-----|-------------|
 | `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; in greenlet mode `.data` carries the ndarray. |
 | `tl.store(addr, handle)` | DMA write — when `handle.data` is set the runner propagates it to MemoryStore. |
 | `tl.composite(op, ...)` | Submit a GEMM/Math composite (non-blocking). |
 | `tl.program_id(axis=0)` | Local PE id within the cube. |
 | `tl.program_id(axis=1)` | Cube id (ADR-0022). |
 | `tl.num_programs(axis=0/1)` | Topology slot counts (NOT the participating-rank count). |
 ### Two recv modes
 The default is `return_slot` (zero-copy): the IPCQ slot address is
 returned in `handle.addr`. To force a copy into a custom destination,
 pass `dst_addr` + `dst_space`:
 ```python
 recv = tl.recv(
    dir="W", shape=(8,), dtype="f16",
    dst_addr=my_scratch_addr,
    dst_space="hbm",
 )
 # After this call recv.addr == my_scratch_addr (copy_to_dst mode).
 ```
 ---
 ## 5. Helpers (`kernbench.ccl.helpers`)
 Convenience helpers to keep algorithm code short:
 ```python
 from kernbench.ccl.helpers import chunked, ring_step, tree_step
 ```
 ### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
 Split a tile of `n_elem` elements into `n_chunks` equal-size views.
 Each `Chunk` has `addr`, `n_elem`, `nbytes` fields.
 ```python
 chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
 # chunks[0..3] are 16-element views with consecutive addresses.
 ```
 ### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
 Per-step chunk indices for a ring algorithm (reduce-scatter / all-gather):
 ```python
 for step in range(world_size - 1):
    send_idx, recv_idx = ring_step(rank, step, world_size)
    tl.send(
        dir="E", src_addr=chunks[send_idx].addr,
        nbytes=chunks[send_idx].nbytes,
        shape=(chunks[send_idx].n_elem,), dtype="f16",
    )
    recv = tl.recv(
        dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16",
    )
    # accumulate ...
 ```
 ### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
 Parent / children rank ids for a binary tree:
 ```python
 info = tree_step(rank, world_size)
 if info["parent"] is None:
    print(f"rank {rank} is the root")
 for child in info["children"]:
    ...
 ```
 ---
 ## 6. Unit testing — Mock runtime
 `kernbench.ccl.testing.run_kernel_in_mock` runs an algorithm without
 SimPy for fast feedback.
 ### Basic usage
 ```python
 import numpy as np
 from kernbench.ccl.testing import run_kernel_in_mock
 from kernbench.ccl.algorithms.my_algo import kernel
 def test_my_algo():
    n_elem = 16
    inputs = [np.arange(n_elem, dtype=np.float16) + r for r in range(4)]
    expected = sum(inputs)
    outputs = run_kernel_in_mock(
        kernel_fn=kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, 4),  # positional args after t_ptr
    )
    for r in range(4):
        assert np.allclose(outputs[r], expected, rtol=1e-3)
 ```
 ### Behavior
 - All ranks run their kernels concurrently as cooperative greenlets.
 - `tl.send` / `tl.recv` are serviced by in-memory FIFOs (no DMA, no
  latency).
 - Each rank's last `store` is what the helper returns as a numpy array.
 ### Limitations
 - No latency or performance numbers (it is not a simulation).
 - No PE_DMA, fabric, or BW model.
 - Correctness only.
 - One cube assumed: `program_id(axis=1)` is always 0.
 ---
 ## 7. Debugging
 ### CCL trace
 ```bash
 KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
    --bench ccl_allreduce --verify-data
 ```
 Per-rank send/recv events appear on stdout:
 ```
 [ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
 [ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
 ```
 ### Pointer dump
 `kernbench.ccl.diagnostics.pointer_dump(engine)` returns a multi-line
 dump of every PE_IPCQ ring buffer's `my_head`, `my_tail`,
 `peer_head_cache`, `peer_tail_cache`. When something hangs, this shows
 which rank is stuck and on what.
 ### Deadlock detection
 When the SimPy schedule empties because of unmatched send/recv pairs,
 the engine raises `IpcqDeadlock` and embeds the pointer dump in the
 message (ADR-0023 D14 F3). Wait-for-graph visualization is future
 work.
 ---
 ## 8. Common mistakes
 ### 1. Using a direction that wasn't installed
 `topology: ring_1d` only installs E and W. Trying:
 ```python
 tl.send(dir="N", ...)   # → IpcqInvalidDirection
 ```
 Fix: switch to `topology: mesh_2d`, or add N/S in a `neighbors()` override.
 ### 2. `send` without a matching `recv`
 ```python
 def kernel(..., tl):
    for _ in range(100):
        tl.send(dir="E", ...)
    # The peer never recvs → ring buffer fills → backpressure → deadlock.
 ```
 Fix: every `send` needs a matching `recv` on the receiver side.
 Otherwise `IpcqDeadlock` is raised.
 ### 3. dtype/shape mismatch
 By default mismatches are not validated. The author is responsible for
 consistency. Set `strict_validation: true` on a PE_IPCQ node's attrs to
 enable D14 F2 strict mode and catch them immediately.
 ### 4. Assuming round-robin recv fairness
 `tl.recv()` (no direction) returns the first slot to arrive in
 round-robin order, but **arrival order is not predictable**. If your
 algorithm depends on a particular direction, name it explicitly:
 `tl.recv(dir="N", ...)`.
 ### 5. Confusing `num_programs` with the CCL group size
 `tl.num_programs(axis=0/1)` reports topology slot counts, not the
 number of ranks participating in the collective. The host bench knows
 `world_size` and must pass it through as a kernel argument.
 ### 6. Overwriting the send source before it's actually sent
 PE_DMA snapshots the source data into the IpcqDmaToken at send time,
 preserving in-flight semantics. Even so, the safest pattern is to call
 `tl.send` first and only overwrite the data at the source address
 afterwards. If you overwrite it before `tl.send` has made it into the
 PE_DMA queue, the snapshot will pick up the wrong data.
 ---
 ## 9. Next steps
 - Try other topologies (`mesh_2d`, `tree_binary`).
 - Faster algorithms (recursive halving / doubling).
 - Compare `buffer_kind` (tcm/hbm/sram) and `backpressure` (poll/sleep)
  modes for latency.
 - Larger-scale validation through the unified `ccl_allreduce` bench
  with different `ccl.yaml` overlays.
 If you add a new algorithm or pattern, please send a PR.
 ---
 ## References
 - [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective design.
 - [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1).
 - [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution.
 - [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor.
 Existing algorithm examples:
 - [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — simplest send/recv
 - [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
 - [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
 - [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
@@ -0,0 +1,537 @@
 # CCL Algorithm Author Guide
 이 문서는 kernbench에서 CCL (Collective Communication Library) 알고리즘을
 직접 작성하는 사람을 위한 step-by-step 가이드이다. 시스템 내부 설계와
 컴포넌트 구조는 [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md)에 있다.
 본 가이드는 알고리즘 작성자가 **자신이 만져야 할 곳**과 **만지지 않아도 될 곳**을
 명확히 분리하고, 가장 짧은 경로로 첫 알고리즘을 동작시키는 것을 목표로 한다.
 ---
 ## 0. 5분 요약
 | 만지는 것 | 위치 |
 |----------|------|
 | 알고리즘 모듈 (kernel + 선택적 neighbors) | `src/kernbench/ccl/algorithms/<algo>.py` |
 | 알고리즘 등록 | `ccl.yaml` |
 | 호스트 bench (PE 수, 메모리 init, launch, 검증) | `benches/<your_bench>.py` |
 | (선택) 단위 테스트 | `tests/test_<algo>.py` |
 | 만지지 않는 것 | 위치 |
 |---------------|------|
 | TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
 | 프레임워크 (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
 | PE_IPCQ / PE_DMA 컴포넌트 | `src/kernbench/components/builtin/` |
 | backend 구현 (install_ipcq) | `src/kernbench/runtime_api/distributed.py` 및 `kernbench/ccl/install.py` |
 흐름:
 1. 알고리즘 모듈에 `kernel` 작성
 2. `ccl.yaml`에 entry 등록
 3. 호스트 bench에서 `install_ipcq` + `launch`
 4. (선택) mock runtime으로 단위 테스트 (수 ms)
 5. `kernbench run --bench <name> --verify-data`로 SimPy 검증
 ---
 ## 1. Hello World — 가장 단순한 send/recv
 각 PE가 자기 데이터를 E 방향 이웃에 한 번 보내고, W 방향에서 한 번 받는
 가장 단순한 알고리즘이다. 실제 동작 코드는
 [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py)
 에 있다.
 ### Step 1: kernel 작성
 새 파일 `src/kernbench/ccl/algorithms/hello_send.py`:
 ```python
 """Hello world: 자기 데이터를 다음 rank에 보내고 이전 rank에서 받기."""
 def kernel(t_ptr, n_elem, tl):
    # 글로벌 rank는 program_id(0/1)에서 계산 (ADR-0022)
    local_pe = tl.program_id(axis=0)
    cube_id = tl.program_id(axis=1)
    pes_per_cube = tl.num_programs(axis=0)
    rank = cube_id * pes_per_cube + local_pe
    nbytes = n_elem * 2  # f16
    pe_addr = t_ptr + rank * nbytes
    # 자기 슬라이스를 로드해서 E로 보낸다.
    src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
    tl.send(dir="E", src=src)
    # W 방향에서 받아서 그대로 자기 슬라이스에 store한다.
    recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
    tl.store(pe_addr, recv)
 ```
 핵심 포인트:
 - **글로벌 rank는 `program_id(axis=0)` + `program_id(axis=1)`에서 계산.** TL에는
  `tl.rank` / `tl.world_size` 같은 약속되지 않은 확장이 없다. 호스트가
  `world_size` 같은 알고리즘 파라미터가 필요하면 `torch.launch`의 일반 인자로
  전달한다.
 - **`tl.send`는 `TensorHandle`을 받는다.** 핸들의 `addr`/`space`/`shape`/`dtype`/`nbytes`를
  PE_IPCQ가 읽어 PE_DMA에 IpcqDmaToken을 발행한다.
 - **`tl.recv`는 `shape`와 `dtype`이 필수.** 반환된 TensorHandle은 IPCQ ring slot을
  가리키며, `tl.store(pe_addr, recv)`처럼 dst 핸들로 그대로 사용할 수 있다.
  Phase 2 dma_write replay가 (slot, hbm) 복사를 수행하므로 numpy `.data`를
  직접 만질 필요가 없다.
 ### Step 2: ccl.yaml 등록
 `ccl.yaml`의 `algorithms` 섹션에 entry를 추가한다. (defaults.algorithm은 호스트
 bench가 `install_ipcq(algorithm=...)`로 명시 전달해도 되므로 꼭 바꿀 필요는 없다.)
 ```yaml
 algorithms:
  hello_send:
    module: kernbench.ccl.algorithms.hello_send
    topology: ring_1d
    buffer_kind: tcm
 ```
 ### Step 3: 호스트 bench 작성
 새 파일 `benches/ccl_hello.py`:
 ```python
 """Hello-world ring rotation bench (각 PE가 W 이웃의 데이터를 1번 받음)."""
 import numpy as np
 from kernbench.ccl.algorithms import hello_send
 from kernbench.policy.placement.dp import DPPolicy
 ALGORITHM = "hello_send"
 N_ELEM = 8
 WORLD_SIZE = 8
 def run(torch):
    plan = torch.install_ipcq(algorithm=ALGORITHM)
    a = torch.zeros(
        (1, WORLD_SIZE * N_ELEM), dtype="f16",
        dp=DPPolicy(
            cube="replicate", pe="column_wise",
            num_cubes=1,
        ),
        name="hello_in",
    )
    store = torch.engine.memory_store
    base = a._handle.va_base or a._handle.shards[0].pa
    nbytes = N_ELEM * 2
    for r in range(WORLD_SIZE):
        store.write("hbm", base + r * nbytes,
                    np.full((N_ELEM,), float(r + 1), dtype=np.float16))
    torch.launch(ALGORITHM, hello_send.kernel, a, N_ELEM)
    # rank r은 rank (r-1)%ws의 데이터를 가져야 한다.
    for r, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
        result = store.read("hbm", base + r * nbytes, shape=(N_ELEM,), dtype="f16")
        prev = float(((r - 1) % WORLD_SIZE) + 1)
        ok = np.allclose(result, prev)
        print(f"  [{'OK ' if ok else 'FAIL'}] rank {r} got {float(result.mean()):.1f}, "
              f"expected {prev:.1f}")
 ```
 ### Step 4: 단위 테스트 (선택, 강력 추천)
 `tests/test_hello_send.py`:
 ```python
 import numpy as np
 from kernbench.ccl.algorithms.hello_send import kernel
 from kernbench.ccl.testing import run_kernel_in_mock
 def test_hello_send_4_ranks():
    n_elem = 8
    inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)]
    outputs = run_kernel_in_mock(
        kernel_fn=kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem,),
    )
    # rank r은 rank (r-1) % 4의 데이터를 받아야 함
    for r in range(4):
        assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
 ```
 `run_kernel_in_mock`는 SimPy 없이 순수 Python으로 모든 rank를 동시 실행하므로
 **ms 단위로 끝난다**. 알고리즘 logic 정합성만 검증.
 ### Step 5: 시뮬 검증
 ```bash
 kernbench run --topology topology.yaml --bench ccl_hello --verify-data
 ```
 Phase 1에서 SimPy 시뮬레이션 + MemoryStore 데이터 이동, Phase 2에서 op_log
 정합성 replay. 호스트 bench의 `print` 검증이 모든 rank에 대해 OK여야 한다.
 ---
 ## 2. Ring All-Reduce — 두 번째 알고리즘
 조금 더 복잡한 예제. Ring all-reduce는 N-1 라운드 동안 각 PE가 자기 데이터를
 E로 보내고 W에서 받아 누적한다. 최종적으로 모든 PE가 글로벌 sum을 갖는다.
 실제 동작 코드는 [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py)
 참조. 핵심 흐름:
 ```python
 """Ring all-reduce."""
 def kernel(t_ptr, n_elem, world_size, tl):
    # rank
    local_pe = tl.program_id(axis=0)
    cube_id = tl.program_id(axis=1)
    pes_per_cube = tl.num_programs(axis=0)
    rank = cube_id * pes_per_cube + local_pe
    nbytes = n_elem * 2
    pe_addr = t_ptr + rank * nbytes
    # HBM의 자기 슬라이스를 가리키는 TensorHandle. greenlet 모드에선 .data가
    # 채워지지만 커널은 .data를 직접 만질 필요가 없다.
    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
    current = acc  # 첫 라운드 send 출처
    for _step in range(world_size - 1):
        tl.send(dir="E", src=current)
        recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
        # TensorHandle 연산자 오버로드 → MathCmd → PE_MATH 디스패치.
        # Phase 1은 타이밍만, Phase 2 DataExecutor가 실제 numpy 누적을 수행한다.
        acc = acc + recv
        current = recv  # 다음 라운드는 직전에 받은 슬롯을 다시 forward
    # 최종 누적값을 자기 슬라이스에 store. 출처는 acc(=PE-local scratch addr)
    # 이고 dst는 HBM. op_log dma_write가 (scratch, hbm) 복사 정보를 기록하므로
    # Phase 2가 검증 시점에 HBM[pe_addr]에 정답을 채워준다.
    tl.store(pe_addr, acc)
 ```
 네 가지 포인트:
 1. **누적은 TensorHandle 연산자**: `acc + recv`는 `MathCmd`를 emit하고
   PE_MATH로 디스패치된다 — 실제 하드웨어 경로를 거치므로 latency 모델이
   정확하다. ADR-0020 D3대로 Phase 1은 타이밍만 시뮬레이션하고, Phase 2
   `DataExecutor`가 op_log를 재실행하면서 numpy 누적을 수행한다.
 2. **`current = recv`로 forward**: 매 라운드의 send 출처를 직전에 받은 슬롯
   핸들로 갱신해야 같은 데이터가 ring을 순회하면서 누적이 한 번씩 일어난다.
   `current = acc`로 두면 누적값이 다시 송출되어 결과가 부풀려진다.
 3. **`tl.store(pe_addr, acc)` 한 번이면 끝**: 중간에 store→reload 패턴은
   금지다. acc는 PE-local scratch에 살고, op_log가 (src=scratch, dst=hbm)
   메타데이터를 기록한다. Phase 2가 math를 먼저 실행해 scratch를 채운 뒤
   dma_write 스냅샷으로 HBM에 복사한다.
 4. **`world_size`는 호스트가 명시 전달**: TL은 topology slot 수만 안다 (예:
   `num_programs(axis=0)`은 cube당 PE 수). 실제 참여하는 CCL group 크기는 bench가
   알고 호스트→kernel 인자로 넘긴다.
 `ccl.yaml` 등록 + 호스트 bench는 [`benches/ccl_allreduce_tcm.py`](../benches/ccl_allreduce_tcm.py)
 참조. mock 단위 테스트는 [`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
 를 그대로 따라하면 된다 (`kernel_args=(n_elem, world_size)` 인자 형태).
 ---
 ## 3. neighbors() override — Custom topology
 대부분의 알고리즘은 builtin topology(`ring_1d`, `mesh_2d`, `tree_binary`,
 `ring_1d_unidir`, `none`)로 충분하다. builtin을 변형하거나 새로 만들고 싶으면
 알고리즘 모듈에 `neighbors()`를 정의한다.
 ### 시그니처
 ```python
 def neighbors(rank: int, world_size: int, neighbor_map: dict[str, int]) -> dict[str, int] | None:
    """builtin topology가 만든 neighbor_map을 override.
    Args:
        neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑.
                      예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws}
                      mutable dict — 직접 수정 가능.
    Returns:
        dict: neighbor_map을 override한 결과 (또는 수정한 그 dict)
        None: override 안 함, neighbor_map 그대로 사용
    """
    return None
 ```
 ### Pattern A: builtin을 base로 일부만 수정
 ```python
 def neighbors(rank, world_size, neighbor_map):
    # 짝수 rank만 W 방향 사용 (홀수 rank는 W 제거)
    if rank % 2 == 1:
        neighbor_map.pop("W", None)
    return neighbor_map
 ```
 ### Pattern B: 완전히 새로 작성 (skip-connection ring)
 ```python
 def neighbors(rank, world_size, neighbor_map):
    # neighbor_map은 무시하고 새로 작성
    return {"E": (rank + 2) % world_size}
 ```
 ### Pattern C: builtin 사용, override 없음
 `neighbors()` 함수를 정의하지 않거나 None을 반환:
 ```python
 def neighbors(rank, world_size, neighbor_map):
    return None  # 명시적으로 builtin 사용
 ```
 ---
 ## 4. PE 커널 API 레퍼런스 (ADR-0023 D4)
 ### IPCQ API
 | API | 설명 | Blocking? |
 |-----|------|-----------|
 | `tl.send(dir, src=TensorHandle)` | direction으로 데이터 send | Yes (peer slot full 시 wait) |
 | `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | 동일, keyword 형태 | Yes |
 | `tl.recv(dir, shape=..., dtype=...)` | 특정 방향에서 blocking recv | Yes |
 | `tl.recv(shape=..., dtype=...)` | 4방향 round-robin recv (방향 미지정) | Yes |
 | `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | non-blocking recv | No |
 | `tl.wait(future)` | non-blocking future 완료 대기 → TensorHandle | Yes |
 ### 기존 TL API (ADR-0020/0022, 그대로 사용 가능)
 | API | 설명 |
 |-----|------|
 | `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; greenlet 모드에서 `.data`에 ndarray |
 | `tl.store(addr, handle)` | DMA write — handle.data가 있으면 MemoryStore에 propagate |
 | `tl.composite(op, ...)` | GEMM/Math compute 비동기 submit |
 | `tl.program_id(axis=0)` | cube 내 local PE id |
 | `tl.program_id(axis=1)` | cube id (ADR-0022) |
 | `tl.num_programs(axis=0/1)` | topology 슬롯 수 (참여 ranks 수가 아님) |
 ### `recv` 두 가지 모드
 기본은 `return_slot` (zero-copy): IPCQ slot 주소가 그대로 handle.addr에 들어온다.
 slot 데이터를 별도 위치로 복사하고 싶으면 `dst_addr` + `dst_space`를 명시:
 ```python
 recv = tl.recv(
    dir="W", shape=(8,), dtype="f16",
    dst_addr=my_scratch_addr,
    dst_space="hbm",
 )
 # 이제 recv.addr == my_scratch_addr (copy_to_dst 모드)
 ```
 ---
 ## 5. Helpers (`kernbench.ccl.helpers`)
 알고리즘 코드를 짧게 유지하기 위한 헬퍼들:
 ```python
 from kernbench.ccl.helpers import chunked, ring_step, tree_step
 ```
 ### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
 총 `n_elem` 개의 element를 `n_chunks` 등분한 view 리스트를 반환. 각 `Chunk`는
 `addr`, `n_elem`, `nbytes` 필드를 가진다.
 ```python
 chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
 # chunks[0..3] 각각 16 element view, addr이 연속
 ```
 ### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
 Ring algorithm의 step별 chunk 인덱스 (reduce-scatter / all-gather):
 ```python
 for step in range(world_size - 1):
    send_idx, recv_idx = ring_step(rank, step, world_size)
    tl.send(dir="E", src_addr=chunks[send_idx].addr,
            nbytes=chunks[send_idx].nbytes,
            shape=(chunks[send_idx].n_elem,), dtype="f16")
    recv = tl.recv(dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16")
    # accumulate ...
 ```
 ### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
 Binary tree의 parent/children rank:
 ```python
 info = tree_step(rank, world_size)
 if info["parent"] is None:
    print(f"rank {rank} is the root")
 for child in info["children"]:
    ...
 ```
 ---
 ## 6. 단위 테스트 — Mock Runtime
 `kernbench.ccl.testing.run_kernel_in_mock`은 SimPy를 거치지 않고 알고리즘을
 빠르게 검증할 수 있다.
 ### 기본 사용법
 ```python
 from kernbench.ccl.testing import run_kernel_in_mock
 from kernbench.ccl.algorithms.my_algo import kernel
 import numpy as np
 def test_my_algo():
    n_elem = 16
    # NumPy reads the type string "f16" as a 16-byte float, not half
    # precision, so request float16 explicitly.
    inputs = [np.arange(n_elem, dtype=np.float16) + r for r in range(4)]
    expected = sum(inputs)
    outputs = run_kernel_in_mock(
        kernel_fn=kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, 4),  # extra positional kernel args (after t_ptr)
    )
    for r in range(4):
        assert np.allclose(outputs[r], expected, rtol=1e-3)
 ```
 ### 동작
 - 4개 rank의 kernel을 greenlet으로 동시 실행
 - `tl.send/recv`를 in-memory FIFO로 즉시 처리 (DMA, latency 무시)
 - 각 rank가 마지막에 store한 데이터를 ndarray로 반환
 ### 한계
 - latency / 성능 측정 불가 (시뮬레이션이 아님)
 - PE_DMA, fabric, BW 모델 안 함
 - 정합성 검증만 가능
 - 한 cube 안에서 동작하는 가정 — `program_id(axis=1)`은 항상 0
 ---
 ## 7. 디버깅
 ### CCL trace
 ```bash
 KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
    --bench ccl_allreduce_tcm --verify-data
 ```
 각 rank의 send/recv 시점이 stdout에 출력된다:
 ```
 [ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
 [ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
 ...
 ```
 ### Pointer dump
 `kernbench.ccl.diagnostics.pointer_dump(engine)`는 모든 PE_IPCQ의 ring buffer
 상태(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`)를 multi-line
 문자열로 반환한다. hang이 발생하면 어느 rank가 어떤 상태에서 막혔는지 한눈에
 보인다.
 ### Deadlock detection
 매칭되지 않는 send/recv 등으로 SimPy 스케줄이 비면 engine이 `IpcqDeadlock`을
 던지며 pointer dump를 메시지에 포함시킨다 (ADR-0023 D14 F3). 별도 wait-for graph
 시각화는 미래 작업.
 ---
 ## 8. 흔한 실수
 ### 1. install 안 된 direction 사용
 ccl.yaml의 `topology: ring_1d`는 E/W만 install한다. N/S 사용 시:
 ```python
 tl.send(dir="N", ...)   # → IpcqInvalidDirection 예외
 ```
 해결: `topology: mesh_2d`로 바꾸거나, `neighbors()` override로 N/S 추가.
 ### 2. send만 호출하고 recv 없음
 ```python
 def kernel(..., tl):
    for _ in range(100):
        tl.send(dir="E", ...)
    # peer 측 recv 없음 → ring buffer 가득 차면 backpressure → deadlock
 ```
 해결: 모든 send에 짝이 되는 recv가 있어야 한다. 안 그러면 `IpcqDeadlock`이
 발생한다.
 ### 3. dtype/shape 불일치
 기본 모드에서는 dtype/shape mismatch를 검증하지 않는다. 작성자가 직접 보장하거나,
 PE_IPCQ 노드 attrs에 `strict_validation: true`를 설정해 D14 F2 strict 모드로
 mismatch를 즉시 잡을 수 있다.
 ### 4. round-robin recv의 fairness 가정
 `tl.recv()` (방향 미지정)는 round-robin으로 가져오지만, 도착한 첫 슬롯을 반환한다.
 **도착 순서를 알 수 없으므로** 알고리즘이 도착 방향에 의존하면 안 된다.
 필요하면 `tl.recv(dir="N", ...)`처럼 명시.
 ### 5. CCL 그룹 크기 가정
 `tl.num_programs(axis=0/1)`은 토폴로지 슬롯 개수이지 CCL group 크기가 아니다.
 참여하는 rank 수(`world_size`)는 호스트 bench가 알고 있고, kernel 인자로 명시
 전달해야 한다.
 ### 6. send-source 메모리를 실제 송신 전에 덮어씀
 PE_DMA가 송신 시점에 src 데이터를 토큰에 스냅샷하므로 in-flight 데이터의 의미는
 보존된다. 따라서 하나의 PE 안에서 같은 주소를 여러 step에 걸쳐 갱신하더라도,
 send가 이루어진 뒤 다른 step에서 같은 주소에 store하는 것은 안전하다 (token snapshot 덕분).
 하지만 `tl.send`가 PE_DMA 큐에 enqueue되기 전에 주소를 덮어쓰면 잘못된 데이터가
 스냅샷된다 — `tl.send`를 먼저, 메모리 변경을 나중에 하는 것을 권장한다.
 ---
 ## 9. 다음 단계
 - `mesh_2d` / `tree_binary` 같은 다른 topology 활용
 - recursive halving/doubling 등 더 빠른 알고리즘
 - `buffer_kind` (tcm/hbm/sram) / `backpressure` (poll/sleep) 모드별 latency 비교
 - `ccl_ring_allreduce_multicube.py`, `ccl_ring_allreduce_multisip.py`처럼 큰
  scale의 ring 검증
 새 알고리즘이나 패턴을 추가했다면 PR로 기여해주세요.
 ---
 ## 참고
 - [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective 설계
 - [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1)
 - [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution
 - [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor
 기존 알고리즘 예제:
 - [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — 가장 단순한 send/recv
 - [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
 - [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
 - [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "kernbench"
 version = "0.1.0"
 requires-python = ">=3.10"
-dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12"]
+dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12", "numpy>=1.24", "greenlet>=3.0"]
 [project.scripts]
 kernbench = "kernbench.cli.main:main"
@@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"]
 [project.optional-dependencies]
 dev = [
  "pytest>=7",
  "pytest-xdist>=3.0",
  "ruff>=0.4.0",
 ]
@@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"]
 ignore = ["F401"]
 [tool.pytest.ini_options]
-addopts = ["--disable-warnings"]
+addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"]
 markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]
@@ -0,0 +1,9 @@
 """CCL (Collective Communication Library) framework for kernbench (ADR-0023).
 This package provides:
    - topologies: builtin neighbor topology generators (ring/mesh/tree)
    - helpers:    utilities for algorithm authors (chunked, ring_step, ...)
    - testing:    mock CCL runtime for fast unit tests of algorithm kernels
 See docs/adr/ADR-0023-ipcq-pe-collective.md and docs/ccl-author-guide.md.
 """
@@ -0,0 +1,189 @@
 """Intercube all-reduce kernel (pe0-only, same-lane across cubes).
 Reduces across the 4×4 cube mesh within each SIP, then exchanges
 between SIPs using the configured SIP topology, and broadcasts back.
 Supported SIP topologies (selected via ``sip_topo_kind``):
  0 — ring_1d:       global_E/global_W ring, n_sips-1 rounds
  1 — torus_2d:      row ring (global_E/W) + col ring (global_S/N)
  2 — mesh_2d_no_wrap: row chain reduce+broadcast + col chain reduce+broadcast
 IPCQ wiring is handled by ``configure_sfr_intercube_multisip``.
 """
 from __future__ import annotations
 # Integer codes passed to the kernel as ``sip_topo_kind``.
 SIP_TOPO_RING = 0
 SIP_TOPO_TORUS = 1
 SIP_TOPO_MESH = 2
 # Topology name -> ``sip_topo_kind`` code.
 # Note that "mesh_2d" is mapped to the torus kind; only "mesh_2d_no_wrap"
 # selects the chain-based mesh exchange.
 # NOTE(review): presumably the builtin "mesh_2d" topology has wrap-around
 # links -- confirm against kernbench.ccl.topologies.
 TOPO_NAME_TO_KIND = {
    "ring_1d": SIP_TOPO_RING,
    "torus_2d": SIP_TOPO_TORUS,
    "mesh_2d": SIP_TOPO_TORUS,
    "mesh_2d_no_wrap": SIP_TOPO_MESH,
 }
 def kernel_args(
    world_size: int,
    n_elem: int,
    cube_w: int = 4,
    cube_h: int = 4,
 ) -> tuple:
    """Build the leading positional kernel arguments for this algorithm.

    Args:
        world_size: number of participating ranks; it is placed in the slot
            the kernel reads as ``n_sips``.
        n_elem: f16 elements per cube tile.
        cube_w: cube mesh width (columns). Defaults to 4.
        cube_h: cube mesh height (rows). Defaults to 4.

    Returns:
        ``(n_elem, cube_w, cube_h, world_size)`` -- the first four kernel
        parameters that follow ``t_ptr``.
    """
    return (n_elem, cube_w, cube_h, world_size)
 def _inter_sip_ring(acc, n_sips, n_elem, tl):
    current = acc
    for _ in range(n_sips - 1):
        tl.send(dir="global_E", src=current)
        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
        acc = acc + recv
        current = recv
    return acc
 def _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
    # Row ring (global_E / global_W)
    current = acc
    for _ in range(sip_topo_w - 1):
        tl.send(dir="global_E", src=current)
        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
        acc = acc + recv
        current = recv
    # Col ring (global_S / global_N)
    current = acc
    for _ in range(sip_topo_h - 1):
        tl.send(dir="global_S", src=current)
        recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
        acc = acc + recv
        current = recv
    return acc
 def _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
    sip_row = sip_rank // sip_topo_w
    sip_col = sip_rank % sip_topo_w
    # Row reduce W → E
    if sip_col == 0:
        tl.send(dir="global_E", src=acc)
    elif sip_col < sip_topo_w - 1:
        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
        acc = acc + recv
        tl.send(dir="global_E", src=acc)
    else:
        recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
        acc = acc + recv
    # Row broadcast E → W
    if sip_col == sip_topo_w - 1:
        tl.send(dir="global_W", src=acc)
    elif sip_col > 0:
        acc = tl.recv(dir="global_E", shape=(n_elem,), dtype="f16")
        tl.send(dir="global_W", src=acc)
    else:
        acc = tl.recv(dir="global_E", shape=(n_elem,), dtype="f16")
    # Col reduce N → S
    if sip_row == 0:
        tl.send(dir="global_S", src=acc)
    elif sip_row < sip_topo_h - 1:
        recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
        acc = acc + recv
        tl.send(dir="global_S", src=acc)
    else:
        recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
        acc = acc + recv
    # Col broadcast S → N
    if sip_row == sip_topo_h - 1:
        tl.send(dir="global_N", src=acc)
    elif sip_row > 0:
        acc = tl.recv(dir="global_S", shape=(n_elem,), dtype="f16")
        tl.send(dir="global_N", src=acc)
    else:
        acc = tl.recv(dir="global_S", shape=(n_elem,), dtype="f16")
    return acc
 def allreduce_intercube_multidevice(
    t_ptr, n_elem, cube_w, cube_h, n_sips, sip_rank,
    sip_topo_kind, sip_topo_w, sip_topo_h, tl,
 ):
    """Intercube all-reduce (pe0-only) with configurable SIP topology.

    Five phases run on each cube of a ``cube_w`` x ``cube_h`` mesh:

    1. row reduce W -> E (partial sums end in the rightmost column),
    2. column reduce N -> S on the rightmost column (the total ends in the
       bottom-right "root" cube),
    3. inter-SIP exchange on the root cube only (skipped when
       ``n_sips <= 1``),
    4. column broadcast S -> N on the rightmost column,
    5. row broadcast E -> W.

    A mesh dimension of size 1 has no neighbour along that axis, so the
    matching reduce/broadcast phases are skipped instead of issuing a send
    that nothing receives.

    Args:
        t_ptr: VA base of the row-wise-sharded tensor on this SIP.
        n_elem: f16 elements per cube tile.
        cube_w: cube mesh width (columns).
        cube_h: cube mesh height (rows).
        n_sips: number of SIPs.
        sip_rank: this SIP's rank (0-based).
        sip_topo_kind: 0=ring, 1=torus_2d, 2=mesh_2d (no wrap-around).
        sip_topo_w: SIP mesh width (for 2D topologies, 0 for ring).
        sip_topo_h: SIP mesh height (for 2D topologies, 0 for ring).
        tl: TLContext (auto-injected).

    Raises:
        ValueError: on the root cube, if ``n_sips > 1`` and
            ``sip_topo_kind`` is not one of the supported kinds.
    """
    cube_id = tl.program_id(axis=1)
    row = cube_id // cube_w
    col = cube_id % cube_w
    last_row = cube_h - 1
    last_col = cube_w - 1
    has_row_chain = cube_w > 1
    has_col_chain = cube_h > 1

    nbytes = n_elem * 2  # f16 = 2 bytes per element
    pe_addr = t_ptr + cube_id * nbytes
    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")

    # ── Phase 1: row reduce W → E ──
    if has_row_chain:
        if col == 0:
            tl.send(dir="E", src=acc)
        elif col < last_col:
            recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
            acc = acc + recv
            tl.send(dir="E", src=acc)
        else:
            recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
            acc = acc + recv

    # ── Phase 2: col reduce N → S on rightmost column ──
    if has_col_chain and col == last_col:
        if row == 0:
            tl.send(dir="S", src=acc)
        elif row < last_row:
            recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
            acc = acc + recv
            tl.send(dir="S", src=acc)
        else:
            recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
            acc = acc + recv

    # ── Phase 3: inter-SIP exchange on root cube ──
    root_cube = last_row * cube_w + last_col
    if cube_id == root_cube and n_sips > 1:
        if sip_topo_kind == SIP_TOPO_RING:
            acc = _inter_sip_ring(acc, n_sips, n_elem, tl)
        elif sip_topo_kind == SIP_TOPO_TORUS:
            acc = _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl)
        elif sip_topo_kind == SIP_TOPO_MESH:
            acc = _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl)
        else:
            # Skipping the exchange silently would leave every SIP holding
            # only its local sum, i.e. a wrong all-reduce result.
            raise ValueError(f"Unknown sip_topo_kind: {sip_topo_kind!r}")

    # ── Phase 4: col broadcast S → N on rightmost column ──
    if has_col_chain and col == last_col:
        if row == last_row:
            tl.send(dir="N", src=acc)
        elif row > 0:
            acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
            tl.send(dir="N", src=acc)
        else:
            acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16")

    # ── Phase 5: row broadcast E → W ──
    if has_row_chain:
        if col == last_col:
            tl.send(dir="W", src=acc)
        elif col > 0:
            acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16")
            tl.send(dir="W", src=acc)
        else:
            acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16")

    tl.store(pe_addr, acc)
 # Expose the all-reduce under the conventional ``kernel`` name that
 # algorithm modules provide.
 kernel = allreduce_intercube_multidevice
@@ -0,0 +1,127 @@
 """CCL diagnostics: trace + pointer dump + deadlock (ADR-0023 D14).
 Trace
 -----
 Set ``KERNBENCH_CCL_TRACE`` to ``1``, ``true``, ``yes`` or ``on``
 (case-insensitive) to enable per-event logging of CCL send/recv/credit-return
 events to stdout. Off by default.
 Pointer dump
 ------------
 ``pointer_dump(engine)`` returns a multi-line string showing every PE_IPCQ's
 ring buffer state (my_head, my_tail, peer_head_cache, peer_tail_cache).
 Useful for diagnosing hangs.
 Deadlock
 --------
 ``IpcqDeadlock`` is raised by the engine when SimPy's schedule empties
 while a request is still pending — typical of unmatched send/recv pairs.
 The exception message includes the pointer dump.
 """
 from __future__ import annotations
 import os
 from typing import Any
 class IpcqDeadlock(RuntimeError):
    """Signals that a CCL request is still pending while the simulation
    can make no further progress (D14 F3).
    """
 # ── Trace toggle ─────────────────────────────────────────────────────
 # Cached trace flag; reload_trace_setting() overwrites it from the
 # KERNBENCH_CCL_TRACE environment variable.
 _TRACE_ENABLED: bool = False
 def reload_trace_setting() -> None:
    """Refresh the cached trace flag from ``KERNBENCH_CCL_TRACE``.

    The flag becomes True when the variable, stripped and lower-cased, is one
    of ``1``, ``true``, ``yes`` or ``on``; otherwise (including when the
    variable is unset) it becomes False.
    """
    global _TRACE_ENABLED
    raw = os.environ.get("KERNBENCH_CCL_TRACE", "")
    normalized = raw.strip().lower()
    _TRACE_ENABLED = normalized in ("1", "true", "yes", "on")
 def trace_enabled() -> bool:
    """Return the cached CCL trace flag (as last set by ``reload_trace_setting``)."""
    return _TRACE_ENABLED
 # Initialise once at import time so trace_enabled() reflects the
 # KERNBENCH_CCL_TRACE value present when this module was loaded.
 reload_trace_setting()
 # ── Trace event functions ────────────────────────────────────────────
 def log_send(
    t_ns: float,
    sender: str,
    direction: str,
    nbytes: int,
    sender_seq: int,
 ) -> None:
    """Print one CCL send trace line to stdout (flushed) when tracing is on.

    Does nothing while the module-level trace flag is False.
    """
    if _TRACE_ENABLED:
        line = f"[ccl t={t_ns:.1f} send] {sender} dir={direction} nbytes={nbytes} seq={sender_seq}"
        print(line, flush=True)
 def log_recv(
    t_ns: float,
    receiver: str,
    direction: str,
    nbytes: int,
 ) -> None:
    """Print one CCL recv trace line to stdout (flushed) when tracing is on.

    Does nothing while the module-level trace flag is False.
    """
    if _TRACE_ENABLED:
        line = f"[ccl t={t_ns:.1f} recv] {receiver} dir={direction} nbytes={nbytes}"
        print(line, flush=True)
 def log_credit_return(
    t_ns: float,
    sender: str,
    direction: str,
    consumer_seq: int,
 ) -> None:
    """Print one CCL credit-return trace line to stdout (flushed) when tracing is on.

    Does nothing while the module-level trace flag is False.
    """
    if _TRACE_ENABLED:
        line = f"[ccl t={t_ns:.1f} credit] {sender} dir={direction} seq={consumer_seq}"
        print(line, flush=True)
 # ── Pointer dump ─────────────────────────────────────────────────────
 def pointer_dump(engine: Any) -> str:
    """Render the ring-buffer pointer state of every PE_IPCQ component.

    Reads ``engine._components`` (treated as empty when the attribute is
    missing), keeps the entries whose id ends in ``.pe_ipcq`` and that expose
    a non-empty ``queue_pairs`` mapping, and emits the component id followed
    by one indented line per direction. Component ids and directions are both
    listed in sorted order.

    Returns:
        The lines joined with newlines; an empty string when nothing matches.
    """
    registry = getattr(engine, "_components", {})
    ipcq_ids = [nid for nid in sorted(registry) if nid.endswith(".pe_ipcq")]
    report: list[str] = []
    for nid in ipcq_ids:
        qps = getattr(registry[nid], "queue_pairs", {})
        if qps:
            report.append(nid)
            for d in sorted(qps):
                qp = qps[d]
                peer = qp["peer"]
                entry = (
                    f"  {d}: peer=sip{peer.sip}.cube{peer.cube}.pe{peer.pe}  "
                    f"my_head={qp['my_head']} my_tail={qp['my_tail']}  "
                    f"peer_head_cache={qp['peer_head_cache']} "
                    f"peer_tail_cache={qp['peer_tail_cache']}"
                )
                report.append(entry)
    return "\n".join(report)
 def print_pointer_dump(engine: Any) -> None:
    """Write the ``pointer_dump(engine)`` report to stdout and flush."""
    report = pointer_dump(engine)
    print(report, flush=True)
@@ -0,0 +1,118 @@
 """Helpers for CCL algorithm authors (ADR-0023 D15).
 These are pure utility functions usable from any kernel module:
    from kernbench.ccl.helpers import chunked, ring_step, tree_step
 They keep algorithm code short and free of off-by-one bugs.
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any
 # Element size in bytes for each supported dtype name; several spellings of
 # the same type (e.g. "f16" / "fp16" / "float16") map to the same size.
 _DTYPE_BYTES = {
    "f16": 2, "fp16": 2, "float16": 2, "bf16": 2,
    "f32": 4, "fp32": 4, "float32": 4,
    "i8": 1, "int8": 1,
    "i16": 2, "int16": 2,
    "i32": 4, "int32": 4,
 }
 def _itemsize(dtype: str) -> int:
    if dtype not in _DTYPE_BYTES:
        raise ValueError(f"Unsupported dtype: {dtype}")
    return _DTYPE_BYTES[dtype]
 # ── chunked ──────────────────────────────────────────────────────────
@dataclass(frozen=True)
 class Chunk:
    """One chunk of a tensor used by collective algorithms.

    Instances are immutable (``frozen=True``).
    """
    # Starting address of this chunk.
    addr: int
    # Number of elements in this chunk.
    n_elem: int
    # Size of this chunk in bytes.
    nbytes: int
 def chunked(
    base_addr: int,
    n_chunks: int,
    n_elem: int,
    dtype: str = "f16",
 ) -> list[Chunk]:
    """Slice a 1D buffer into ``n_chunks`` equal Chunks.

    Args:
        base_addr: starting address of the buffer.
        n_chunks: number of equal chunks to produce (must be >= 1).
        n_elem: total number of elements (must be divisible by n_chunks).
        dtype: element type for byte-size calculation.

    Returns:
        List of ``Chunk`` objects whose addresses are consecutive.

    Raises:
        ValueError: if n_chunks is not positive, if n_elem is not divisible
            by n_chunks, or if dtype is not supported.
    """
    if n_chunks <= 0:
        # Zero would surface as a bare ZeroDivisionError below, and a
        # negative count would silently produce an empty list.
        raise ValueError(f"chunked: n_chunks must be positive, got {n_chunks}")
    if n_elem % n_chunks != 0:
        raise ValueError(
            f"chunked: n_elem ({n_elem}) not divisible by n_chunks ({n_chunks})"
        )
    per_chunk_elem = n_elem // n_chunks
    per_chunk_bytes = per_chunk_elem * _itemsize(dtype)
    return [
        Chunk(
            addr=base_addr + i * per_chunk_bytes,
            n_elem=per_chunk_elem,
            nbytes=per_chunk_bytes,
        )
        for i in range(n_chunks)
    ]
 # ── ring_step ────────────────────────────────────────────────────────
 def ring_step(rank: int, step: int, world_size: int) -> tuple[int, int]:
    """Compute which chunks a rank sends and receives at one ring step.

    Follows the reduce-scatter / all-gather ring schedule: at step ``s``,
    rank ``r`` sends chunk ``(r - s) % world_size`` and receives chunk
    ``(r - s - 1) % world_size``.

    Typical use inside a ring kernel:
        for step in range(world_size - 1):
            send_idx, recv_idx = ring_step(rank, step, world_size)
            # send chunks[send_idx], then receive into chunks[recv_idx]

    Returns:
        ``(send_chunk_idx, recv_chunk_idx)``.
    """
    offset = rank - step
    return offset % world_size, (offset - 1) % world_size
 # ── tree_step ────────────────────────────────────────────────────────
 def tree_step(rank: int, world_size: int) -> dict[str, Any]:
    """Look up a rank's neighbours in a binary tree rooted at rank 0.

    The parent of rank ``r > 0`` is ``(r - 1) // 2``; its candidate children
    are ``2r + 1`` and ``2r + 2``, kept only when below ``world_size``.

    Returns:
        ``{"parent": int|None, "children": list[int]}`` -- ``parent`` is
        ``None`` for rank 0 (and any non-positive rank).
    """
    if rank > 0:
        parent = (rank - 1) // 2
    else:
        parent = None
    first_child = 2 * rank + 1
    children = [c for c in (first_child, first_child + 1) if c < world_size]
    return {"parent": parent, "children": children}
@@ -0,0 +1,285 @@
 """IPCQ install plan for AhbmCCLBackend (ADR-0023 D10/D11/D12).
 Given a ccl.yaml config, the topology, and the engine, this module:
 1. Loads ccl.yaml and resolves the chosen algorithm.
 2. Maps each rank to a (sip, cube, pe) PE address using a linear scheme.
 3. Allocates per-rank IPCQ ring buffer base addresses (synthetic but
   unique-per-PE; see notes below).
 4. Builds neighbor tables via the algorithm's ``topology`` field plus the
   optional ``neighbors()`` override hook from the algorithm module.
 5. Wires bidirectional credit-return SimPy Stores between every (PE, peer)
   pair.
 6. Installs each PE_IPCQ component's neighbor table directly via its
   ``_install_neighbors`` sideband call (equivalent to fan-out IpcqInitMsg
   without going through fabric).
 Address scheme
 --------------
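 For the first implementation we use a synthetic address scheme that
 keeps addresses unique per (sip, cube, pe, direction) without going
 through ``PEMemAllocator``, provided ``cube`` and ``pe`` each fit in
 8 bits and one PE's rx buffers (all directions) fit in 2**24 bytes.
 The address is encoded as: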
    base = IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
    rx_base[direction_idx] = base + direction_idx * (n_slots * slot_size)
 The ``buffer_kind`` (tcm/hbm/sram) selects the *MemoryStore space* into
 which data is written. Within a space, addresses are unique per PE so
 the existing MemoryStore (``{space: {addr: ndarray}}``) handles them
 naturally.
 This bypasses the topology's address resolver / PhysAddr encoding and
 treats IPCQ buffers as a separate, parallel address namespace. Real PA
 encoding can be plugged in later without changing the rest of the design.
 """
 from __future__ import annotations
 from pathlib import Path
 from typing import Any
 import simpy
 import yaml
 from kernbench.ccl.topologies import resolve_topology
 from kernbench.common.ipcq_types import (
    IpcqEndpoint,
    IpcqInitEntry,
 )
 from kernbench.runtime_api.kernel import IpcqInitMsg
 # Bit 60 marks an address as belonging to the synthetic IPCQ namespace.
 _IPCQ_BASE = 1 << 60
 def _ipcq_base_for_pe(sip: int, cube: int, pe: int) -> int:
    """Return the synthetic IPCQ base address for one PE.

    The PE coordinates are packed into disjoint shifted fields below the
    ``_IPCQ_BASE`` marker bit: ``pe`` at bit 24, ``cube`` at bit 32 and
    ``sip`` at bit 40.
    """
    pe_bits = pe << 24
    cube_bits = cube << 32
    sip_bits = sip << 40
    return _IPCQ_BASE | sip_bits | cube_bits | pe_bits
 # ── ccl.yaml loading ─────────────────────────────────────────────────
 def load_ccl_config(path: str | Path | None = None) -> dict:
    """Load ccl.yaml and check that its required top-level sections exist.

    Args:
        path: explicit location of the config file. When None, the first
            existing file among ``<cwd>/ccl.yaml`` and ``ccl.yaml`` in the
            directory three levels above this module (the project root) is
            used.
    Returns:
        The parsed configuration mapping.
    Raises:
        FileNotFoundError: if no path was given and no candidate exists.
        ValueError: if the document is not a mapping, or lacks the
            ``defaults`` or ``algorithms`` section.
    """
    if path is None:
        candidates = (
            Path.cwd() / "ccl.yaml",
            Path(__file__).resolve().parents[3] / "ccl.yaml",
        )
        path = next((p for p in candidates if p.exists()), None)
    if path is None:
        raise FileNotFoundError(
            "ccl.yaml not found. Place it at project root or cwd."
        )
    # Explicit encoding keeps parsing independent of the platform locale.
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # safe_load yields None for an empty document and a scalar/list for a
    # non-mapping one; reject those here so the section checks below cannot
    # fail with a TypeError (or pass by accident on a list).
    if not isinstance(cfg, dict):
        raise ValueError(
            f"ccl.yaml must be a YAML mapping, got {type(cfg).__name__}"
        )
    if "defaults" not in cfg:
        raise ValueError("ccl.yaml missing 'defaults' section")
    if "algorithms" not in cfg:
        raise ValueError("ccl.yaml missing 'algorithms' section")
    return cfg
 def resolve_algorithm_config(cfg: dict, name: str | None = None) -> dict:
    """Merge the ``defaults`` section with the chosen algorithm's overrides.

    Args:
        cfg: parsed ccl.yaml mapping (see ``load_ccl_config``).
        name: algorithm to resolve; when falsy, ``defaults.algorithm`` is
            used instead.
    Returns:
        A new flat dict: the defaults, overlaid with the algorithm's own
        entries, plus ``"algorithm"`` set to the resolved name. Consumers
        expect keys such as module, topology, buffer_kind, backpressure,
        n_slots and slot_size; ``world_size`` and
        ``ipcq_credit_size_bytes`` are optional. ``cfg`` is not mutated.
    Raises:
        ValueError: if no algorithm name can be determined, or the name is
            not listed under ``algorithms``.
    """
    # ``or {}``: a YAML key with an empty body parses to None, which must
    # behave like an empty section rather than raise a TypeError.
    defaults = dict(cfg.get("defaults") or {})
    algo_name = name or defaults.get("algorithm")
    if algo_name is None:
        raise ValueError("ccl.yaml: defaults.algorithm not set")
    algos = cfg.get("algorithms") or {}
    if algo_name not in algos:
        raise ValueError(
            f"ccl.yaml: algorithm '{algo_name}' not in algorithms section"
        )
    # An algorithm listed with no overrides (None) simply inherits defaults.
    merged = {**defaults, **(algos[algo_name] or {})}
    merged["algorithm"] = algo_name
    return merged
 # ── rank → PE mapping ────────────────────────────────────────────────
 def linear_rank_to_pe(rank: int, spec: dict) -> tuple[int, int, int]:
    """Map a rank to ``(sip, cube, pe)`` using linear topology order.

    Ranks are laid out PE-major within a cube, cube-major within a SIP:
    ``rank = (sip * cubes_per_sip + cube) * pes_per_cube + pe``.

    Args:
        rank: global rank, ``0 <= rank < total PE count``.
        spec: topology spec providing ``system.sips.count``,
            ``sip.cube_mesh.w/h`` and ``cube.pe_layout`` (``pe_per_corner``
            and ``corners``).
    Returns:
        ``(sip, cube, pe)`` with cube and pe as SIP-local / cube-local
        indices.
    Raises:
        ValueError: if ``rank`` is negative or not below the total PE count.
    """
    mesh = spec["sip"]["cube_mesh"]
    pe_layout = spec["cube"]["pe_layout"]
    pes_per_cube = pe_layout["pe_per_corner"] * len(pe_layout["corners"])
    pes_per_sip = mesh["w"] * mesh["h"] * pes_per_cube
    total = spec["system"]["sips"]["count"] * pes_per_sip
    # A negative rank would otherwise floor-divide into negative coordinates.
    if rank < 0:
        raise ValueError(f"rank {rank} must be non-negative")
    if rank >= total:
        raise ValueError(
            f"rank {rank} exceeds total PE count {total}"
        )
    sip, rem = divmod(rank, pes_per_sip)
    cube, pe = divmod(rem, pes_per_cube)
    return sip, cube, pe
 # ── Install plan ─────────────────────────────────────────────────────
 def install_ipcq(
    engine: Any,
    spec: dict,
    cfg: dict,
    algo_module: Any | None = None,
    rank_to_pe: list[tuple[int, int, int]] | None = None,
 ) -> dict[str, Any]:
    """Build neighbor tables and install them in every participating PE_IPCQ.

    The install proceeds in these steps:
      1. Determine ``world_size`` (from ``cfg`` or derived from ``spec``)
         and the rank → (sip, cube, pe) mapping.
      2. Resolve the topology function and compute each rank's neighbor map.
      3. Assign each (PE, direction) a synthetic rx buffer base address.
      4. Make sure every participating PE_IPCQ has a credit inbox store.
      5. For each rank, build one ``IpcqInitEntry`` per neighbor direction
         and hand the resulting ``IpcqInitMsg`` to the component's
         ``_install_neighbors``.

    Args:
        engine: GraphEngine with ``_components`` dict (and ``_env``).
        spec: topology spec dict.
        cfg: merged algorithm config (from ``resolve_algorithm_config``).
        algo_module: optional algorithm Python module (for neighbors override).
        rank_to_pe: optional explicit rank → (sip, cube, pe) mapping. If
                    None, the default linear mapping is used.
    Returns:
        A diagnostics dict with the install plan (world size, rank → PE map,
        buffer kind, neighbor table).
    Raises:
        ValueError: if ``rank_to_pe`` is given but its length differs from
            ``world_size``.
        KeyError: if a required ``cfg`` key or an expected PE_IPCQ component
            id is missing.
    """
    if "world_size" in cfg:
        world_size = int(cfg["world_size"])
    else:
        # Topology-derived fallback (mirrors AhbmCCLBackend / RuntimeContext).
        sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
        cm = spec.get("sip", {}).get("cube_mesh", {})
        cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1))
        pl = spec.get("cube", {}).get("pe_layout", {})
        corners = pl.get("corners", [])
        pe_per_corner = int(pl.get("pe_per_corner", 1))
        pes_per_cube = pe_per_corner * max(len(corners), 1)
        world_size = sips * cubes_per_sip * pes_per_cube
    buffer_kind = cfg["buffer_kind"]
    n_slots = int(cfg["n_slots"])
    slot_size = int(cfg["slot_size"])
    backpressure = cfg["backpressure"]
    credit_size_bytes = int(cfg.get("ipcq_credit_size_bytes", 16))
    # Step 1: rank → (sip, cube, pe)
    rank_pe: list[tuple[int, int, int]]
    if rank_to_pe is not None:
        if len(rank_to_pe) != world_size:
            raise ValueError(
                f"rank_to_pe has {len(rank_to_pe)} entries but world_size={world_size}"
            )
        rank_pe = list(rank_to_pe)
    else:
        rank_pe = [linear_rank_to_pe(r, spec) for r in range(world_size)]
    # Step 2: resolve topology fn (with optional override) and build the
    # per-rank neighbor map.
    topo_fn = resolve_topology(cfg["topology"], algo_module=algo_module)
    neighbor_table: dict[int, dict[str, int]] = {
        r: topo_fn(r, world_size) for r in range(world_size)
    }
    # Step 3: pull the live engine reference for each PE_IPCQ
    components = engine._components
    def pe_ipcq_id(s: int, c: int, p: int) -> str:
        """Component id of the PE_IPCQ that belongs to PE (s, c, p)."""
        return f"sip{s}.cube{c}.pe{p}.pe_ipcq"
    # Step 4: per-PE rx_base address and per-PE credit_inbox.
    # Directions are sorted so every PE uses the same direction → offset map.
    direction_keys = sorted({d for nt in neighbor_table.values() for d in nt})
    direction_idx = {d: i for i, d in enumerate(direction_keys)}
    bytes_per_direction = n_slots * slot_size
    def rx_base(s: int, c: int, p: int, d: str) -> int:
        """Base address of PE (s, c, p)'s rx ring for direction ``d``."""
        return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
    # Wire bidirectional credit stores: backend creates the SimPy Stores
    # by reading each rank's PE_IPCQ.credit_inbox property.
    rank_to_credit_inbox: dict[int, simpy.Store] = {}
    for r, (s, c, p) in enumerate(rank_pe):
        comp = components[pe_ipcq_id(s, c, p)]
        # Trigger lazy creation of credit_inbox if not yet started.
        # PE_IPCQ.start() creates it; we ensure it exists.
        if comp._credit_inbox is None:
            comp._credit_inbox = simpy.Store(engine._env)
        rank_to_credit_inbox[r] = comp.credit_inbox
    # Step 5: build IpcqInitMsg per rank and call _install_neighbors directly
    plan: dict[str, Any] = {
        "world_size": world_size,
        "rank_to_pe": rank_pe,
        "buffer_kind": buffer_kind,
        "neighbor_table": neighbor_table,
    }
    _OPPOSITE_DIR = {
        "E": "W", "W": "E", "N": "S", "S": "N",
        "global_E": "global_W", "global_W": "global_E",
        "global_N": "global_S", "global_S": "global_N",
    }
    def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
        """Find peer's direction that reciprocates my_dir→peer_rank.

        Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
        pointing back to us (ADR-0025 D1). This matters in 2-rank
        bidirectional rings where both E and W on one side point to the
        same peer — without the preference, dict-order first-match would
        route data into the wrong rx slot. Falls back to any direction
        pointing back for topologies without an opposite convention
        (e.g. tree_binary's parent/child). Returns None when the peer has
        no direction pointing back at ``my_rank``.
        """
        nt = neighbor_table[peer_rank]
        opp = _OPPOSITE_DIR.get(my_dir)
        if opp is not None and nt.get(opp) == my_rank:
            return opp
        for d, target in nt.items():
            if target == my_rank:
                return d
        return None
    for r, (s, c, p) in enumerate(rank_pe):
        my_pe_ipcq = components[pe_ipcq_id(s, c, p)]
        nbrs = neighbor_table[r]
        entries: list[IpcqInitEntry] = []
        for d, peer_rank in nbrs.items():
            if peer_rank is None:
                continue
            peer_s, peer_c, peer_p = rank_pe[peer_rank]
            peer_dir = reverse_direction(r, peer_rank, d)
            if peer_dir is None:
                # Peer doesn't have a reverse entry — skip (asymmetric topology)
                continue
            # The peer receives our data in the ring it keeps for the
            # direction that points back at us.
            peer_endpoint = IpcqEndpoint(
                sip=peer_s, cube=peer_c, pe=peer_p,
                buffer_kind=buffer_kind,
                rx_base_pa=rx_base(peer_s, peer_c, peer_p, peer_dir),
                rx_base_va=0,
                n_slots=n_slots, slot_size=slot_size,
            )
            entries.append(IpcqInitEntry(
                direction=d,
                peer=peer_endpoint,
                my_rx_base_pa=rx_base(s, c, p, d),
                my_rx_base_va=0,
                n_slots=n_slots, slot_size=slot_size,
                peer_credit_store=rank_to_credit_inbox[peer_rank],
            ))
        msg = IpcqInitMsg(
            correlation_id="ccl_init", request_id=f"init_r{r}",
            target_sips=(s,), target_cubes=(c,), target_pe=p,
            entries=tuple(entries),
            backpressure_mode=backpressure,
            buffer_kind=buffer_kind,
            credit_size_bytes=credit_size_bytes,
        )
        my_pe_ipcq._install_neighbors(msg)
    return plan
@@ -0,0 +1,104 @@
 """SFR configuration for intercube + inter-SIP IPCQ wiring.
 Provides ``configure_sfr_intercube_multisip`` which programs PE_IPCQ
 neighbor tables for:
  1. Intercube within each SIP — pe0 of every cube connects to pe0 of
     its N/S/E/W mesh neighbors (no wrap-around).
  2. Inter-SIP on ALL cubes — pe0 of cube_c on sip_A connects to pe0 of
     cube_c on each peer SIP, using ``global_E``/``global_W`` (ring) or
     ``global_N``/``global_S``/``global_E``/``global_W`` (mesh/torus)
     direction labels.  Wiring all cubes allows the kernel to
     dynamically elect the root cube at runtime.
 SIP-level topology is read from ``topology.yaml`` →
 ``system.sips.topology`` (e.g. ``ring_1d``, ``mesh_2d``).
 Intercube mesh dimensions come from ``sip.cube_mesh.w/h``.
 Internally delegates to ``install_ipcq`` with a computed ``rank_to_pe``
 (pe0-only) and a closure-captured ``neighbors()`` function.
 """
 from __future__ import annotations
 import types
 from typing import Any
 from kernbench.ccl.install import install_ipcq
 from kernbench.ccl.topologies import _BUILTIN as _TOPO_BUILTINS
 def configure_sfr_intercube_multisip(
    engine: Any,
    spec: dict,
    cfg: dict,
 ) -> dict[str, Any]:
    """Wire IPCQ for intercube (pe0, mesh) + inter-SIP (pe0, all cubes).

    Only pe0 of each cube participates. Participants are indexed as
    ``pe_idx = sip * n_cubes + cube``; within a SIP, cubes are numbered
    row-major over the ``sip.cube_mesh`` grid.

    Args:
        engine: GraphEngine with ``_components``.
        spec: topology spec dict (from topology.yaml).
        cfg: merged algorithm config (from ``resolve_algorithm_config``).
            It is copied, not mutated; ``world_size`` and ``topology`` are
            overridden in the copy.
    Returns:
        The install plan dict from ``install_ipcq``.
    Raises:
        ValueError: if ``system.sips.topology`` is not a builtin topology.
    """
    cube_mesh = spec["sip"]["cube_mesh"]
    mesh_w = int(cube_mesh["w"])
    mesh_h = int(cube_mesh["h"])
    n_cubes = mesh_w * mesh_h
    sips_cfg = spec.get("system", {}).get("sips", {})
    n_sips = int(sips_cfg.get("count", 1))
    sip_topology = str(sips_cfg.get("topology", "ring_1d"))
    if sip_topology not in _TOPO_BUILTINS:
        raise ValueError(
            f"Unknown sip topology '{sip_topology}'. "
            f"Available: {list(_TOPO_BUILTINS)}"
        )
    sip_topo_fn = _TOPO_BUILTINS[sip_topology]
    # One participant per (sip, cube), always PE 0, in pe_idx order.
    pe_idx_to_pe: list[tuple[int, int, int]] = []
    for sip_id in range(n_sips):
        for cube_id in range(n_cubes):
            pe_idx_to_pe.append((sip_id, cube_id, 0))
    def _neighbors(pe_idx: int, ws: int, _base: dict) -> dict[str, int]:
        """Neighbor map of one participant; ``ws`` and ``_base`` are unused."""
        sip, cube = divmod(pe_idx, n_cubes)
        row, col = divmod(cube, mesh_w)
        sip_base = sip * n_cubes
        # Intercube links within the SIP (mesh, no wrap-around). Each entry
        # is (direction, link exists?, SIP-local cube index of the peer).
        mesh_links = (
            ("E", col < mesh_w - 1, cube + 1),
            ("W", col > 0, cube - 1),
            ("S", row < mesh_h - 1, cube + mesh_w),
            ("N", row > 0, cube - mesh_w),
        )
        nbrs: dict[str, int] = {
            label: sip_base + peer_cube
            for label, present, peer_cube in mesh_links
            if present
        }
        # Inter-SIP links on ALL cubes: same cube index on the peer SIP.
        if n_sips > 1:
            nbrs.update(
                (f"global_{d}", peer_sip * n_cubes + cube)
                for d, peer_sip in sip_topo_fn(sip, n_sips).items()
            )
        return nbrs
    # install_ipcq picks up ``neighbors`` from the module-like object; the
    # "none" builtin supplies an empty base map for it to replace.
    install_cfg = {**cfg, "world_size": n_sips * n_cubes, "topology": "none"}
    return install_ipcq(
        engine, spec, install_cfg,
        algo_module=types.SimpleNamespace(neighbors=_neighbors),
        rank_to_pe=pe_idx_to_pe,
    )
@@ -0,0 +1,163 @@
 """Builtin neighbor topology generators for CCL backend (ADR-0023 D11).
 Each generator takes ``(rank, world_size)`` and returns a
 ``dict[direction, peer_rank]`` for that rank. ``direction`` is one of
 ``"N" | "S" | "E" | "W"`` for ring/mesh, or
 ``"parent" | "child_left" | "child_right"`` for tree topologies.
 Algorithm modules may override the generated map by defining a
 ``neighbors(rank, world_size, neighbor_map) -> dict | None`` function in
 the same module (see D11 / D15). ``resolve_topology`` wires these together.
 """
 from __future__ import annotations
 from typing import Any, Callable
 NeighborMap = dict[str, int]
 TopologyFn = Callable[[int, int], NeighborMap]
 # ── Builtin generators ───────────────────────────────────────────────
 def ring_1d(rank: int, world_size: int) -> NeighborMap:
    """Neighbors on a 1D bidirectional ring.

    ``"E"`` is the next rank and ``"W"`` the previous one, both wrapping
    modulo ``world_size``.
    """
    east = (rank + 1) % world_size
    west = (rank - 1) % world_size
    return {"E": east, "W": west}
 def ring_1d_unidir(rank: int, world_size: int) -> NeighborMap:
    """Neighbor on a 1D one-way ring: only ``"E"``, the next rank (wrapping)."""
    successor = rank + 1
    return dict(E=successor % world_size)
 def mesh_2d(rank: int, world_size: int) -> NeighborMap:
    """Square 2D mesh (N/S/E/W) with wrap-around (torus) on all four edges.

    Layout: ``rank = row * side + col``, with ``side = sqrt(world_size)``.

    Args:
        rank: index of the node being queried.
        world_size: total node count; must be a positive perfect square.
    Returns:
        Map with keys ``"N"``, ``"S"``, ``"W"``, ``"E"`` (in that order).
    Raises:
        ValueError: if ``world_size`` is not a positive perfect square.
    """
    # Guard non-positive sizes before taking the root: 0 would lead to a
    # division by zero below, and a negative base yields a complex root.
    side = int(round(world_size ** 0.5)) if world_size > 0 else 0
    if world_size <= 0 or side * side != world_size:
        raise ValueError(
            f"mesh_2d requires square world_size, got {world_size}"
        )
    r, c = divmod(rank, side)
    return {
        "N": ((r - 1) % side) * side + c,
        "S": ((r + 1) % side) * side + c,
        "W": r * side + (c - 1) % side,
        "E": r * side + (c + 1) % side,
    }
 def tree_binary(rank: int, world_size: int) -> NeighborMap:
    """Neighbors in a binary tree rooted at rank 0.

    The parent of rank ``r > 0`` is ``(r - 1) // 2``; its children are
    ``2r + 1`` and ``2r + 2`` when those indices are below ``world_size``.

    Returned keys (only those that exist, in this order):
        "parent", "child_left", "child_right"
    """
    links: NeighborMap = {"parent": (rank - 1) // 2} if rank > 0 else {}
    first_child = 2 * rank + 1
    candidates = (("child_left", first_child), ("child_right", first_child + 1))
    links.update((label, idx) for label, idx in candidates if idx < world_size)
    return links
 def torus_2d(rank: int, world_size: int) -> NeighborMap:
    """Square 2D torus (N/S/E/W), wrapping on every edge.

    Delegates to ``mesh_2d``, which already wraps; this name exists so a
    SIP-level topology can ask for a torus explicitly.
    """
    wrapped: NeighborMap = mesh_2d(rank, world_size)
    return wrapped
 def mesh_2d_no_wrap(rank: int, world_size: int) -> NeighborMap:
    """Square 2D mesh (N/S/E/W) WITHOUT wrap-around.

    Layout: ``rank = row * side + col``, with ``side = sqrt(world_size)``.
    Edge nodes have fewer neighbors (no wrapping). Used for SIP-level
    topologies where physical links don't wrap.

    Args:
        rank: index of the node being queried.
        world_size: total node count; must be a positive perfect square.
    Returns:
        Map holding only the directions that exist for this node, inserted
        in the order ``"N"``, ``"S"``, ``"W"``, ``"E"``.
    Raises:
        ValueError: if ``world_size`` is not a positive perfect square.
    """
    # Guard non-positive sizes before taking the root: 0 would lead to a
    # division by zero below, and a negative base yields a complex root.
    side = int(round(world_size ** 0.5)) if world_size > 0 else 0
    if world_size <= 0 or side * side != world_size:
        raise ValueError(
            f"mesh_2d_no_wrap requires square world_size, got {world_size}"
        )
    r, c = divmod(rank, side)
    n: NeighborMap = {}
    if r > 0:
        n["N"] = (r - 1) * side + c
    if r < side - 1:
        n["S"] = (r + 1) * side + c
    if c > 0:
        n["W"] = r * side + (c - 1)
    if c < side - 1:
        n["E"] = r * side + (c + 1)
    return n
 def none(rank: int, world_size: int) -> NeighborMap:
    """Return a fresh empty map; the algorithm's neighbors() supplies every link."""
    return dict()
 # Registry of the builtin topology generators, keyed by topology name.
 # ``resolve_topology`` looks names up here and lists these keys in its
 # "Unknown topology" error message.
 _BUILTIN: dict[str, TopologyFn] = {
    "ring_1d": ring_1d,
    "ring_1d_unidir": ring_1d_unidir,
    "mesh_2d": mesh_2d,
    "torus_2d": torus_2d,
    "mesh_2d_no_wrap": mesh_2d_no_wrap,
    "tree_binary": tree_binary,
    "none": none,
 }
 # ── Resolution ───────────────────────────────────────────────────────
 def resolve_topology(
    name: str, algo_module: Any | None = None,
 ) -> TopologyFn:
    """Build the ``(rank, world_size) -> NeighborMap`` callable for a topology.

    Args:
        name: builtin topology name from ccl.yaml; must be a key of the
              ``_BUILTIN`` registry.
        algo_module: optional algorithm module. If it is truthy and has a
              callable ``neighbors(rank, world_size, neighbor_map)``
              attribute, that hook runs after the builtin generator.
              A hook result of None keeps the builtin map; any other
              result replaces it.
    Returns:
        The builtin generator itself when there is no usable hook,
        otherwise a wrapper that applies the hook to the builtin's output.
    Raises:
        ValueError: if ``name`` is not a known builtin.
    """
    if name not in _BUILTIN:
        raise ValueError(
            f"Unknown topology '{name}'. "
            f"Available builtins: {list(_BUILTIN)}"
        )
    builtin_fn = _BUILTIN[name]
    hook = getattr(algo_module, "neighbors", None) if algo_module else None
    if hook is not None and callable(hook):
        def _wrapped(rank: int, world_size: int) -> NeighborMap:
            generated = builtin_fn(rank, world_size)
            overridden = hook(rank, world_size, generated)
            return generated if overridden is None else overridden
        return _wrapped
    return builtin_fn
@@ -21,6 +21,10 @@ def build_parser() -> argparse.ArgumentParser:
    runp.add_argument(
        "--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
    )
    runp.add_argument(
        "--verify-data", action="store_true", default=False,
        help="Enable Phase 2 data verification (ADR-0020)",
    )
    runp.set_defaults(_handler=cmd_run)
    probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
@@ -36,9 +40,11 @@ def build_parser() -> argparse.ArgumentParser:
    return p
-def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
+def engine_factory(
    topology: object, device: DeviceSelector, *, enable_data: bool = False,
 ) -> SimEngine:
    topo_obj = getattr(topology, "topology_obj", topology)
-    return GraphEngine(topo_obj)
+    return GraphEngine(topo_obj, enable_data=enable_data)
 def cmd_web(args) -> int:
@@ -53,8 +59,12 @@ def cmd_run(args) -> int:
    topo = resolve_topology(args.topology)
    bench = resolve_bench(args.bench)
    device = resolve_device(args.device)
    verify_data = getattr(args, "verify_data", False)
-    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=engine_factory)
+    def _factory(topology, device):
        return engine_factory(topology, device, enable_data=verify_data)
    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory)
    topo_obj = getattr(topo, "topology_obj", topo)
    spec = getattr(topo_obj, "spec", None)
@@ -62,6 +72,18 @@ def cmd_run(args) -> int:
        print(format_report(result.traces, title=args.bench, spec=spec))
    print(result.summary_text())
    # Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
    # already runs inside engine.wait() → _flush_data_phase(). We only
    # print the summary here; no redundant re-execution.
    if verify_data and result.engine is not None:
        op_log = result.engine.op_log
        if op_log:
            n_gemm = sum(1 for r in op_log if r.op_kind == "gemm")
            n_math = sum(1 for r in op_log if r.op_kind == "math")
            print(f"[data] Phase 2 complete: {len(op_log)} ops ({n_gemm} gemm, {n_math} math)")
        else:
            print("[data] No op_log recorded — skipping Phase 2")
    return 0 if result.completion.ok else 1
@@ -0,0 +1,249 @@
 """IPCQ schemas and exceptions (ADR-0023 D2.5, D12, D14 F1).
 This module contains the data structures and exceptions used by the
 PE-level IPCQ collective communication infrastructure. The host-facing
 sideband fan-out message ``IpcqInitMsg`` lives in
 ``kernbench.runtime_api.kernel`` (alongside other fabric messages),
 while all internal token / metadata / command schemas are kept here.
 Layering:
    PE_CPU       --IpcqRequest(IpcqSendCmd|IpcqRecvCmd)--> PE_IPCQ
    PE_IPCQ      --IpcqDmaToken-->                         PE_DMA (vc_comm)
    PE_DMA       --IpcqMetaArrival-->                      PE_IPCQ (atomic, D9)
    PE_IPCQ      --IpcqCreditMetadata-->                   peer PE_IPCQ (fast path, D9)
 See ADR-0023 for the full design.
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Union
 if TYPE_CHECKING:
    import simpy
 # ── D14 F1: invalid direction exception ──────────────────────────────
 class IpcqInvalidDirection(ValueError):
    """Error for a send/recv direction that has no installed neighbor.

    Raised when a kernel calls tl.send/recv with a direction that has no
    neighbor installed for this PE. Subclasses ``ValueError``, so handlers
    catching ``ValueError`` also catch it.
    """
 # ── D2.5: IpcqEndpoint ───────────────────────────────────────────────
@dataclass(frozen=True)
 class IpcqEndpoint:
    """Everything a sender needs to compute the peer's rx_buffer address (D2.5).

    Sender PE_IPCQ uses this to compute the destination PA for its DMA
    write into the peer's rx ring buffer slot:
        slot_idx = sender.my_head % peer.n_slots
        dst_pa   = peer.rx_base_pa + slot_idx * peer.slot_size
    """
    sip: int                     # destination SIP
    cube: int                    # destination cube
    pe: int                      # destination PE (cube-local index)
    buffer_kind: str             # "tcm" | "hbm" | "sram"
    rx_base_pa: int              # peer rx_buffer base PA (PhysAddr.encode())
    rx_base_va: int              # peer rx_buffer base VA (optional, MMU)
    n_slots: int                 # peer ring depth (wrap-around modulo)
    slot_size: int               # peer slot size (offset multiplier)
 # ── D12: IpcqInitEntry (used by IpcqInitMsg in kernel.py) ────────────
@dataclass(frozen=True)
 class IpcqInitEntry:
    """One direction's neighbor entry that backend installs into a PE_IPCQ
    via IpcqInitMsg (kernbench.runtime_api.kernel.IpcqInitMsg, D12).

    Pairs the peer's endpoint (where this PE sends to) with this PE's own
    rx ring parameters for the same direction, plus the credit store used
    to return credits to the peer.
    """
    direction: str               # direction label, e.g. "N" | "S" | "E" | "W"
    peer: IpcqEndpoint           # see D2.5
    my_rx_base_pa: int           # this PE's own rx_buffer base
    my_rx_base_va: int           # this PE's own rx_buffer base VA (optional)
    n_slots: int                 # this PE's ring depth
    slot_size: int               # this PE's slot size
    # Credit fast path channel (D9).
    # Contract: must be a simpy.Store instance dedicated to receiving
    # IpcqCreditMetadata objects only. Backend wires it once at init time
    # and the receiving PE_IPCQ owns its consumer side; the sender (peer's
    # PE_IPCQ) puts IpcqCreditMetadata directly into this store via
    # _delayed_credit_send. Do not put any other object type.
    peer_credit_store: "simpy.Store"
 # ── D12: IpcqSendCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
@dataclass(frozen=True)
 class IpcqSendCmd:
    """tl.send command issued by the kernel to PE_IPCQ.

    Describes the source buffer to transmit in one direction; ``data`` may
    carry a snapshot of the payload (see the field comment below).
    """
    direction: str               # direction label, e.g. "N" | "S" | "E" | "W"
    src_addr: int                # source data address (TCM/HBM/SRAM)
    src_space: str               # "tcm" | "hbm" | "sram"
    nbytes: int                  # payload size in bytes
    shape: tuple[int, ...]       # data shape (op_log + MemoryStore use)
    dtype: str                   # element type name
    handle_id: str               # completion tracking
    # In-flight data snapshot captured at tl.send() time from the
    # TensorHandle.data field. Carries the actual numpy array that was
    # visible at recv-time (when handle.data was populated), avoiding a
    # Phase 1 race where a later IPCQ inbound overwrites the sender's
    # slot between recv and send. If None, PE_DMA outbound falls back to
    # reading MemoryStore[src_addr] (correct for sources that are never
    # overwritten, such as HBM tiles).
    data: Any = None
    data_op: bool = True         # ADR-0020 op_log recording flag
 # ── D12: IpcqRecvCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
@dataclass(frozen=True)
 class IpcqRecvCmd:
    """tl.recv command issued by the kernel to PE_IPCQ.

    Two modes (recv_mode):
        "return_slot" — return slot address as-is (default, zero-copy).
                        Kernel uses the slot memory directly.
        "copy_to_dst" — copy slot data to dst_addr, then return.
    """
    direction: str | None        # None → round-robin (weak fairness, D4)
    shape: tuple[int, ...]       # expected data shape
    dtype: str                   # expected element type name
    handle_id: str               # identifier of the handle this recv fills
    recv_mode: str = "return_slot"  # "return_slot" | "copy_to_dst" (see above)
    dst_addr: int = 0            # used only when recv_mode == "copy_to_dst"
    dst_space: str = ""          # used only when recv_mode == "copy_to_dst"
    blocking: bool = True        # NOTE(review): presumably False means a non-blocking poll; confirm in PE_IPCQ
    data_op: bool = True         # ADR-0020 op_log recording flag
 # ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ───────────────────
@dataclass
 class IpcqDmaToken:
    """Token sent from PE_IPCQ to PE_DMA (vc_comm channel) carrying both
    the data move request and the piggyback metadata (ADR-0023 D9).

    Receiving PE_DMA processes this atomically (I6 MUST):
        1. MemoryStore.write(dst_endpoint.buffer_kind, dst_addr, data)
        2. Forward IpcqMetaArrival(token=self) to peer PE_IPCQ
    No yield is allowed between the two steps.

    The ``data`` field is a snapshot taken by the sender's PE_DMA at the
    moment the send is issued. This preserves "in-flight data" semantics:
    if the sender mutates its source memory after issuing the send but
    before arrival, the receiver still gets the snapshot. The snapshot is
    None for control-only tokens (e.g. credit-only updates).

    Unlike the command dataclasses in this module, this one is declared
    with a plain ``@dataclass`` (not frozen).
    """
    # ── Data movement (single-hop DMA write) ──
    src_addr: int                # source address on the sending PE
    src_space: str               # source memory space name
    dst_addr: int                # already-computed peer rx slot PA
    dst_endpoint: IpcqEndpoint   # routing target (sip/cube/pe) + buffer_kind
    nbytes: int                  # payload size in bytes
    handle_id: str               # completion notify back to sender PE_IPCQ
    # Optional shape/dtype carried for op_log + MemoryStore convenience.
    shape: tuple[int, ...] = ()
    dtype: str = "f16"
    # In-flight data snapshot (sender PE_DMA captures this at send time).
    data: Any = None
    # ── Piggyback metadata (D9) ──
    sender_seq: int = 0          # monotonic; receiver updates peer_head_cache
    src_sip: int = 0             # sending PE's SIP index
    src_cube: int = 0            # sending PE's cube index
    src_pe: int = 0              # sending PE's cube-local index
    src_direction: str = "E"     # sender-side direction; receiver maps to its own
    data_op: bool = True         # ADR-0020 op_log recording flag
 # ── D12: IpcqMetaArrival (PE_DMA → PE_IPCQ, intra-PE wire) ──────────
@dataclass
 class IpcqMetaArrival:
    """Arrival notice wrapping the ``IpcqDmaToken`` that was just written.

    Posted by receiving PE_DMA into the destination PE's PE_IPCQ inbox
    in the same SimPy step as the MemoryStore.write (D9, I6 MUST).
    The receiver PE_IPCQ uses ``token.sender_seq`` to update its
    peer_head_cache for the corresponding direction.
    """
    token: IpcqDmaToken          # the token whose payload has been written
 # ── D12: IpcqCreditMetadata (PE_IPCQ → peer PE_IPCQ, fast path) ─────
@dataclass(frozen=True)
 class IpcqCreditMetadata:
    """Credit return — recv-side → send-side fast path (D9).

    Sent by ``PeIpcqComponent._delayed_credit_send`` after a
    bottleneck-BW based latency, putting the metadata directly into
    the peer's pre-wired credit store (no fabric routing).

    ``dst_rx_base_pa`` is the receiver's ``my_rx_base_pa`` for the direction
    whose slot was consumed. The original sender matches this against
    ``qp.peer.rx_base_pa`` to find the correct direction (ADR-0025 D3) —
    unambiguous even when multiple directions share the same peer (e.g.
    2-rank bidirectional ring).
    """
    consumer_seq: int            # my_tail at recv side (new tail value)
    dst_rx_base_pa: int          # receiver-side my_rx_base_pa (ADR-0025 D3)
    src_sip: int                 # which peer is sending the credit (diag)
    src_cube: int                # credit sender's cube index (diag)
    src_pe: int                  # credit sender's cube-local PE index (diag)
    src_direction: str           # sender-side direction (peer maps to its own)
 # ── Request wrapper (PE_CPU → PE_IPCQ) ───────────────────────────────
@dataclass
 class IpcqRequest:
    """Wrapper carrying an IpcqSendCmd or IpcqRecvCmd plus a SimPy completion
    event. Posted by PE_CPU into PE_IPCQ's inbox; PE_IPCQ calls
    ``done.succeed()`` when the request is fully processed.

    For recv requests, the result (slot address, direction, dtype, shape)
    is written into ``result_data`` so the caller can read it after wait.
    """
    command: "IpcqSendCmd | IpcqRecvCmd"   # the send or recv command to execute
    done: "simpy.Event"                    # succeeded by PE_IPCQ on completion
    # Filled for recv requests; each instance gets its own fresh dict.
    result_data: dict[str, Any] = field(default_factory=dict)
 # ── RecvFuture (kernel ↔ runner handshake for tl.recv_async / tl.wait) ─
@dataclass
 class RecvFuture:
    """Opaque future returned by ``tl.recv_async``.

    The KernelRunner attaches a SimPy event and the IpcqRequest in the
    background; ``tl.wait(future)`` switches back to the runner which
    yields on the event and resolves the result into a TensorHandle.
    Only ``cmd`` is required at construction; the other fields start
    unset and are filled in later.
    """
    cmd: "IpcqRecvCmd"          # the recv command this future stands for
    request: Any = None         # IpcqRequest (set by runner)
    event: Any = None           # simpy.Event (set by runner)
    resolved: bool = False      # NOTE(review): presumably set True once wait() has resolved the result
    result: Any = None          # cached TensorHandle after wait()
@@ -33,6 +33,7 @@ class TensorHandle:
    dtype: str
    nbytes: int                      # total byte size
    data: object = None              # reserved for validate mode
    space: str = "tcm"               # MemoryStore space ("tcm" | "hbm" | "sram")
@dataclass(frozen=True)
@@ -55,6 +56,7 @@ class DmaReadCmd:
    handle: TensorHandle
    src_addr: int
    nbytes: int
    data_op: bool = True
@dataclass(frozen=True)
@@ -64,6 +66,7 @@ class DmaWriteCmd:
    handle: TensorHandle
    dst_addr: int
    nbytes: int
    data_op: bool = True
@dataclass(frozen=True)
@@ -79,6 +82,7 @@ class GemmCmd:
    m: int
    k: int
    n: int
    data_op: bool = True
@dataclass(frozen=True)
@@ -94,6 +98,7 @@ class MathCmd:
    inputs: tuple[TensorHandle, ...]
    out: TensorHandle
    axis: int | None = None          # for reductions
    data_op: bool = True
@dataclass(frozen=True)
@@ -111,6 +116,7 @@ class CompositeCmd:
    out_addr: int
    out_nbytes: int
    math_op: str | None = None       # for op="math": which math operation
    data_op: bool = True
@dataclass(frozen=True)
@@ -33,6 +33,7 @@ class ComponentBase(ABC):
        self.ctx = ctx
        self.in_ports: dict[str, simpy.Store] = {}
        self.out_ports: dict[str, simpy.Store] = {}
        self._op_logger: Any | None = None  # OpLogger, set by GraphEngine if enabled
    def start(self, env: simpy.Environment) -> None:
        """Called once after all ports are wired.
@@ -64,9 +65,21 @@ class ComponentBase(ABC):
            txn: Any = yield self._inbox.get()
            env.process(self._forward_txn(env, txn))
    def _on_process_start(self, env: simpy.Environment, msg: Any) -> None:
        """Op log hook: record service start for data_op messages (ADR-0020 D2)."""
        if self._op_logger and getattr(msg, "data_op", False):
            self._op_logger.record_start(env.now, self.node.id, msg)
    def _on_process_end(self, env: simpy.Environment, msg: Any) -> None:
        """Op log hook: record service end for data_op messages (ADR-0020 D2)."""
        if self._op_logger and getattr(msg, "data_op", False):
            self._op_logger.record_end(env.now, self.node.id, msg)
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Apply run() latency, then forward to next hop or drain at terminal."""
        self._on_process_start(env, txn)
        yield from self.run(env, txn.nbytes)
        self._on_process_end(env, txn)
        next_hop = txn.next_hop          # duck-typed: Transaction.next_hop
        if next_hop:
            yield self.out_ports[next_hop].put(txn.advance())
@@ -120,10 +133,16 @@ class PeEngineBase(ComponentBase):
        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, PeInternalTxn):
-                env.process(self.handle_command(env, msg))
+                env.process(self._handle_with_hooks(env, msg))
            else:
                env.process(self._forward_txn(env, msg))
    def _handle_with_hooks(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Wrap handle_command with op log hooks on the inner command.

        The hooks receive ``pe_txn.command`` (not the wrapping transaction),
        so logging is keyed on the command object itself.

        Args:
            env: SimPy environment passed through to the hooks and handler.
            pe_txn: PE-internal transaction; its ``command`` attribute is
                what the start/end hooks are given.
        """
        self._on_process_start(env, pe_txn.command)
        # Delegate to the subclass handler; the end hook runs only after the
        # handler generator has run to completion.
        yield from self.handle_command(env, pe_txn)
        self._on_process_end(env, pe_txn.command)
    @abstractmethod
    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Process a PE-internal command (PeInternalTxn).
@@ -42,6 +42,30 @@ class PeCpuComponent(ComponentBase):
            self._cube_idx = int(parts[1].replace("cube", ""))
        except (IndexError, ValueError):
            self._cube_idx = 0
        # num_cubes from spec (for tl.program_id(axis=1) — ADR-0022)
        spec = ctx.spec if ctx else {}
        cube_mesh = spec.get("sip", {}).get("cube_mesh", {})
        if cube_mesh:
            self._num_cubes = int(cube_mesh.get("w", 1)) * int(cube_mesh.get("h", 1))
        else:
            self._num_cubes = (
                spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 1)
            )
        # PE-local scratch for kernel math output handles (ADR-0020 D3
        # extension; reserved portion of TCM addressed via a synthetic
        # MemoryStore key, not the real PA encoder).
        pe_template = spec.get("cube", {}).get("pe_template", {})
        tcm_attrs = pe_template.get("components", {}).get("pe_tcm", {}).get("attrs", {})
        scratch_mb = float(tcm_attrs.get("kernel_scratch_mb", 1))
        self._tl_scratch_size = int(scratch_mb * (1 << 20))
        # PE-unique base address — high bit pattern to avoid collision with
        # IPCQ ring buffers (which use bit 60).
        self._tl_scratch_base = (
            (1 << 61)
            | (self._sip_idx << 40)
            | (self._cube_idx << 32)
            | (self._pe_idx << 24)
        )
    def _find_shard(self, shards: tuple) -> Any:
        """Find shard matching this PE's (sip, cube, pe). Fallback to positional index."""
@@ -65,24 +89,45 @@ class PeCpuComponent(ComponentBase):
                yield from self._forward_txn(env, txn)
    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
-        """Compile kernel function and replay command trace."""
+        """Execute kernel: greenlet mode (ADR-0020) or legacy Phase 0 + replay."""
        from kernbench.common.pe_commands import (
            CompositeCmd,
            PeCpuOverheadCmd,
            PeInternalTxn,
            WaitCmd,
        )
        from kernbench.triton_emu.registry import get_kernel
        from kernbench.triton_emu.tl_context import TLContext, run_kernel
        request = txn.request
        # Phase 1: Compile — apply PE_CPU setup overhead, then run kernel
        yield from self.run(env, 0)
        kernel_fn = get_kernel(request.kernel_ref.name)
        num_programs = self._derive_num_programs(request)
        kernel_args = self._unpack_kernel_args(request)
-        # Derive num_programs from the number of PE shards in this cube
+        pe_exec_start = env.now
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
        # Choose execution mode: greenlet (ADR-0020) or legacy command-list
        store = getattr(self.ctx, "memory_store", None) if self.ctx else None
        if store is not None:
            composite_results = yield from self._execute_greenlet(
                env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
            )
        else:
            composite_results = yield from self._execute_legacy(
                env, kernel_fn, kernel_args, num_programs, scheduler_id,
            )
        # Record PE-internal execution time
        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
        total_dma_ns = 0.0
        total_compute_ns = 0.0
        for rd in composite_results:
            total_dma_ns += rd.get("dma_ns", 0.0)
            total_compute_ns += rd.get("compute_ns", 0.0)
        txn.result_data["dma_ns"] = total_dma_ns
        txn.result_data["compute_ns"] = total_compute_ns
        # Send ResponseMsg on reverse path
        yield from self._send_response(env, txn, request)
    def _derive_num_programs(self, request: Any) -> int:
        num_programs = 1
        for arg in request.args:
            if arg.arg_kind == "tensor":
@@ -92,11 +137,9 @@ class PeCpuComponent(ComponentBase):
                )
                if cube_pe_count > num_programs:
                    num_programs = cube_pe_count
        return num_programs
-        tl = TLContext(pe_id=self._pe_idx, num_programs=num_programs, dispatch_cycles=0)
+    def _unpack_kernel_args(self, request: Any) -> list:
        # Unpack KernelLaunchMsg.args into positional args for kernel function
        # TensorArg → va_base (already local, set by runtime) or PA fallback
        kernel_args: list = []
        for arg in request.args:
            if arg.arg_kind == "tensor":
@@ -107,15 +150,48 @@ class PeCpuComponent(ComponentBase):
                    kernel_args.append(shard.pa)
            elif arg.arg_kind == "scalar":
                kernel_args.append(arg.value)
        return kernel_args
    def _execute_greenlet(
        self, env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
    ) -> Generator:
        """Run the kernel through a KernelRunner (greenlet mode, ADR-0020 D3).

        Builds a runner from this PE's identity, ports, memory store and
        scratch window, drives ``runner.run`` to completion, and returns the
        runner's ``_composite_results`` (an empty list when the runner does
        not expose that attribute).
        """
        from kernbench.triton_emu.kernel_runner import KernelRunner

        runner_kwargs = {
            "pe_prefix": self._pe_prefix,
            "pe_idx": self._pe_idx,
            "sip_idx": self._sip_idx,
            "cube_idx": self._cube_idx,
            "num_cubes": self._num_cubes,
            "scheduler_id": scheduler_id,
            "out_ports": self.out_ports,
            "store": store,
            "scratch_base": self._tl_scratch_base,
            "scratch_size": self._tl_scratch_size,
        }
        kernel_runner = KernelRunner(**runner_kwargs)
        yield from kernel_runner.run(env, kernel_fn, kernel_args, num_programs)
        return getattr(kernel_runner, "_composite_results", [])
    def _execute_legacy(
        self, env, kernel_fn, kernel_args, num_programs, scheduler_id,
    ) -> Generator:
        """Legacy Phase 0 + replay: generate command list, then dispatch."""
        from kernbench.common.pe_commands import (
            CompositeCmd, PeCpuOverheadCmd, PeInternalTxn, WaitCmd,
        )
        from kernbench.triton_emu.tl_context import TLContext, run_kernel
        tl = TLContext(
            pe_id=self._pe_idx, num_programs=num_programs,
            cube_id=self._cube_idx, num_cubes=self._num_cubes,
            dispatch_cycles=0,
        )
        run_kernel(kernel_fn, tl, *kernel_args)
        commands = tl.commands
-        # Phase 2: Replay — dispatch commands to PE_SCHEDULER
+        pending: dict[str, simpy.Event] = {}
-        pe_exec_start = env.now
+        composite_results: list[dict] = []
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
        pending: dict[str, simpy.Event] = {}  # completion_id → done event
        composite_results: list[dict] = []  # collect result_data from CompositeCmd txns
        for cmd in commands:
            if isinstance(cmd, PeCpuOverheadCmd):
@@ -126,47 +202,30 @@ class PeCpuComponent(ComponentBase):
                    if evt:
                        yield evt
                else:
                    # Wait all pending completions
                    for evt in pending.values():
                        yield evt
                    pending.clear()
            elif isinstance(cmd, CompositeCmd):
                # Non-blocking: dispatch to scheduler, track completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
-                    command=cmd, done=done_evt,
+                    command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
                    pe_prefix=self._pe_prefix,
                )
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                pending[cmd.completion.id] = done_evt
            else:
                # Blocking: dispatch and wait for completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
-                    command=cmd, done=done_evt,
+                    command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
                    pe_prefix=self._pe_prefix,
                )
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield done_evt
        # Wait for any remaining pending completions
        for evt in pending.values():
            yield evt
        return composite_results
-        # Record PE-internal execution time
+    def _send_response(self, env, txn, request) -> Generator:
        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
        # Aggregate dma_ns / compute_ns from CompositeCmd results
        total_dma_ns = 0.0
        total_compute_ns = 0.0
        for rd in composite_results:
            total_dma_ns += rd.get("dma_ns", 0.0)
            total_compute_ns += rd.get("compute_ns", 0.0)
        txn.result_data["dma_ns"] = total_dma_ns
        txn.result_data["compute_ns"] = total_compute_ns
        # Send ResponseMsg on reverse path (PE_CPU → NOC → M_CPU)
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) >= 2:
            from kernbench.runtime_api.kernel import ResponseMsg
@@ -105,6 +105,203 @@ class PeDmaComponent(PeEngineBase):
        yield sub_done
        pe_txn.done.succeed()
    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox dispatch loop for PE_DMA.

        Each message is handed to its own SimPy process, chosen in this order:
        IpcqDmaToken (outbound IPCQ), TileToken (pipeline), PeInternalTxn
        (legacy, with op log hooks), a message whose ``request`` is an
        IpcqDmaToken (inbound IPCQ), anything else (plain fabric forward).
        """
        from kernbench.common.ipcq_types import IpcqDmaToken
        from kernbench.common.pe_commands import PeInternalTxn
        from kernbench.components.builtin.pe_types import TileToken

        while True:
            incoming: Any = yield self._inbox.get()
            if isinstance(incoming, IpcqDmaToken):
                # Token from the local PE_IPCQ heading out over the fabric.
                handler = self._handle_ipcq_outbound
            elif isinstance(incoming, TileToken):
                handler = self._pipeline_process
            elif isinstance(incoming, PeInternalTxn):
                handler = self._handle_with_hooks
            elif isinstance(getattr(incoming, "request", None), IpcqDmaToken):
                # Transaction that arrived from the fabric carrying an IPCQ token.
                handler = self._handle_ipcq_inbound
            else:
                handler = self._forward_txn
            env.process(handler(env, incoming))
    # ── IPCQ outbound (PE_IPCQ → PE_DMA → fabric) ───────────────────
    def _handle_ipcq_outbound(self, env: simpy.Environment, token: Any) -> Generator:
        """Forward IpcqDmaToken from local PE_IPCQ through the fabric to peer
        PE_DMA. ADR-0023 D8 (vc_comm channel)."""
        if self.ctx is None:
            return  # nothing to do
        peer = token.dst_endpoint
        peer_pe_dma = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}.pe_dma"
        # Snapshot the source data at send time (D9 in-flight semantics).
        # Without this, the receiver could read stale or future data if the
        # sender mutates src_addr between send issue and DMA arrival.
        store = getattr(self.ctx, "memory_store", None)
        if store is not None and token.data is None:
            try:
                snap = store.read(
                    token.src_space, token.src_addr,
                    shape=token.shape, dtype=token.dtype,
                )
                # Copy so later mutations to src_addr don't affect the snapshot.
                token.data = snap.copy() if hasattr(snap, "copy") else snap
            except Exception:
                token.data = None
        # Note: ipcq_copy is recorded at INBOUND time (in _handle_ipcq_inbound),
        # not here. Outbound time is too early — it precedes fabric propagation,
        # so in Phase 2 a later round's copy can sort before the receiver's
        # math for an earlier round, causing slot data corruption.
        # The secondary sort in DataExecutor (memory ops before math at the
        # same t_start) ensures the inbound copy runs before the local math
        # that reads the slot.
        try:
            path = self.ctx.router.find_path(self._pe_prefix, peer_pe_dma)
        except Exception:
            return
        drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)
        sub_done = env.event()
        sub_txn = Transaction(
            request=token, path=path, step=0,
            nbytes=token.nbytes, done=sub_done, drain_ns=drain_ns,
        )
        if len(path) > 1:
            next_hop = path[1]
            if next_hop in self.out_ports:
                yield self.out_ports[next_hop].put(sub_txn.advance())
            else:
                return
        # Note: don't wait on sub_done here — fire-and-forget for vc_comm.
        # IPCQ slot bookkeeping (peer_head) was already updated by PE_IPCQ;
        # backpressure is via credit return, not via this DMA's completion.
    # ── IPCQ inbound (fabric → PE_DMA → MemoryStore + PE_IPCQ) ──────
    def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
        """At destination PE_DMA: atomically write data and forward metadata.

        I6 (MUST): no SimPy yield between MemoryStore.write and the
        IpcqMetaArrival put into PE_IPCQ.

        Args:
            env: SimPy environment; ``env.now`` timestamps the op-log copy
                record (start and end are the same instant).
            txn: Fabric transaction whose ``request`` is the IpcqDmaToken and
                whose ``done`` event is succeeded once handling finishes.

        Note:
            MemoryStore and op-logger failures are swallowed (best-effort);
            the metadata arrival is still forwarded in that case.
        """
        from kernbench.common.ipcq_types import IpcqMetaArrival
        token = txn.request
        # ── ATOMIC: do not introduce yield between these two operations ──
        # 1. Move data via MemoryStore (single-hop DMA write).
        # Prefer the in-flight snapshot stashed by the sender PE_DMA;
        # fall back to a fresh read of src_addr if no snapshot is present
        # (e.g. control-only token).
        store = getattr(self.ctx, "memory_store", None) if self.ctx else None
        if store is not None:
            try:
                data = token.data
                if data is None:
                    data = store.read(
                        token.src_space, token.src_addr,
                        shape=token.shape, dtype=token.dtype,
                    )
                store.write(token.dst_endpoint.buffer_kind, token.dst_addr, data)
            except Exception:
                # NOTE(review): a failed read/write is dropped silently here.
                pass
        # Record the IPCQ copy at INBOUND time with embedded data snapshot.
        # The snapshot (token.data) was captured by the sender's outbound
        # PE_DMA at send time. Phase 2 writes the snapshot directly to
        # dst — it does NOT re-read from MemoryStore[src_addr], which may
        # have been mutated by a different PE's Phase 2 ops by that point.
        # DataExecutor's secondary sort (memory before math at same
        # t_start) ensures the write completes before the local math
        # that reads the slot.
        if self._op_logger is not None:
            try:
                self._op_logger.record_copy(
                    t_start=float(env.now), t_end=float(env.now),
                    component_id=self.node.id,
                    src_space=token.src_space, src_addr=token.src_addr,
                    dst_space=token.dst_endpoint.buffer_kind,
                    dst_addr=token.dst_addr,
                    shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
                    snapshot=token.data,
                )
            except Exception:
                # Logging is best-effort and must not disturb the data path.
                pass
        # 2. Forward IpcqMetaArrival to local PE_IPCQ
        ipcq_id = f"{self._pe_prefix}.pe_ipcq"
        if ipcq_id in self.out_ports:
            yield self.out_ports[ipcq_id].put(IpcqMetaArrival(token=token))
        # ─────────────────────────────────────────────────────────────────
        # Guard against succeeding an event that has already been triggered.
        if not txn.done.triggered:
            txn.done.succeed()
    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
        """Pipeline mode: DMA read/write via fabric, then self-route."""
        self._on_process_start(env, token)
        yield from self._do_pipeline_dma(env, token)
        self._on_process_end(env, token)
        # Self-routing (handle same-component consecutive stages)
        next_stage = token.advance()
        while next_stage is not None and next_stage.component == self.node.id:
            self._on_process_start(env, token)
            yield from self._do_pipeline_dma(env, token)
            self._on_process_end(env, token)
            next_stage = token.advance()
        if next_stage is not None:
            yield self.out_ports[next_stage.component].put(token)
        else:
            token.pipeline_ctx.complete_tile()
    def _do_pipeline_dma(self, env, token):
        """Core DMA logic for pipeline mode.

        Reads the address and size from ``token.params`` (``dst_addr`` for a
        DMA_WRITE stage, ``src_addr`` otherwise). When there is something to
        move and a context is present, it acquires the matching DMA channel,
        issues a PeDmaMsg transaction on the first hop of the routed path, and
        waits for that transaction's completion event.
        """
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg
        from kernbench.components.builtin.pe_types import StageType

        stage_params = token.params
        writing = token.current_stage.stage_type == StageType.DMA_WRITE
        if writing:
            target_addr = stage_params.get("dst_addr", 0)
        else:
            target_addr = stage_params.get("src_addr", 0)
        size = stage_params.get("nbytes", 0)
        if not (size > 0 and self.ctx):
            return

        channel = self._dma_write if writing else self._dma_read
        assert channel is not None
        dst_node = self.ctx.resolver.resolve(PhysAddr.decode(target_addr))
        route = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(route, size)
        with channel.request() as grant:
            yield grant
            completion = env.event()
            dma_msg = PeDmaMsg(
                correlation_id="pipeline",
                request_id=f"tile_{token.tile_id}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_addr, nbytes=size,
                is_write=writing,
            )
            fabric_txn = Transaction(
                request=dma_msg, path=route, step=0,
                nbytes=size, done=completion, drain_ns=drain_ns,
            )
            if len(route) > 1:
                yield self.out_ports[route[1]].put(fabric_txn.advance())
        # The channel is released on leaving the ``with``; completion is
        # awaited afterwards.
        yield completion
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
        # Response transactions bypass DMA channel (no outbound resource needed)
@@ -0,0 +1,77 @@
 """PE_FETCH_STORE: TCM ↔ Register File transfer unit (ADR-0021 D5).
 Handles both fetch (TCM → register) and store (register → TCM).
 BW serialization is delegated to PE_TCM via port communication.
 """
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeFetchStoreComponent(PeEngineBase):
    """PE_FETCH_STORE: TCM ↔ Register File (ADR-0021 D5).

    Receives TileTokens via pipeline self-routing.
    Sends TcmRequest to PE_TCM for BW-based latency.
    Legacy PeInternalTxn commands are dispatched through
    ``_handle_with_hooks`` so they are wrapped by the op log hooks.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the component and derive the sibling PE_TCM node id."""
        super().__init__(node, ctx)
        self._tcm_id = f"{self._pe_prefix}.pe_tcm"

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Apply the fixed ``overhead_ns`` node attribute (0.0 when unset).

        ``nbytes`` is not used here; bandwidth latency is charged by PE_TCM.
        """
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Handle both PeInternalTxn (legacy) and TileToken (pipeline)."""
        from kernbench.common.pe_commands import PeInternalTxn
        from kernbench.components.builtin.pe_types import TileToken
        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, TileToken):
                env.process(self._pipeline_process(env, msg))
            elif isinstance(msg, PeInternalTxn):
                # Go through the hook wrapper rather than calling
                # handle_command directly, so op logging is not bypassed.
                env.process(self._handle_with_hooks(env, msg))
            else:
                env.process(self._forward_txn(env, msg))

    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
        """Process a pipeline TileToken: fetch or store via TCM.

        Reads ``direction`` (default ``"read"``) and ``nbytes`` (default 0)
        from ``token.params``. A TcmRequest is issued, and its completion
        awaited, only when ``nbytes`` is positive and a port to PE_TCM exists.
        The token is then routed to its next stage, or the tile is completed
        when no stage remains.
        """
        from kernbench.components.builtin.pe_tcm import TcmRequest
        self._on_process_start(env, token)
        direction = token.params.get("direction", "read")
        nbytes = token.params.get("nbytes", 0)
        if nbytes > 0 and self._tcm_id in self.out_ports:
            done = env.event()
            yield self.out_ports[self._tcm_id].put(
                TcmRequest(direction=direction, nbytes=nbytes, done=done)
            )
            yield done
        self._on_process_end(env, token)
        # Self-routing: advance to next stage
        next_stage = token.advance()
        if next_stage is not None:
            yield self.out_ports[next_stage.component].put(token)
        else:
            token.pipeline_ctx.complete_tile()

    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Legacy PeInternalTxn handling: apply overhead, then signal done."""
        yield from self.run(env, 0)
        pe_txn.done.succeed()
@@ -1,6 +1,18 @@
 """PE_GEMM: matrix multiplication engine (ADR-0021 D6).
 Handles both legacy PeInternalTxn (GemmCmd) and pipeline TileToken.
 In pipeline mode, receives token after fetch stage, computes MAC, chains to next.
 MAC latency model (from pe_accel):
  cycles = ceil(Tm/mac_m) * ceil(Tk/mac_k) * ceil(Tn/mac_n)
  latency_ns = cycles / clock_freq_ghz
 Falls back to TFLOPS model when mac dimensions not configured.
 """
 from __future__ import annotations
 from collections.abc import Generator
 from math import ceil
 from typing import TYPE_CHECKING, Any
 import simpy
@@ -12,33 +24,29 @@ if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 # dtype → bit width (for TFLOPS scaling)
 _DTYPE_BITS: dict[str, int] = {
    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
    "f32": 32, "fp32": 32, "float32": 32,
-    "i8": 8, "int8": 8,
+    "i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
    "i16": 16, "int16": 16,
    "i32": 32, "int32": 32,
 }
 class PeGemmComponent(PeEngineBase):
-    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
+    """PE_GEMM: MAC array (ADR-0021 D6).
-    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
+    In pipeline mode: pure compute — register data already fetched.
-    exclusive with PE_MATH within the same PE.
+    In legacy mode: handles PeInternalTxn(GemmCmd) with shared accel_slot.
    Compute latency model:
      FLOPs = 2 * M * K * N
      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
      compute_ns = FLOPs / (effective_tflops * 1e3)
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Read GEMM engine parameters from the node attributes.

        Missing attributes fall back to: ``peak_tflops_f16`` 0.0, the three
        MAC dimensions 0, and ``clock_freq_ghz`` 1.0. The shared accelerator
        resource starts unset.
        """
        super().__init__(node, ctx)
        attrs = node.attrs
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(attrs.get("peak_tflops_f16", 0.0))
        # Cycle-accurate MAC dimensions (from pe_accel)
        self._mac_m: int = int(attrs.get("mac_m", 0))
        self._mac_k: int = int(attrs.get("mac_k", 0))
        self._mac_n: int = int(attrs.get("mac_n", 0))
        self._clock_freq: float = float(attrs.get("clock_freq_ghz", 1.0))
    def init_resources(self, env: simpy.Environment) -> None:
        resource_name = self.node.attrs.get("shared_resource")
@@ -47,8 +55,15 @@ class PeGemmComponent(PeEngineBase):
                env, f"{self._pe_prefix}.{resource_name}"
            )
-    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
+    def _compute_ns_mac(self, m: int, k: int, n: int) -> float:
-        """Compute GEMM latency in nanoseconds."""
+        """Cycle-accurate MAC latency (pe_accel model)."""
        if self._mac_m > 0 and self._mac_k > 0 and self._mac_n > 0:
            cycles = ceil(m / self._mac_m) * ceil(k / self._mac_k) * ceil(n / self._mac_n)
            return cycles / self._clock_freq
        return 0.0
    def _compute_ns_tflops(self, m: int, k: int, n: int, dtype: str = "f16") -> float:
        """TFLOPS-based latency (legacy model)."""
        if self._peak_tflops_f16 <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        dtype_bits = _DTYPE_BITS.get(dtype, 16)
@@ -56,11 +71,58 @@ class PeGemmComponent(PeEngineBase):
        flops = 2.0 * m * k * n
        return flops / (effective_tflops * 1e3)
    def _compute_ns(self, m: int, k: int, n: int, dtype: str = "f16") -> float:
        """Choose best available latency model."""
        mac_ns = self._compute_ns_mac(m, k, n)
        if mac_ns > 0:
            return mac_ns
        return self._compute_ns_tflops(m, k, n, dtype)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Apply the fixed ``overhead_ns`` node attribute (0.0 when unset)."""
        attrs = self.node.attrs
        yield env.timeout(float(attrs.get("overhead_ns", 0.0)))
    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox dispatch loop for PE_GEMM.

        TileTokens go to the pipeline path, PeInternalTxn to the hook-wrapped
        legacy handler, and anything else to plain transaction forwarding.
        Each message gets its own SimPy process.
        """
        from kernbench.common.pe_commands import PeInternalTxn
        from kernbench.components.builtin.pe_types import TileToken

        while True:
            item: Any = yield self._inbox.get()
            if isinstance(item, TileToken):
                handler = self._pipeline_process
            elif isinstance(item, PeInternalTxn):
                handler = self._handle_with_hooks
            else:
                handler = self._forward_txn
            env.process(handler(env, item))
    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
        """Pipeline mode: pure MAC compute, then self-route."""
        self._on_process_start(env, token)
        m = token.params.get("m", 0)
        k = token.params.get("k", 0)
        n = token.params.get("n", 0)
        if self._accel:
            with self._accel.request() as req:
                yield req
                ns = self._compute_ns(m, k, n)
                yield env.timeout(ns)
        else:
            ns = self._compute_ns(m, k, n)
            yield env.timeout(ns)
        self._on_process_end(env, token)
        # Self-routing
        next_stage = token.advance()
        if next_stage is not None:
            yield self.out_ports[next_stage.component].put(token)
        else:
            token.pipeline_ctx.complete_tile()
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Legacy PeInternalTxn handling."""
        from kernbench.common.pe_commands import GemmCmd
        cmd = pe_txn.command
@@ -81,7 +143,6 @@ class PeGemmComponent(PeEngineBase):
        pe_txn.done.succeed()
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Transaction forwarding with accel_slot acquisition."""
        if self._accel:
            with self._accel.request() as req:
                yield req
@@ -0,0 +1,479 @@
 """PE_IPCQ component (ADR-0023): per-PE IPCQ control plane.
 Responsibilities:
    - Hold per-direction queue pair state (my_head, my_tail,
      peer_head_cache, peer_tail_cache, ring buffer addresses)
    - Process IpcqInitMsg from backend to install neighbor table
    - Handle IpcqRequest(IpcqSendCmd) from PE_CPU:
        compute peer slot address, check backpressure, forward
        IpcqDmaToken to PE_DMA (vc_comm)
    - Handle IpcqRequest(IpcqRecvCmd) from PE_CPU:
        wait for data arrival, return slot address (or copy to dst),
        send fast-path credit return
    - Handle IpcqMetaArrival from PE_DMA: update peer_head_cache, wake recv
    - Handle IpcqCreditMetadata via own credit_inbox: update peer_tail_cache,
      wake send
 PE_IPCQ does NOT move data — it forwards IpcqDmaToken to PE_DMA which
 performs the actual fabric DMA.
 Credit return uses a fast path: PE_IPCQ creates a SimPy process with a
 bottleneck-BW based latency, then puts IpcqCreditMetadata directly into
 the peer's pre-wired credit_store.
 """
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.common.ipcq_types import (
    IpcqCreditMetadata,
    IpcqDmaToken,
    IpcqInvalidDirection,
    IpcqMetaArrival,
    IpcqRecvCmd,
    IpcqRequest,
    IpcqSendCmd,
 )
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.runtime_api.kernel import IpcqInitMsg
    from kernbench.topology.types import Node
 _DIR_ORDER: tuple[str, ...] = ("N", "S", "E", "W", "parent", "child_left", "child_right")
 class PeIpcqComponent(ComponentBase):
    """PE_IPCQ: ring buffer pointer + neighbor management for CCL.
    Owned by one PE; talks to PE_DMA via out_ports[<pe_dma_id>] and
    receives credit return metadata via the public ``credit_inbox``
    SimPy Store (wired by backend at IpcqInitMsg installation time).
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Create an IPCQ component with empty, not-yet-installed state.

        The owning PE's prefix is the node id with its last dotted segment
        removed (e.g. ``"sip0.cube0.pe0.pe_ipcq"`` -> ``"sip0.cube0.pe0"``).
        The sip/cube/pe indices are parsed from that prefix; any index that
        cannot be parsed falls back to 0.
        """
        super().__init__(node, ctx)
        # Strict shape/dtype validation (D14 F2). Off by default.
        self._strict: bool = bool(node.attrs.get("strict_validation", False))
        # direction -> tokens received so far (peeked by strict-mode recv)
        self._arrived_tokens: dict[str, list] = {}
        self._pe_prefix: str = node.id.rsplit(".", 1)[0]
        segments = self._pe_prefix.split(".")

        def _index_at(position: int, label: str) -> int:
            # Strip the textual label and parse the rest; 0 on any failure.
            try:
                return int(segments[position].replace(label, ""))
            except (IndexError, ValueError):
                return 0

        self._self_sip = _index_at(0, "sip")
        self._self_cube = _index_at(1, "cube")
        self._self_pe = _index_at(2, "pe")
        self._dma_node_id = f"{self._pe_prefix}.pe_dma"
        # direction -> per-queue-pair state dict (filled by _install_neighbors)
        self._queue_pairs: dict[str, dict[str, Any]] = {}
        self._installed = False
        self._buffer_kind: str = "tcm"
        self._backpressure_mode: str = "sleep"
        self._credit_size_bytes: int = 16
        # Blocked receivers: per direction, plus those accepting any direction.
        self._recv_waiters: dict[str, list[simpy.Event]] = {}
        self._any_recv_waiters: list[simpy.Event] = []
        # Blocked senders (ring full), per direction.
        self._send_waiters: dict[str, list[simpy.Event]] = {}
        # Round-robin scan order and cursor over installed directions.
        self._rr_dirs: list[str] = []
        self._rr_cursor: int = 0
        # Created in start(), once a SimPy environment is available.
        self._credit_inbox: simpy.Store | None = None
    # ── Public ──
    @property
    def credit_inbox(self) -> simpy.Store:
        """SimPy Store that backend wires as ``peer_credit_store`` on
        every remote sender targeting this PE. Used by D9 fast path.

        Only valid after ``start()`` has created the store; before that the
        assertion below fails.
        """
        # NOTE(review): assert is stripped under ``python -O``, in which case
        # None would be returned instead of failing here.
        assert self._credit_inbox is not None, "PE_IPCQ not started yet"
        return self._credit_inbox
    @property
    def queue_pairs(self) -> dict[str, dict[str, Any]]:
        """Test/debug accessor.

        Returns the live direction -> state mapping (not a copy), so
        mutations by the caller are visible to the component.
        """
        return self._queue_pairs
    # ── Lifecycle ──
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield a single zero-duration timeout; ``nbytes`` is not used."""
        yield env.timeout(0)
    def start(self, env: simpy.Environment) -> None:
        """Create the stores this component needs and launch its processes.

        The credit inbox is created once (kept if it already exists), the
        base-class ``start`` is invoked, and a separate credit worker
        process is spawned to drain the credit inbox.
        """
        # Create credit_inbox even if there are no in_ports yet
        if self._credit_inbox is None:
            self._credit_inbox = simpy.Store(env)
        # If no in_ports were wired (e.g. unit test), still spin up workers
        if not self.in_ports:
            self._inbox = simpy.Store(env)
        super().start(env)
        env.process(self._credit_worker(env))
    # ── Worker (override of ComponentBase._worker) ──
    def _worker(self, env: simpy.Environment) -> Generator:
        """Main inbox loop: classify each message and act on it.

        Init messages (bare, or wrapped in an object exposing ``.request``)
        install the neighbor table synchronously; a wrapped init also
        completes the wrapper's ``done`` event if it has one that is still
        pending. Metadata arrivals are handled inline, IPCQ requests get
        their own process, and anything else is forwarded.
        """
        from kernbench.runtime_api.kernel import IpcqInitMsg
        while True:
            incoming: Any = yield self._inbox.get()
            wrapped = getattr(incoming, "request", None)
            if isinstance(wrapped, IpcqInitMsg):
                self._install_neighbors(wrapped)
                completion = getattr(incoming, "done", None)
                if not (completion is None or completion.triggered):
                    completion.succeed()
            elif isinstance(incoming, IpcqInitMsg):
                self._install_neighbors(incoming)
            elif isinstance(incoming, IpcqMetaArrival):
                self._handle_meta_arrival(incoming)
            elif isinstance(incoming, IpcqRequest):
                env.process(self._handle_request(env, incoming))
            else:
                # Unknown message: hand it to the base-class forwarder.
                env.process(self._forward_txn(env, incoming))
    # ── Init ──
    def _install_neighbors(self, msg: IpcqInitMsg) -> None:
        """Install (or overwrite) one queue pair per entry of ``msg``.

        Global settings (buffer kind, backpressure mode, credit size) are
        copied from the message. Each entry gets a fresh state dict with
        all four ring pointers reset to 0. Waiter lists are created for a
        direction only if it has none yet. The round-robin order is then
        rebuilt from ``_DIR_ORDER`` and the cursor reset.
        """
        self._installed = True
        self._buffer_kind = msg.buffer_kind
        self._backpressure_mode = msg.backpressure_mode
        self._credit_size_bytes = msg.credit_size_bytes
        copied_fields = (
            "peer",
            "my_rx_base_pa",
            "my_rx_base_va",
            "n_slots",
            "slot_size",
            "peer_credit_store",
        )
        for entry in msg.entries:
            state: dict[str, Any] = {name: getattr(entry, name) for name in copied_fields}
            state.update(my_head=0, my_tail=0, peer_head_cache=0, peer_tail_cache=0)
            direction = entry.direction
            self._queue_pairs[direction] = state
            self._recv_waiters.setdefault(direction, [])
            self._send_waiters.setdefault(direction, [])
        # Stable canonical scan order, restricted to installed directions.
        self._rr_dirs = [name for name in _DIR_ORDER if name in self._queue_pairs]
        self._rr_cursor = 0
    # ── Send ──
    def _handle_request(self, env: simpy.Environment, req: IpcqRequest) -> Generator:
        """Delegate an IPCQ request to the send or recv handler.

        A request whose command is neither a send nor a recv is ignored;
        in that case ``req.done`` is not touched here.
        """
        command = req.command
        if isinstance(command, IpcqSendCmd):
            handler = self._handle_send
        elif isinstance(command, IpcqRecvCmd):
            handler = self._handle_recv
        else:
            return
        yield from handler(env, req, command)
    def _handle_send(
        self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqSendCmd,
    ) -> Generator:
        """Send one message toward ``cmd.direction`` via PE_DMA.

        Steps: wait while the peer ring is full, reserve the next sequence
        number, build an ``IpcqDmaToken`` addressed at the matching peer
        slot, put it on the PE_DMA out port, optionally trace, and finally
        complete ``req.done`` if it is still pending.

        The sequence number (``my_head``) is reserved *before* the first
        yield that follows the backpressure loop. Each request runs in its
        own SimPy process, so reserving only after the DMA put would let two
        sends on the same direction (or two senders woken by one credit)
        observe the same ``my_head`` and target the same slot / sender_seq.

        Raises:
            IpcqInvalidDirection: ``cmd.direction`` has no installed queue pair.
        """
        if cmd.direction not in self._queue_pairs:
            raise IpcqInvalidDirection(
                f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
            )
        qp = self._queue_pairs[cmd.direction]
        peer = qp["peer"]
        # Backpressure: wait while ring full. Re-checked after every wake-up
        # because another sender may have taken the freed slot.
        while (qp["my_head"] - qp["peer_tail_cache"]) >= peer.n_slots:
            wait_event = env.event()
            self._send_waiters[cmd.direction].append(wait_event)
            yield wait_event
        # Reserve the sequence number atomically (no yield between the
        # capacity check above and this increment).
        seq = qp["my_head"]
        qp["my_head"] = seq + 1
        # Compute peer slot address
        slot_idx = seq % peer.n_slots
        dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
        token = IpcqDmaToken(
            src_addr=cmd.src_addr,
            src_space=cmd.src_space,
            dst_addr=dst_pa,
            dst_endpoint=peer,
            nbytes=cmd.nbytes,
            handle_id=cmd.handle_id,
            shape=cmd.shape,
            dtype=cmd.dtype,
            # Carry the handle's recv-time data snapshot so the outbound
            # PE_DMA doesn't need to re-read from MemoryStore (which may
            # have been overwritten by a later inbound in the meantime).
            data=getattr(cmd, "data", None),
            sender_seq=seq,
            src_sip=self._self_sip,
            src_cube=self._self_cube,
            src_pe=self._self_pe,
            src_direction=cmd.direction,
        )
        # Forward to PE_DMA (vc_comm)
        yield self.out_ports[self._dma_node_id].put(token)
        # Diagnostics trace (D14)
        from kernbench.ccl import diagnostics
        if diagnostics.trace_enabled():
            diagnostics.log_send(
                t_ns=float(env.now), sender=self._pe_prefix,
                direction=cmd.direction, nbytes=cmd.nbytes,
                sender_seq=seq,
            )
        if not req.done.triggered:
            req.done.succeed()
    # ── Recv ──
    def _handle_recv(
        self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqRecvCmd,
    ) -> Generator:
        """Complete one receive request.

        Waits until data is available (on ``cmd.direction``, or on any
        installed direction when it is ``None``), computes the address of
        the slot at ``my_tail``, fills ``req.result_data`` with a handle
        describing the received data, advances ``my_tail``, spawns the
        credit-return process and completes ``req.done`` if still pending.

        Raises:
            IpcqInvalidDirection: an explicit direction is not installed
                (or, via ``_wait_any_direction``, nothing is installed).
            ValueError: strict validation is on and the arrived token's
                dtype, shape or nbytes disagrees with the recv command.
        """
        if cmd.direction is None:
            direction = yield from self._wait_any_direction(env)
        else:
            if cmd.direction not in self._queue_pairs:
                raise IpcqInvalidDirection(
                    f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
                )
            direction = cmd.direction
            qp = self._queue_pairs[direction]
            # Block until the producer index has moved past our consumer index.
            while qp["peer_head_cache"] <= qp["my_tail"]:
                wait_event = env.event()
                self._recv_waiters[direction].append(wait_event)
                yield wait_event
        # No yield between here and the my_tail increment below, so the
        # slot selection and consumption are not interleaved with other recvs.
        qp = self._queue_pairs[direction]
        slot_idx = qp["my_tail"] % qp["n_slots"]
        slot_addr = qp["my_rx_base_pa"] + slot_idx * qp["slot_size"]
        # Strict validation (D14 F2): peek the next-arrived token's metadata
        # against the recv command's expected shape/dtype/nbytes.
        arrived = self._arrived_tokens.get(direction, [])
        if arrived:
            # The oldest tracked token is consumed whether or not strict
            # mode is enabled, keeping the list aligned with my_tail.
            front = arrived.pop(0)
            if self._strict:
                expected_nbytes = self._nbytes_for(cmd.shape, cmd.dtype)
                if front.dtype != cmd.dtype:
                    raise ValueError(
                        f"PE_IPCQ {self._pe_prefix} recv strict: dtype mismatch — "
                        f"sender={front.dtype} recv={cmd.dtype}"
                    )
                if front.shape != cmd.shape:
                    raise ValueError(
                        f"PE_IPCQ {self._pe_prefix} recv strict: shape mismatch — "
                        f"sender={front.shape} recv={cmd.shape}"
                    )
                if front.nbytes != expected_nbytes:
                    raise ValueError(
                        f"PE_IPCQ {self._pe_prefix} recv strict: nbytes mismatch — "
                        f"sender={front.nbytes} recv={expected_nbytes}"
                    )
        # Default result handle: points at the rx slot itself.
        req.result_data["src_space"] = self._buffer_kind
        req.result_data["src_addr"] = slot_addr
        req.result_data["direction"] = direction
        req.result_data["dtype"] = cmd.dtype
        req.result_data["shape"] = cmd.shape
        req.result_data["nbytes"] = self._nbytes_for(cmd.shape, cmd.dtype)
        # copy_to_dst mode: rebind the result handle to (dst_space, dst_addr).
        # When op_log is disabled, we also do the actual data move now;
        # when op_log is enabled, Phase 2 replays the slot→dst copy from
        # the op_log entry below so we don't pollute the slot in Phase 1.
        if cmd.recv_mode == "copy_to_dst" and self.ctx is not None:
            req.result_data["src_space"] = cmd.dst_space
            req.result_data["src_addr"] = cmd.dst_addr
            store = getattr(self.ctx, "memory_store", None)
            if store is not None and self._op_logger is None:
                # Best-effort copy: any failure is swallowed silently.
                # NOTE(review): a failed copy leaves dst unwritten while the
                # handle already points at it — consider logging here.
                try:
                    data = store.read(self._buffer_kind, slot_addr, shape=cmd.shape, dtype=cmd.dtype)
                    store.write(cmd.dst_space, cmd.dst_addr, data)
                except Exception:
                    pass
            if self._op_logger is not None:
                # Record slot → dst copy for Phase 2 replay (ADR-0023 D9.5).
                # Recording is also best-effort; errors are ignored.
                try:
                    self._op_logger.record_copy(
                        t_start=float(env.now), t_end=float(env.now),
                        component_id=self.node.id,
                        src_space=self._buffer_kind, src_addr=slot_addr,
                        dst_space=cmd.dst_space, dst_addr=cmd.dst_addr,
                        shape=cmd.shape, dtype=cmd.dtype,
                        nbytes=self._nbytes_for(cmd.shape, cmd.dtype),
                    )
                except Exception:
                    pass
        # Consume the slot.
        qp["my_tail"] += 1
        # Diagnostics trace (D14)
        from kernbench.ccl import diagnostics
        if diagnostics.trace_enabled():
            diagnostics.log_recv(
                t_ns=float(env.now), receiver=self._pe_prefix,
                direction=direction,
                nbytes=req.result_data.get("nbytes", 0),
            )
        # Fast path credit return — bottleneck BW based latency
        # The already-incremented my_tail is passed as the new consumer index.
        env.process(
            self._delayed_credit_send(env, direction, qp["peer_credit_store"], qp["my_tail"])
        )
        if not req.done.triggered:
            req.done.succeed()
    def _wait_any_direction(self, env: simpy.Environment) -> Generator:
        """Round-robin scan over installed directions; wait until at least one
        has data. Returns the chosen direction (str)."""
        if not self._rr_dirs:
            raise IpcqInvalidDirection(
                f"PE {self._pe_prefix}: no neighbors installed"
            )
        while True:
            n = len(self._rr_dirs)
            for i in range(n):
                idx = (self._rr_cursor + i) % n
                d = self._rr_dirs[idx]
                qp = self._queue_pairs[d]
                if qp["peer_head_cache"] > qp["my_tail"]:
                    self._rr_cursor = (idx + 1) % n
                    return d
            # Nothing available — wait until any arrival
            wait_event = env.event()
            self._any_recv_waiters.append(wait_event)
            yield wait_event
    # ── Metadata arrival from PE_DMA (D9) ──
    def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
        """Match arrival to the correct direction by dst_addr range (ADR-0025 D2).
        Each direction has a unique rx buffer address range
        ([my_rx_base_pa, my_rx_base_pa + n_slots * slot_size)). The token's
        dst_addr (set by the sender's IPCQ when computing the peer slot
        address) falls within exactly one such range. Address-based matching
        is unambiguous even when multiple directions share the same peer
        (2-rank bidirectional ring).
        """
        token = msg.token
        dst_addr = token.dst_addr
        for d, qp in self._queue_pairs.items():
            base = qp["my_rx_base_pa"]
            size = qp["n_slots"] * qp["slot_size"]
            if base <= dst_addr < base + size:
                qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
                # Track arrived token for strict-mode peek
                self._arrived_tokens.setdefault(d, []).append(token)
                # Wake any blocked recv on this direction
                waiters = self._recv_waiters.get(d, [])
                self._recv_waiters[d] = []
                for ev in waiters:
                    if not ev.triggered:
                        ev.succeed()
                # Wake any-direction waiters
                any_waiters = self._any_recv_waiters
                self._any_recv_waiters = []
                for ev in any_waiters:
                    if not ev.triggered:
                        ev.succeed()
                return
        # Unknown dst_addr — silently drop (could log)
    # ── Credit return (fast path) ──
    def _credit_worker(self, env: simpy.Environment) -> Generator:
        """Process IpcqCreditMetadata from credit_inbox.
        Matches credit to the correct direction by `credit.dst_rx_base_pa ==
        qp.peer.rx_base_pa` (ADR-0025 D3). This is unambiguous even when
        multiple directions share the same peer (2-rank bidirectional ring).
        """
        assert self._credit_inbox is not None
        while True:
            credit: IpcqCreditMetadata = yield self._credit_inbox.get()
            for d, qp in self._queue_pairs.items():
                if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
                    qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq)
                    # Wake any blocked send on this direction
                    waiters = self._send_waiters.get(d, [])
                    self._send_waiters[d] = []
                    for ev in waiters:
                        if not ev.triggered:
                            ev.succeed()
                    break
    def _delayed_credit_send(
        self,
        env: simpy.Environment,
        direction: str,
        peer_credit_store: simpy.Store,
        new_tail: int,
    ) -> Generator:
        """Return a credit to the peer after the fast-path latency (D9).

        Sleeps for ``_credit_latency_ns(direction)`` when that is positive,
        then puts an ``IpcqCreditMetadata`` into ``peer_credit_store``. The
        metadata carries ``new_tail`` as the consumer sequence and this
        PE's ``my_rx_base_pa`` for the direction, which the original sender
        compares against its ``peer.rx_base_pa`` (ADR-0025 D3).
        """
        delay_ns = self._credit_latency_ns(direction)
        if delay_ns > 0:
            yield env.timeout(delay_ns)
        # Looked up after the delay, exactly when the credit is built.
        rx_base = self._queue_pairs[direction]["my_rx_base_pa"]
        credit = IpcqCreditMetadata(
            consumer_seq=new_tail,
            dst_rx_base_pa=rx_base,
            src_sip=self._self_sip,
            src_cube=self._self_cube,
            src_pe=self._self_pe,
            src_direction=direction,
        )
        yield peer_credit_store.put(credit)
    def _credit_latency_ns(self, direction: str) -> float:
        """Compute credit fast path latency = credit_size / bottleneck_bw.
        Falls back to 0 when ctx/router is unavailable (unit-test mode).
        """
        if self.ctx is None:
            return 0.0
        qp = self._queue_pairs[direction]
        peer = qp["peer"]
        peer_pe_prefix = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}"
        try:
            path = self.ctx.router.find_path(self._pe_prefix, peer_pe_prefix)
            return self.ctx.compute_drain_ns(path, self._credit_size_bytes)
        except Exception:
            return 0.0
    # ── Helpers ──
    @staticmethod
    def _nbytes_for(shape: tuple[int, ...], dtype: str) -> int:
        from math import prod
        bits = {"f16": 16, "bf16": 16, "f32": 32, "i8": 8, "i16": 16, "i32": 32}.get(dtype, 16)
        return prod(shape) * (bits // 8) if shape else 0
@@ -1,6 +1,16 @@
 """PE_MATH: element-wise / reduction computation engine (ADR-0021 D6).
 Handles both legacy PeInternalTxn (MathCmd) and pipeline TileToken.
 In pipeline mode, receives token after fetch stage, computes SIMD, chains to next.
 SIMD latency model (from pe_accel):
  cycles = ceil(num_elements / vector_width)
  latency_ns = cycles / clock_freq_ghz
 """
 from __future__ import annotations
 from collections.abc import Generator
 from math import ceil
 from typing import TYPE_CHECKING, Any
 import simpy
@@ -14,15 +24,17 @@ if TYPE_CHECKING:
 class PeMathComponent(PeEngineBase):
-    """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).
+    """PE_MATH: SIMD/Vector unit (ADR-0021 D6).
-    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
+    In pipeline mode: pure compute — register data already fetched.
-    exclusive with PE_GEMM within the same PE.
+    In legacy mode: handles PeInternalTxn(MathCmd) with shared accel_slot.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the engine and read its SIMD parameters from node attrs."""
        super().__init__(node, ctx)
        # Shared compute resource handle; stays None unless assigned later.
        self._accel: simpy.Resource | None = None
        # SIMD lanes per cycle (attr "vector_width", default 256).
        self._vector_width: int = int(node.attrs.get("vector_width", 256))
        # Clock in GHz (attr "clock_freq_ghz", default 1.0).
        self._clock_freq: float = float(node.attrs.get("clock_freq_ghz", 1.0))
    def init_resources(self, env: simpy.Environment) -> None:
        resource_name = self.node.attrs.get("shared_resource")
@@ -31,21 +43,83 @@ class PeMathComponent(PeEngineBase):
                env, f"{self._pe_prefix}.{resource_name}"
            )
    def _compute_ns(self, num_elements: int) -> float:
        """SIMD latency (pe_accel model)."""
        if self._vector_width > 0 and self._clock_freq > 0 and num_elements > 0:
            cycles = ceil(num_elements / self._vector_width)
            return cycles / self._clock_freq
        return float(self.node.attrs.get("overhead_ns", 0.0))
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
-    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+    def _worker(self, env: simpy.Environment) -> Generator:
        from kernbench.common.pe_commands import PeInternalTxn
        from kernbench.components.builtin.pe_types import TileToken
        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, TileToken):
                env.process(self._pipeline_process(env, msg))
            elif isinstance(msg, PeInternalTxn):
                env.process(self._handle_with_hooks(env, msg))
            else:
                env.process(self._forward_txn(env, msg))
    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
        """Pipeline mode: pure SIMD compute, then self-route."""
        self._on_process_start(env, token)
        num_elements = token.params.get("num_elements", 0)
        if self._accel:
            with self._accel.request() as req:
                yield req
-                yield from self.run(env, 0)
+                ns = self._compute_ns(num_elements)
                yield env.timeout(ns)
        else:
-            yield from self.run(env, 0)
+            ns = self._compute_ns(num_elements)
            yield env.timeout(ns)
        self._on_process_end(env, token)
        # Self-routing
        next_stage = token.advance()
        if next_stage is not None:
            yield self.out_ports[next_stage.component].put(token)
        else:
            token.pipeline_ctx.complete_tile()
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute a standalone ``PeInternalTxn`` (e.g. MathCmd from CCL kernels).

        The delay is ``max(overhead_ns, _compute_ns(num_elements))``:
        the fixed per-invocation cost from node attrs versus the SIMD
        cycle model, whichever is larger. ``num_elements`` is the product
        of ``cmd.out.shape`` for a MathCmd with a non-empty shape, else 0.
        The delay is spent holding the accel resource when one is
        configured. ``pe_txn.done`` is succeeded at the end.
        """
        from kernbench.common.pe_commands import MathCmd
        import math as _math
        cmd = pe_txn.command
        if isinstance(cmd, MathCmd) and cmd.out.shape:
            element_count = _math.prod(cmd.out.shape)
        else:
            element_count = 0
        fixed_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        latency_ns = max(fixed_ns, self._compute_ns(element_count))
        if not self._accel:
            yield env.timeout(latency_ns)
        else:
            with self._accel.request() as slot:
                yield slot
                yield env.timeout(latency_ns)
        pe_txn.done.succeed()
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Transaction forwarding with accel_slot acquisition."""
        if self._accel:
            with self._accel.request() as req:
                yield req
@@ -1,3 +1,13 @@
 """PE_SCHEDULER: plan generation + tile dispatch (ADR-0021 D2).
 Receives PeInternalTxn from PE_CPU, routes to engines:
  - Simple commands (DmaReadCmd, GemmCmd, etc.) → direct dispatch to engine
  - CompositeCmd → generate TilePlan, feed tiles via _feed_loop
 Composite pipeline uses token self-routing (ADR-0021 D4):
  Scheduler only does initial dispatch + completion tracking.
  Tiles chain through components based on their plan's stage sequence.
 """
 from __future__ import annotations
 from collections.abc import Generator
@@ -14,29 +24,18 @@ if TYPE_CHECKING:
 class PeSchedulerComponent(ComponentBase):
-    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
+    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1, ADR-0021 D2).
-    Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
+    Simple commands are forwarded to the appropriate engine.
-      - DmaReadCmd / DmaWriteCmd → PE_DMA
+    CompositeCmd creates a TilePlan and feeds tiles into the pipeline.
      - GemmCmd                  → PE_GEMM
      - MathCmd                  → PE_MATH
      - CompositeCmd             → tiled pipeline (Stage 3: ADR-0014 D3.2)
-    Composite GEMM pipeline (32x64x32 tiles):
+    Single _feed_loop process per scheduler ensures FIFO command ordering.
      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
      with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
    Applies scheduler overhead_ns before dispatching each command.
    Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
    """
    # Scheduler tile dimensions (ADR-0014 D3.2)
    TILE_M = 32
    TILE_K = 64
    TILE_N = 32
    # Command → engine suffix dispatch table.
    # New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
    _CMD_DISPATCH: dict[type, str] = {}
    @classmethod
@@ -44,7 +43,6 @@ class PeSchedulerComponent(ComponentBase):
        if cls._CMD_DISPATCH:
            return
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
        cls._CMD_DISPATCH = {
            DmaReadCmd: "pe_dma",
            DmaWriteCmd: "pe_dma",
@@ -56,6 +54,13 @@ class PeSchedulerComponent(ComponentBase):
        super().__init__(node, ctx)
        self._pe_prefix = node.id.rsplit(".", 1)[0]
        self._ensure_dispatch_table()
        self._pending_feeds: simpy.Store | None = None
        self._pipeline_counter = 0
    def start(self, env: simpy.Environment) -> None:
        """Create the pending-feed store, start the base component, then launch the feed loop."""
        # The store must exist before any process that uses it is started.
        self._pending_feeds = simpy.Store(env)
        super().start(env)
        env.process(self._feed_loop(env))
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
@@ -72,174 +77,103 @@ class PeSchedulerComponent(ComponentBase):
                yield from self._forward_txn(env, msg)
    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
-        """Route a PeInternalTxn to the correct engine via dispatch table."""
+        from kernbench.common.pe_commands import CompositeCmd, PeCpuOverheadCmd
        from kernbench.common.pe_commands import CompositeCmd
-        # Scheduler overhead
+        yield from self.run(env, 0)  # scheduler overhead
        yield from self.run(env, 0)
        cmd = pe_txn.command
-        # Check dispatch table first
+        # Simple command dispatch
        engine_suffix = self._CMD_DISPATCH.get(type(cmd))
        if engine_suffix is not None:
            yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
            return
-        # CompositeCmd: tiled pipeline (not a simple forward)
+        # CompositeCmd: generate plan and feed
        if isinstance(cmd, CompositeCmd):
-            yield from self._dispatch_composite(env, pe_txn)
+            yield from self._dispatch_composite(env, pe_txn, cmd)
            return
        if isinstance(cmd, PeCpuOverheadCmd):
            yield env.timeout(cmd.cycles)
            pe_txn.done.succeed()
            return
        # Unknown command — signal done immediately
        pe_txn.done.succeed()
-    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+    def _dispatch_composite(
-        """Composite tiled pipeline (ADR-0014 D3.2).
+        self, env: simpy.Environment, pe_txn: Any, cmd: Any,
    ) -> Generator:
        """Generate plan and enqueue to feeder. Non-blocking (ADR-0021 D4)."""
        from kernbench.components.builtin.pe_types import PipelineContext
-        GEMM: 3-stage pipeline with b-tile streaming from HBM.
+        plan = self._generate_plan(cmd)
-        MATH: sequential compute + DMA_WRITE (no tiling).
+
        self._pipeline_counter += 1
        ctx = PipelineContext(
            id=f"p{self._pipeline_counter}",
            total_tiles=len(plan.tiles),
            done_event=pe_txn.done,
        )
        # Enqueue to feeder — scheduler worker returns immediately
        assert self._pending_feeds is not None
        yield self._pending_feeds.put((plan, ctx))
    def _feed_loop(self, env: simpy.Environment) -> Generator:
        """Single feeder process: FIFO command ordering (ADR-0021 D2).
        No tile feed interleaving between commands.
        Queue full → only this process blocks.
        """
-        from kernbench.common.pe_commands import CompositeCmd
+        from kernbench.components.builtin.pe_types import TileToken
        assert self._pending_feeds is not None
        while True:
            plan, ctx = yield self._pending_feeds.get()
            for tile in plan.tiles:
                first_stage = tile.stages[0]
                token = TileToken(
                    tile_id=tile.tile_id,
                    pipeline_ctx=ctx,
                    plan=tile,
                    stage_idx=0,
                    params=first_stage.params,
                )
                yield self.out_ports[first_stage.component].put(token)
    def _generate_plan(self, cmd: Any) -> Any:
        """Generate a PipelinePlan from CompositeCmd."""
        from kernbench.components.builtin.tiling import (
            generate_gemm_plan,
            generate_math_plan,
        )
        pp = self._pe_prefix
        bpe = 2  # default bytes per element (f16)
        cmd = pe_txn.command
        assert isinstance(cmd, CompositeCmd)
        if cmd.op == "gemm" and cmd.b is not None:
-            yield from self._pipeline_gemm(env, pe_txn, cmd)
+            a = cmd.a
            b = cmd.b
            M, K = a.shape[-2], a.shape[-1]
            N = b.shape[-1]
            return generate_gemm_plan(
                M=M, K=K, N=N,
                tile_m=self.TILE_M, tile_k=self.TILE_K, tile_n=self.TILE_N,
                bytes_per_element=bpe,
                A_addr=a.addr, B_addr=b.addr, C_addr=cmd.out_addr,
                pe_prefix=pp,
            )
        else:
-            yield from self._pipeline_math(env, pe_txn, cmd)
+            # Math composite
-
+            a = cmd.a
-    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
+            M = a.shape[-2] if len(a.shape) >= 2 else a.shape[0]
-        """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.
+            N = a.shape[-1] if len(a.shape) >= 2 else 1
-
+            return generate_math_plan(
-        Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
+                M=M, N=N,
-        Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)
+                tile_m=self.TILE_M, tile_n=self.TILE_N,
-        Overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
+                bytes_per_element=bpe,
-        """
+                math_op=cmd.math_op or "identity",
-        from kernbench.common.pe_commands import (
+                src_addr=a.addr, dst_addr=cmd.out_addr,
-            DmaReadCmd,
+                pe_prefix=pp,
            DmaWriteCmd,
            GemmCmd,
            PeInternalTxn as PeTxn,
            TensorHandle,
        )
        pp = self._pe_prefix
        a = cmd.a   # already in TCM
        b = cmd.b   # HBM reference (via tl.ref)
        M, K_a = a.shape[-2], a.shape[-1]
        K_b, N = b.shape[-2], b.shape[-1]
        dtype = a.dtype
        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2
        # Tile counts
        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
        n_tiles = n_tiles_k * n_tiles_n
        prev_compute_done = None
        prev_write_done = None
        total_dma_ns = 0.0
        total_compute_ns = 0.0
        for tile_idx in range(n_tiles):
            tk = tile_idx // n_tiles_n
            tn = tile_idx % n_tiles_n
            k_start = tk * self.TILE_K
            n_start = tn * self.TILE_N
            tile_k = min(self.TILE_K, K_a - k_start)
            tile_n = min(self.TILE_N, N - n_start)
            tile_nbytes = tile_k * tile_n * dtype_bytes
            # --- Stage 1: DMA_READ b_tile from HBM ---
            read_done = env.event()
            b_tile_addr = b.addr + (k_start * N + n_start) * dtype_bytes
            b_tile_handle = TensorHandle(
                id=f"b_tile_{tile_idx}", addr=b_tile_addr,
                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
            )
            read_cmd = DmaReadCmd(handle=b_tile_handle, src_addr=b_tile_addr, nbytes=tile_nbytes)
            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)
            # Wait for previous compute before starting this tile's compute
            if prev_compute_done is not None:
                yield prev_compute_done
            # Wait for this tile's DMA_READ
            yield read_done
            total_dma_ns += env.now - t0
            # --- Stage 2: COMPUTE (GEMM) ---
            compute_done = env.event()
            out_handle = TensorHandle(
                id=f"out_tile_{tile_idx}", addr=0,
                shape=(M, tile_n), dtype=dtype,
                nbytes=M * tile_n * dtype_bytes,
            )
            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
                                  m=M, k=tile_k, n=tile_n)
            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)
            # Wait for previous write (DMA_WRITE serialization)
            if prev_write_done is not None:
                yield prev_write_done
            # Wait for compute of THIS tile
            yield compute_done
            total_compute_ns += env.now - t0
            prev_compute_done = compute_done
            # --- Stage 3: DMA_WRITE out_tile to HBM ---
            write_done = env.event()
            out_tile_pa = cmd.out_addr + n_start * dtype_bytes
            write_nbytes = M * tile_n * dtype_bytes
            write_cmd = DmaWriteCmd(handle=out_handle, dst_addr=out_tile_pa, nbytes=write_nbytes)
            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
            prev_write_done = write_done
        # Wait for final write
        if prev_write_done is not None:
            t0 = env.now
            yield prev_write_done
            total_dma_ns += env.now - t0
        pe_txn.result_data["dma_ns"] = total_dma_ns
        pe_txn.result_data["compute_ns"] = total_compute_ns
        pe_txn.done.succeed()
    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Run a non-GEMM composite as two strictly sequential steps (no tiling).

        A ``MathCmd`` is sent to ``pe_math`` (input and output handle are both
        ``cmd.a``) and awaited; then a ``DmaWriteCmd`` is sent to ``pe_dma``
        and awaited. ``pe_txn.done`` is succeeded after the write completes.
        """
        from kernbench.common.pe_commands import (
            DmaWriteCmd,
            MathCmd,
            PeInternalTxn as PeTxn,
        )
        prefix = self._pe_prefix

        # Step 1: compute on the math engine and wait for it to finish.
        math_finished = env.event()
        math_txn = PeTxn(
            command=MathCmd(
                op=cmd.math_op or "identity",
                inputs=(cmd.a,), out=cmd.a,
            ),
            done=math_finished,
            pe_prefix=prefix,
        )
        yield self.out_ports[f"{prefix}.pe_math"].put(math_txn)
        yield math_finished

        # Step 2: write the result to cmd.out_addr through PE_DMA and wait.
        store_finished = env.event()
        store_txn = PeTxn(
            command=DmaWriteCmd(
                handle=cmd.a, dst_addr=cmd.out_addr, nbytes=cmd.out_nbytes,
            ),
            done=store_finished,
            pe_prefix=prefix,
        )
        yield self.out_ports[f"{prefix}.pe_dma"].put(store_txn)
        yield store_finished

        # Both steps are complete: release the caller.
        pe_txn.done.succeed()
@@ -1,7 +1,18 @@
 """PE_TCM: tightly-coupled memory with BW-based access serialization (ADR-0021).
 Models scratchpad memory inside the PE. Handles both legacy Transaction forwarding
 and TcmRequest from PE_FETCH_STORE for BW-serialized read/write access.
 Two channels (read/write) with independent serialization.
 Ported from pe_accel TcmBlock timing model.
 """
 from __future__ import annotations
 from collections.abc import Generator
-from typing import TYPE_CHECKING
+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
@@ -10,16 +21,62 @@ if TYPE_CHECKING:
    from kernbench.topology.types import Node
-class PeTcmComponent(ComponentBase):
+@dataclass
-    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.
+class TcmRequest:
    """Request to read from or write to TCM (used by PE_FETCH_STORE)."""
-    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
+    direction: str      # "read" or "write"
-    Phase 0: applies overhead_ns and drain_ns at terminal.
+    nbytes: int
    done: simpy.Event
    tag: str = ""
 class PeTcmComponent(ComponentBase):
    """PE_TCM: BW-serialized scratchpad memory (ADR-0021 D1).
    Dual-channel: read and write can proceed in parallel,
    but concurrent reads serialize, concurrent writes serialize.
    BW from topology attrs or pe_template links.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Read per-channel bandwidths from ``node.attrs``; channels are created in ``start``."""
        super().__init__(node, ctx)
        attrs = node.attrs
        # Per-channel bandwidth; each falls back to 512.0 when its attr is absent.
        self._read_bw: float = float(attrs.get("read_bw_gbs", 512.0))
        self._write_bw: float = float(attrs.get("write_bw_gbs", 512.0))
        # The serialization resources need a simpy environment, so they stay
        # None until start() runs.
        self._read_res: simpy.Resource | None = None
        self._write_res: simpy.Resource | None = None
-    def run(self, env, nbytes: int) -> Generator:
+    def start(self, env: simpy.Environment) -> None:
        self._read_res = simpy.Resource(env, capacity=1)
        self._write_res = simpy.Resource(env, capacity=1)
        super().start(env)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the node's ``overhead_ns`` attribute (0.0 if unset); ``nbytes`` is unused."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))
    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox loop: route ``TcmRequest`` to the BW model, anything else to ``_forward_txn``.

        Every message is handled in its own simpy process, so this loop goes
        straight back to the inbox without waiting for the handler.
        """
        while True:
            incoming: Any = yield self._inbox.get()
            handler = (
                self._handle_tcm_request
                if isinstance(incoming, TcmRequest)
                else self._forward_txn
            )
            env.process(handler(env, incoming))
    def _handle_tcm_request(self, env: simpy.Environment, req: TcmRequest) -> Generator:
        """Serve one TCM access: take the channel, wait ``nbytes / bw``, succeed ``req.done``.

        ``direction == "write"`` selects the write channel; every other value
        selects the read channel. The delay is skipped when the bandwidth or
        the byte count is not positive. ``req.done`` is succeeded while the
        channel is still held.
        """
        is_write = req.direction == "write"
        channel = self._write_res if is_write else self._read_res
        bandwidth = self._write_bw if is_write else self._read_bw
        assert channel is not None
        with channel.request() as grant:
            yield grant
            if bandwidth > 0 and req.nbytes > 0:
                # Attr names suggest GB/s, i.e. bytes per ns, so this is in ns.
                yield env.timeout(req.nbytes / bandwidth)
            req.done.succeed()
@@ -0,0 +1,115 @@
 """PE pipeline types for ADR-0021: TileToken, TilePlan, Stage, PipelineContext.
 These types are used by the PE_SCHEDULER and all PE engine components
 for tile-based pipeline execution with self-routing.
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from enum import Enum, auto
 from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
    import simpy
 # ── Stage types ──────────────────────────────────────────────────────
 class StageType(Enum):
    """Kinds of work a single pipeline stage can represent.

    Values are assigned by ``auto()`` in declaration order, starting at 1.
    """
    DMA_READ = auto()    # DMA load (HBM → TCM in the tile plans)
    FETCH = auto()       # TCM → register file
    GEMM = auto()        # MAC compute
    MATH = auto()        # element-wise math
    STORE = auto()       # register file → TCM
    DMA_WRITE = auto()   # DMA store (TCM → HBM in the tile plans)
@dataclass
class Stage:
    """A single step of a tile's execution plan.

    ``params`` defaults to a fresh, empty dict per instance.
    """
    # Kind of work performed by this step.
    stage_type: StageType
    # Topology node ID of the component that runs it (e.g. "sip0.cube0.pe0.pe_dma").
    component: str
    # Step-specific arguments handed to that component.
    params: dict = field(default_factory=dict)
 # ── Plan ─────────────────────────────────────────────────────────────
@dataclass
class TilePlan:
    """Ordered stage sequence that one tile walks through.

    The stages are held in a tuple, so the sequence itself cannot be modified
    in place once the plan is built.
    """
    # Identifier of the tile this plan belongs to.
    tile_id: int
    # Stages in execution order.
    stages: tuple[Stage, ...]
@dataclass
class PipelinePlan:
    """All tile plans produced for one CompositeCmd, plus tile-count metadata."""
    # Per-tile plans, in the order they were generated.
    tiles: list[TilePlan]
    # Tile counts along each dimension, kept for metrics; each defaults to 0.
    m_tiles: int = 0
    k_tiles: int = 0
    n_tiles: int = 0
 # ── Pipeline Context ─────────────────────────────────────────────────
@dataclass
class PipelineContext:
    """Completion tracker for one pipeline (exactly-once contract).

    The last stage of every tile is expected to call ``complete_tile()``
    exactly once. ``done_event.succeed()`` is triggered by the call that
    brings ``completed_tiles`` to ``total_tiles``.
    """
    id: str
    total_tiles: int
    completed_tiles: int = 0
    done_event: Any = None  # simpy.Event

    def complete_tile(self) -> None:
        """Count one finished tile and fire ``done_event`` on the final one."""
        self.completed_tiles += 1
        if self.completed_tiles != self.total_tiles:
            return
        event = self.done_event
        if event is not None:
            event.succeed()
 # ── TileToken ────────────────────────────────────────────────────────
@dataclass
class TileToken:
    """Self-routing token that carries one tile between PE components (ADR-0021 D9).

    Single-owner: only one component is meant to hold the token at a time.
    ``params`` mirrors ``plan.stages[stage_idx].params``, which remains the
    canonical source.
    """
    tile_id: int
    pipeline_ctx: PipelineContext
    plan: TilePlan
    stage_idx: int
    params: dict = field(default_factory=dict)
    data_op: bool = True     # op_log recording target (ADR-0020)

    @property
    def current_stage(self) -> Stage:
        """Stage at ``stage_idx`` in the tile's plan."""
        stages = self.plan.stages
        return stages[self.stage_idx]

    @property
    def has_next_stage(self) -> bool:
        """True while at least one stage follows the current one."""
        remaining = len(self.plan.stages) - self.stage_idx
        return remaining > 1

    def advance(self) -> Stage | None:
        """Step to the following stage and refresh ``params`` from it.

        Returns the new stage, or ``None`` when the plan is exhausted. The
        index is incremented either way; ``params`` is left untouched when
        there is no further stage.
        """
        self.stage_idx += 1
        stages = self.plan.stages
        if self.stage_idx >= len(stages):
            return None
        upcoming = stages[self.stage_idx]
        self.params = upcoming.params
        return upcoming
@@ -0,0 +1,176 @@
 """Tile plan generators for PE pipeline (ADR-0021).
 Generates TilePlan with stage sequences for GEMM and Math operations.
 Ported from pe_accel tiling.py with stage-based plan structure.
 """
 from __future__ import annotations
 from math import ceil
 from kernbench.components.builtin.pe_types import (
    PipelinePlan,
    Stage,
    StageType,
    TilePlan,
 )
 def generate_gemm_plan(
    M: int, K: int, N: int,
    tile_m: int, tile_k: int, tile_n: int,
    bytes_per_element: int,
    A_addr: int, B_addr: int, C_addr: int,
    pe_prefix: str,
 ) -> PipelinePlan:
    """Build the tile plan for a GEMM, visiting tiles in M → N → K order.

    One ``TilePlan`` is produced per (m, n, k) tile with the stage sequence
      DMA_READ(A) → DMA_READ(B) → FETCH → GEMM → STORE
    followed by DMA_WRITE only on the last K-tile of each (m, n).

    Tile addresses are element offsets scaled by ``bytes_per_element``, using
    a row stride of K for A and of N for B and C.

    NOTE(review): byte counts and GEMM dims always use the full
    tile_m/tile_k/tile_n, even when M/K/N are not multiples of the tile
    sizes — confirm that padded edge tiles are intended.

    Args:
        pe_prefix: e.g. "sip0.cube0.pe0" — used to build component IDs.
    """
    elem_bytes = bytes_per_element
    m_count = max(1, ceil(M / tile_m))
    k_count = max(1, ceil(K / tile_k))
    n_count = max(1, ceil(N / tile_n))

    dma_node = f"{pe_prefix}.pe_dma"
    fetch_store_node = f"{pe_prefix}.pe_fetch_store"
    gemm_node = f"{pe_prefix}.pe_gemm"

    # Tile byte counts do not depend on the tile position.
    a_tile_bytes = tile_m * tile_k * elem_bytes
    b_tile_bytes = tile_k * tile_n * elem_bytes
    c_tile_bytes = tile_m * tile_n * elem_bytes
    final_k = k_count - 1

    planned: list[TilePlan] = []
    for mi in range(m_count):
        for ni in range(n_count):
            c_tile_addr = C_addr + (mi * tile_m * N + ni * tile_n) * elem_bytes
            for ki in range(k_count):
                is_final_k = ki == final_k
                a_tile_addr = A_addr + (mi * tile_m * K + ki * tile_k) * elem_bytes
                b_tile_addr = B_addr + (ki * tile_k * N + ni * tile_n) * elem_bytes
                sequence: list[Stage] = [
                    # DMA READ: A tile, then B tile (HBM → TCM)
                    Stage(StageType.DMA_READ, dma_node, {
                        "src_addr": a_tile_addr, "nbytes": a_tile_bytes,
                        "operand": "A", "tile_m": tile_m, "tile_k": tile_k,
                    }),
                    Stage(StageType.DMA_READ, dma_node, {
                        "src_addr": b_tile_addr, "nbytes": b_tile_bytes,
                        "operand": "B", "tile_k": tile_k, "tile_n": tile_n,
                    }),
                    # FETCH: both operands, TCM → register file
                    Stage(StageType.FETCH, fetch_store_node, {
                        "direction": "read",
                        "nbytes": a_tile_bytes + b_tile_bytes,
                    }),
                    # GEMM: MAC compute
                    Stage(StageType.GEMM, gemm_node, {
                        "m": tile_m, "k": tile_k, "n": tile_n,
                        "is_last_k": is_final_k,
                    }),
                    # STORE: register file → TCM
                    Stage(StageType.STORE, fetch_store_node, {
                        "direction": "write",
                        "nbytes": c_tile_bytes,
                    }),
                ]
                if is_final_k:
                    # DMA WRITE: TCM → HBM, only on the last K-tile of (m, n)
                    sequence.append(Stage(StageType.DMA_WRITE, dma_node, {
                        "dst_addr": c_tile_addr, "nbytes": c_tile_bytes,
                    }))
                # Tile IDs are consecutive, so the next ID is the current count.
                planned.append(TilePlan(tile_id=len(planned), stages=tuple(sequence)))
    return PipelinePlan(
        tiles=planned, m_tiles=m_count, k_tiles=k_count, n_tiles=n_count,
    )
 def generate_math_plan(
    M: int, N: int,
    tile_m: int, tile_n: int,
    bytes_per_element: int,
    math_op: str,
    src_addr: int, dst_addr: int,
    pe_prefix: str,
 ) -> PipelinePlan:
    """Build the tile plan for an element-wise math op over an M × N operand.

    Every tile runs DMA_READ → FETCH → MATH → STORE → DMA_WRITE. Source and
    destination tiles share the same byte offset (row stride N) from
    ``src_addr`` and ``dst_addr`` respectively.

    NOTE(review): every tile is sized as a full tile_m × tile_n tile, even
    when M or N is not a multiple of the tile size — confirm that padded edge
    tiles are intended.
    """
    elem_bytes = bytes_per_element
    m_count = max(1, ceil(M / tile_m))
    n_count = max(1, ceil(N / tile_n))

    dma_node = f"{pe_prefix}.pe_dma"
    fetch_store_node = f"{pe_prefix}.pe_fetch_store"
    math_node = f"{pe_prefix}.pe_math"

    # Size figures are the same for every tile.
    elems_per_tile = tile_m * tile_n
    bytes_per_tile = tile_m * tile_n * elem_bytes

    planned: list[TilePlan] = []
    for mi in range(m_count):
        for ni in range(n_count):
            tile_offset = (mi * tile_m * N + ni * tile_n) * elem_bytes
            sequence = (
                Stage(StageType.DMA_READ, dma_node, {
                    "src_addr": src_addr + tile_offset, "nbytes": bytes_per_tile,
                }),
                Stage(StageType.FETCH, fetch_store_node, {
                    "direction": "read", "nbytes": bytes_per_tile,
                }),
                Stage(StageType.MATH, math_node, {
                    "op": math_op, "num_elements": elems_per_tile,
                }),
                Stage(StageType.STORE, fetch_store_node, {
                    "direction": "write", "nbytes": bytes_per_tile,
                }),
                Stage(StageType.DMA_WRITE, dma_node, {
                    "dst_addr": dst_addr + tile_offset, "nbytes": bytes_per_tile,
                }),
            )
            # Tile IDs are consecutive, so the next ID is the current count.
            planned.append(TilePlan(tile_id=len(planned), stages=sequence))
    return PipelinePlan(tiles=planned, m_tiles=m_count, n_tiles=n_count)
@@ -24,6 +24,8 @@ class ComponentContext:
    ns_per_mm: float  # wire propagation constant (from topology spec)
    edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
    spec: dict = field(default_factory=dict)  # topology spec (cube layout, PE count, etc.)
    memory_store: Any = None  # MemoryStore for Phase 1 data-aware execution (ADR-0020)
    op_logger: Any = None     # OpLogger for Phase 1 op recording (ADR-0020)
    def get_shared_resource(
        self, env: simpy.Environment, key: str, capacity: int = 1,
@@ -0,0 +1,2 @@
 # Legacy component backups — not actively used.
 # Kept for reference during ADR-0021 migration.
@@ -0,0 +1,34 @@
 """Concrete component implementations.
 Loaded from components.yaml via ComponentRegistry.load_components_yaml().
 Manual imports are no longer needed — add new impls to components.yaml.
 Classes are still importable from this package via lazy __getattr__.
 """
 from kernbench.components.base import ComponentRegistry
 ComponentRegistry.load_components_yaml()
 # Lazy re-export: allow `from kernbench.components.builtin import FooComponent`
 # without eagerly importing every module.
 _CLASS_MAP: dict[str, str] = {}  # ClassName → "module.path:ClassName"
 def _build_class_map() -> None:
    """Fill ``_CLASS_MAP`` from the registry's lazy class paths; no-op once populated."""
    if not _CLASS_MAP:
        for dotted in ComponentRegistry._lazy.values():
            # Paths look like "module.path:ClassName"; only the class name is the key.
            _module, cls_name = dotted.rsplit(":", 1)
            _CLASS_MAP[cls_name] = dotted
 def __getattr__(name: str):
    """Resolve a component class lazily as a package attribute (PEP 562).

    Looks ``name`` up in ``_CLASS_MAP`` (built on first use), imports the
    module that defines it and returns the class object.

    Args:
        name: attribute requested on this package.

    Returns:
        The class named ``name`` from its defining module.

    Raises:
        AttributeError: ``name`` is not a known component class.
    """
    _build_class_map()
    class_path = _CLASS_MAP.get(name)
    if class_path is None:
        # This must be AttributeError, not ImportError. hasattr(),
        # getattr(..., default) and the import system's from-list handling
        # only treat AttributeError as "missing"; any other exception
        # propagates and breaks e.g. `from <package> import <submodule>`.
        # `from <package> import Missing` still ends in ImportError, raised
        # by the import machinery itself.
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    import importlib
    module_path, class_name = class_path.rsplit(":", 1)
    return getattr(importlib.import_module(module_path), class_name)
@@ -0,0 +1,27 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class TransitComponent(ComponentBase):
    """Pass-through component used for NOC, UCIe and XBAR nodes.

    Its only cost is the ``overhead_ns`` processing delay read from
    ``node.attrs``; forwarding to the next hop is done by the inherited
    ``_forward_txn()``.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Delegate construction to ``ComponentBase`` unchanged."""
        super().__init__(node, ctx)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` (0.0 if the attr is unset); ``nbytes`` is unused."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))
@@ -0,0 +1,129 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class HbmCtrlComponent(ComponentBase):
    """HBM controller: terminal component that models HBM access latency.
    Dual-channel model: separate read and write resources, allowing
    concurrent read/write like PE_DMA. Both channels take their capacity from
    the node attr ``capacity`` (default 1), so with the default multiple reads
    or multiple writes still serialize within their respective channel.
    On completion the transaction is either finished directly (``done``
    succeeded) or answered on the reverse path so that response latency is
    modeled through the fabric; see ``_send_response`` for the exact routing.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the base component; channels are created in ``start``."""
        super().__init__(node, ctx)
        # Channel resources need a simpy environment, so they stay None until start().
        self._read: simpy.Resource | None = None
        self._write: simpy.Resource | None = None
    def start(self, env: simpy.Environment) -> None:
        """Create the read and write channels, then start the base component."""
        # A single "capacity" attr (default 1) sizes both channels.
        capacity = int(self.node.attrs.get("capacity", 1))
        self._read = simpy.Resource(env, capacity=capacity)
        self._write = simpy.Resource(env, capacity=capacity)
        super().start(env)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait the node's ``overhead_ns`` attr (0.0 if unset); ``nbytes`` is unused."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Select channel based on request type: write requests → write, else → read.

        A request counts as a write when it is a ``MemoryWriteMsg`` or a
        ``PeDmaMsg`` whose ``is_write`` is truthy.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
        assert self._read is not None and self._write is not None
        req = txn.request
        if isinstance(req, MemoryWriteMsg):
            return self._write
        if isinstance(req, PeDmaMsg) and req.is_write:
            return self._write
        return self._read
    def _worker(self, env: simpy.Environment) -> Generator:
        """Dispatch each incoming txn to a concurrent process for channel-level parallelism."""
        while True:
            txn: Any = yield self._inbox.get()
            # Spawned, not awaited: the inbox loop never blocks on a channel.
            env.process(self._handle_txn(env, txn))
    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Acquire channel, run, apply drain, send response.

        The overhead and the optional ``drain_ns`` delay are spent while the
        channel is held; the response is sent after the channel is released.
        """
        channel = self._select_channel(txn)
        with channel.request() as req:
            yield req
            yield from self.run(env, txn.nbytes)
            # drain_ns is optional on the transaction; absent means no drain.
            drain = getattr(txn, "drain_ns", 0.0)
            if drain > 0:
                yield env.timeout(drain)
        yield from self._send_response(env, txn)
    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Route completion based on path type.
        - PeDmaMsg: send a zero-byte response Transaction (``is_response=True``,
          original done event) back on the reverse path; if the path has fewer
          than two nodes, succeed done directly.
        - Bypass path (no m_cpu): MemoryWrite succeeds done; MemoryRead sends
          data back on reverse path with original done event.
        - M_CPU DMA path: send ResponseMsg for m_cpu/io_cpu aggregation.
        """
        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg
        if isinstance(txn.request, PeDmaMsg):
            reverse_path = list(reversed(txn.path))
            if len(reverse_path) >= 2:
                resp_txn = Transaction(
                    request=txn.request, path=reverse_path, step=0,
                    nbytes=0, done=txn.done, is_response=True,
                )
                # reverse_path[0] is this node's end of the path; hand off to the next hop.
                yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
                return
            txn.done.succeed()
            return
        # Bypass path: no m_cpu in the transaction path
        is_bypass = not any("m_cpu" in n for n in txn.path)
        if is_bypass:
            if isinstance(txn.request, MemoryReadMsg):
                # D2H: send data back on reverse path to pcie_ep
                reverse_path = list(reversed(txn.path))
                if len(reverse_path) >= 2:
                    # Carries the request's byte count and is not flagged as a response.
                    resp_txn = Transaction(
                        request=txn.request, path=reverse_path, step=0,
                        nbytes=txn.request.nbytes, done=txn.done,
                    )
                    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
                    return
            # MemoryWrite bypass or short path: done
            txn.done.succeed()
            return
        # M_CPU DMA path: send ResponseMsg for aggregation
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) >= 2 and self.ctx:
            from kernbench.runtime_api.kernel import ResponseMsg
            # Assumes a node id shaped like "<sip>.cube<N>...." — TODO confirm.
            parts = self.node.id.split(".")
            cube_id = int(parts[1].replace("cube", ""))
            pe_id = 0  # single hbm_ctrl, PE info from request
            resp_msg = ResponseMsg(
                correlation_id=txn.request.correlation_id,
                request_id=txn.request.request_id,
                src_cube=cube_id, src_pe=pe_id, success=True,
            )
            # The response gets its own fresh done event; txn.done is not
            # touched here (presumably completed by the aggregator — verify).
            resp_txn = Transaction(
                request=resp_msg, path=reverse_path, step=0,
                nbytes=0, done=env.event(), is_response=True,
            )
            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
        else:
            txn.done.succeed()
@@ -0,0 +1,157 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class IoCpuComponent(ComponentBase):
    """IO_CPU component: multi-cube fan-out with response aggregation.
    Forward path:
      1. Applies overhead_ns processing overhead.
      2. Resolves target cube(s) from request.target_cubes.
      3. Fans out sub-Transactions to each target cube's M_CPU.
    Response path:
      Collects ResponseMsg from each M_CPU. When all cube responses are
      received, succeeds the parent txn.done.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the base component and an empty aggregation table."""
        super().__init__(node, ctx)
        # Fan-out bookkeeping, keyed by request_id:
        #   (responses expected, responses received, parent txn completion event)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = dict()
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
            else:
                yield from self.run(env, txn.nbytes)
                env.process(self._dispatch_to_m_cpus(env, txn))
    def _collect_response(self, resp_txn: Any) -> None:
        """Receive a cube response and increment the aggregation counter."""
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, parent_done = self._pending[key]
        received += 1
        if received >= expected:
            parent_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, parent_done)
    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out sub-Transactions to the M_CPU of every target cube.

        The parent ``txn.done`` is registered in ``self._pending`` and is
        triggered later by ``_collect_response`` once every dispatched cube
        has answered; this method itself does not wait for the responses.

        Routes are resolved for all targets *before* the aggregation entry is
        created, so the expected-response count equals the number of
        sub-transactions actually sent. Counting unroutable targets would
        leave the parent waiting for responses that can never arrive.

        If target resolution fails, or no target is routable, the parent
        transaction is completed immediately (best effort).
        """
        request = txn.request
        try:
            cube_targets = self._resolve_cube_targets(request)
        except Exception:
            txn.done.succeed()
            return

        # Phase 1: resolve routes (no yields), skipping unroutable targets.
        routes: list[Any] = []
        for sip, cube in cube_targets:
            try:
                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
            except Exception:
                continue
            if len(path) >= 2:
                routes.append(path)

        if not routes:
            txn.done.succeed()
            return

        # Phase 2: register aggregation before the first put, so a response
        # that arrives while later sub-transactions are still being sent is
        # counted rather than dropped.
        self._pending[request.request_id] = (len(routes), 0, txn.done)

        # Phase 3: send one sub-transaction per routable cube.
        for path in routes:
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                result_data=txn.result_data,
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())
    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
        """Work out which ``(sip, cube)`` pairs a request must be sent to.

        * Memory write / read: the SIP comes from the message's ``dst_*`` /
          ``src_*`` fields. With ``target_cubes == "all"`` a single cube is
          derived from the physical address (falling back to the message's
          cube field); otherwise every listed cube is used.
        * Kernel launch: explicit cubes are paired with this IO_CPU's SIP.
          With ``"all"`` the cubes come from the tensor arguments' shards
          that live on this SIP, de-duplicated in first-seen order.
        * MMU map / unmap: explicit cubes, or every cube of this SIP
          (``system.sips.cubes_per_sip`` from the spec, default 16).
        * Any other request type yields an empty list.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, MmuMapMsg, MmuUnmapMsg,
        )

        target_cubes = getattr(request, "target_cubes", "all")

        if isinstance(request, (MemoryWriteMsg, MemoryReadMsg)):
            # Writes are addressed by their destination, reads by their source.
            side = "dst" if isinstance(request, MemoryWriteMsg) else "src"
            sip = getattr(request, f"{side}_sip")
            if target_cubes == "all":
                cube = self._cube_from_pa(
                    getattr(request, f"{side}_pa"),
                    fallback=getattr(request, f"{side}_cube"),
                )
                return [(sip, cube)]
            return [(sip, cube) for cube in target_cubes]

        if isinstance(request, KernelLaunchMsg):
            my_sip = self._my_sip()
            if target_cubes != "all":
                return [(my_sip, cube) for cube in target_cubes]
            local_pairs = (
                (shard.sip, shard.cube)
                for arg in request.args
                if arg.arg_kind == "tensor"
                for shard in arg.shards
                if shard.sip == my_sip
            )
            # dict.fromkeys drops duplicates while keeping first-seen order.
            return list(dict.fromkeys(local_pairs))

        if isinstance(request, (MmuMapMsg, MmuUnmapMsg)):
            my_sip = self._my_sip()
            if target_cubes == "all":
                n_cubes = 16
                spec = self.ctx and self.ctx.spec
                if spec:
                    n_cubes = spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 16)
                return [(my_sip, cube) for cube in range(n_cubes)]
            return [(my_sip, cube) for cube in target_cubes]

        return []
    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
        """Return the cube id encoded in ``pa_val``.

        Any exception raised while decoding the address makes the method
        return ``fallback`` instead.
        """
        from kernbench.policy.address.phyaddr import PhysAddr

        try:
            decoded = PhysAddr.decode(pa_val)
            cube = decoded.cube_id
        except Exception:
            cube = fallback
        return cube
    def _my_sip(self) -> int:
        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' → 0)."""
        return int(self.node.id.split(".")[0].replace("sip", ""))
@@ -0,0 +1,327 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class MCpuComponent(ComponentBase):
    """M_CPU component: multi-PE DMA fan-out with response aggregation.
    Forward path (ADR-0015 D5):
      When a forward Transaction arrives at m_cpu (terminal hop), M_CPU fans out
      DMA sub-Transactions to target PEs' HBM slices. target_pe on the request
      controls fan-out: int → single PE, "all" → all PEs in the cube.
    Response path:
      ResponseMsg from each hbm_ctrl arrives back at m_cpu. Once all PE responses
      are collected, m_cpu sends an aggregate ResponseMsg on the reverse command
      path back to io_cpu.
    Transit:
      When m_cpu is NOT the terminal hop (transit or response relay), the
      Transaction is forwarded normally to the next hop.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the base component and empty fan-out bookkeeping."""
        super().__init__(node, ctx)
        # request_id → (responses expected, responses received, all-done event)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = dict()
        # request_id → parent transaction whose fan-out is still in flight
        self._parent_txns: dict[str, Any] = dict()
        # DMA engine resources (ADR-0015 D5, ADR-0014 D4); created in start()
        # with capacity=1 each, so they stay None until then.
        self._dma_write: simpy.Resource | None = None
        self._dma_read: simpy.Resource | None = None
    def start(self, env: simpy.Environment) -> None:
        """Create the write and read DMA resources, then start the base component."""
        write_engine = simpy.Resource(env, capacity=1)
        read_engine = simpy.Resource(env, capacity=1)
        self._dma_write, self._dma_read = write_engine, read_engine
        super().start(env)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Serve the inbox forever.

        * Response transactions are counted by ``_collect_response``.
        * Forward transactions pay the processing overhead, then:
          - if another hop remains, they are forwarded unchanged;
          - if this node is the last hop and both a context and a request
            are present, a fan-out process is spawned according to the
            request type (kernel launch, MMU map/unmap, otherwise DMA);
          - otherwise the transaction is completed on the spot.
        """
        from kernbench.runtime_api.kernel import KernelLaunchMsg, MmuMapMsg, MmuUnmapMsg

        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
                continue

            yield from self.run(env, txn.nbytes)
            onward = txn.next_hop
            if onward:
                yield self.out_ports[onward].put(txn.advance())
                continue

            if self.ctx is None or txn.request is None:
                txn.done.succeed()
                continue

            if isinstance(txn.request, KernelLaunchMsg):
                fanout = self._kernel_launch_fanout
            elif isinstance(txn.request, (MmuMapMsg, MmuUnmapMsg)):
                fanout = self._mmu_msg_fanout
            else:
                fanout = self._dma_fanout
            env.process(fanout(env, txn))
    def _collect_response(self, resp_txn: Any) -> None:
        """Receive a PE response and increment the aggregation counter."""
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, all_done = self._pending[key]
        received += 1
        if received >= expected:
            all_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, all_done)
    def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out DMA sub-Transactions, wait for responses, then reply upstream.

        Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014 D4),
        so multi-destination fan-out is serialized through the DMA engine.

        Routes and drain times are resolved for every destination *before*
        the aggregation entry is registered, so the expected-response count
        equals the number of sub-transactions actually sent. Previously an
        unroutable destination was skipped but still counted, which left this
        process blocked forever and leaked its bookkeeping entries.

        When nothing can be dispatched the transaction is completed
        immediately, matching the existing "no destinations" behaviour. On
        completion the largest drain time is stored in
        ``txn.result_data["xfer_ns"]`` and an aggregate ``ResponseMsg`` is sent
        on the reversed command path.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg

        request = txn.request
        target_pe = getattr(request, "target_pe", "all")
        dst_nodes = self._resolve_dma_destinations(request, target_pe)

        # Phase 1: resolve routes (no yields), skipping unroutable destinations.
        planned: list[tuple[Any, float]] = []
        for dst_node in dst_nodes:
            try:
                dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
            except Exception:
                continue
            if len(dma_path) < 2:
                continue
            planned.append((dma_path, self.ctx.compute_drain_ns(dma_path, txn.nbytes)))

        if not planned:
            txn.done.succeed()
            return

        # Phase 2: register aggregation with the real dispatch count, before
        # the first put so early responses are counted.
        request_id = request.request_id
        all_done = env.event()
        self._pending[request_id] = (len(planned), 0, all_done)
        self._parent_txns[request_id] = txn

        # Writes and reads use separate DMA engines.
        dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read

        # Phase 3: issue the sub-transactions one at a time through the engine.
        max_drain_ns = 0.0
        for dma_path, drain_ns in planned:
            max_drain_ns = max(max_drain_ns, drain_ns)
            sub_txn = Transaction(
                request=request, path=dma_path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                drain_ns=drain_ns,
            )
            with dma_res.request() as req:
                yield req
                yield self.out_ports[dma_path[1]].put(sub_txn.advance())

        # Wait for every dispatched destination to answer.
        yield all_done
        txn.result_data["xfer_ns"] = max_drain_ns
        self._parent_txns.pop(request_id, None)

        # Send aggregate response on reverse command path
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) < 2:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
        resp_msg = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        resp_txn = Transaction(
            request=resp_msg, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
    def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).

        Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
        PE_CPU responses arrive through ``_collect_response``; once all have
        been counted, per-PE metrics are aggregated (maximum across PEs) and
        an aggregate ResponseMsg is sent back on the reversed command path.

        Routes are resolved first and the aggregation entry is registered
        *before* any sub-transaction is sent. Dispatch yields on every put, so
        registering afterwards allowed an early response to be dropped as
        "unknown", after which the wait could never finish.

        If no PE is targeted or routable, the transaction is completed
        immediately.
        """
        request = txn.request
        target_pe = getattr(request, "target_pe", "all")
        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
        pe_ids = self._resolve_pe_ids(target_pe)

        # Phase 1: resolve routes (no yields), skipping unroutable PEs.
        routes: list[Any] = []
        for pe_id in pe_ids:
            pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
            try:
                path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
            except Exception:
                continue
            if len(path) >= 2:
                routes.append(path)

        if not routes:
            txn.done.succeed()
            return

        # Phase 2: register aggregation before the first put.
        request_id = request.request_id
        all_done = env.event()
        self._pending[request_id] = (len(routes), 0, all_done)
        self._parent_txns[request_id] = txn

        # Phase 3: dispatch one sub-transaction per routable PE_CPU.
        sub_txns: list[Transaction] = []
        for path in routes:
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=0, done=env.event(),
            )
            sub_txns.append(sub_txn)
            yield self.out_ports[path[1]].put(sub_txn.advance())

        # Wait for all PE_CPU responses via NOC
        yield all_done
        self._parent_txns.pop(request_id, None)

        # Aggregate PE-internal metrics (max across PEs); a missing metric
        # counts as 0.0.
        for metric in ("pe_exec_ns", "dma_ns", "compute_ns"):
            txn.result_data[metric] = max(
                st.result_data.get(metric, 0.0) for st in sub_txns
            )

        # Send aggregate response on reverse command path back to IO_CPU
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) < 2:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
        resp_msg = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        resp_txn = Transaction(
            request=resp_msg, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
    def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
        """Return list of HBM destination node_ids for DMA fan-out.
        With single hbm_ctrl per cube (ADR-0019), always returns one node.
        PA-based resolution still used for cross-cube routing.
        """
        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
        # PA-based resolution: extract actual target from physical address
        pa_val = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None)
        if pa_val is not None:
            from kernbench.policy.address.phyaddr import PhysAddr
            try:
                pa = PhysAddr.decode(pa_val)
                return [self.ctx.resolver.resolve(pa)]
            except Exception:
                pass
        # Default: single hbm_ctrl in local cube
        return [f"{cube_prefix}.hbm_ctrl"]
    def _mmu_msg_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out an MmuMapMsg/MmuUnmapMsg to the target PE_MMU(s) via NOC.

        One sub-transaction is sent to every routable ``pe_mmu`` node of the
        local cube; each carries its own completion event, and this process
        waits on all of them. An aggregate ResponseMsg is then sent on the
        reversed path, or ``txn.done`` is triggered directly when that path
        is too short. With no target PEs the transaction completes
        immediately.
        """
        request = txn.request
        pe_ids = self._resolve_pe_ids(getattr(request, "target_pe", "all"))
        if not pe_ids:
            txn.done.succeed()
            return

        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"

        # Dispatch phase: unroutable PEs are skipped.
        completions: list[simpy.Event] = []
        for pe_id in pe_ids:
            try:
                path = self.ctx.router.find_node_path(
                    self.node.id, f"{cube_prefix}.pe{pe_id}.pe_mmu",
                )
            except Exception:
                continue
            if len(path) < 2:
                continue
            completion = env.event()
            mmu_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=0, done=completion,
            )
            yield self.out_ports[path[1]].put(mmu_txn.advance())
            completions.append(completion)

        # Wait phase: every dispatched PE_MMU must finish.
        for completion in completions:
            yield completion

        # Reply phase.
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) < 2:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        node_parts = self.node.id.split(".")
        reply = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request.request_id,
            src_cube=int(node_parts[1].replace("cube", "")), src_pe=-1, success=True,
        )
        reply_txn = Transaction(
            request=reply, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(reply_txn.advance())
    def _resolve_pe_ids(self, target_pe: int | tuple | str) -> list[int]:
        """Return list of PE IDs to fan out to (used by kernel launch fan-out)."""
        if isinstance(target_pe, int):
            return [target_pe]
        if isinstance(target_pe, tuple):
            return list(target_pe)
        # "all": all PEs in local cube
        n_slices = 8
        if self.ctx and self.ctx.spec:
            mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
            n_slices = mm.get("hbm_slices_per_cube", 8)
        return list(range(n_slices))
@@ -0,0 +1,27 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PcieEpComponent(ComponentBase):
    """PCIe endpoint: protocol processing overhead before forwarding.
    Applies overhead_ns (from node.attrs) for PCIe protocol handling,
    then forwards via inherited _forward_txn().
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Delegate construction to the base component; no extra state is added."""
        super().__init__(node, ctx)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
@@ -0,0 +1,214 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeCpuComponent(ComponentBase):
    """PE_CPU: kernel execution controller (Stage 2).
    Two-phase kernel execution (ADR-0014 D1):
      Phase 1 (compile): look up kernel from registry, run it with TLContext
                         to generate a PeCommand list.
      Phase 2 (replay):  iterate commands, dispatch to PE_SCHEDULER via
                         PeInternalTxn, wait for blocking commands.
    Non-kernel Transactions are forwarded normally.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the component and derive its PE / SIP / cube indices.

        The indices are parsed from the dotted node id (for example
        ``sip0.cube0.pe0.<name>``). Each one independently falls back to 0
        when its segment is missing or not numeric.
        """
        super().__init__(node, ctx)

        def _index_or_zero(extract) -> int:
            # Run the extraction lazily so that a missing segment
            # (IndexError) is handled the same way as a bad number.
            try:
                return int(extract())
            except (IndexError, ValueError):
                return 0

        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"
        self._pe_idx = _index_or_zero(lambda: self._pe_prefix.rsplit("pe", 1)[1])

        # SIP and cube indices are used for multi-SIP/cube shard matching.
        parts = node.id.split(".")
        self._sip_idx = _index_or_zero(lambda: parts[0].replace("sip", ""))
        self._cube_idx = _index_or_zero(lambda: parts[1].replace("cube", ""))
    def _find_shard(self, shards: tuple) -> Any:
        """Find shard matching this PE's (sip, cube, pe). Fallback to positional index."""
        for s in shards:
            if s.sip == self._sip_idx and s.cube == self._cube_idx and s.pe == self._pe_idx:
                return s
        return shards[min(self._pe_idx, len(shards) - 1)]
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Serve the inbox forever.

        Messages that carry a ``KernelLaunchMsg`` request are executed by
        ``_execute_kernel``; everything else goes to ``_forward_txn``.
        """
        while True:
            txn: Any = yield self._inbox.get()
            from kernbench.runtime_api.kernel import KernelLaunchMsg
            is_launch = isinstance(getattr(txn, "request", None), KernelLaunchMsg)
            handler = self._execute_kernel if is_launch else self._forward_txn
            yield from handler(env, txn)
    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
        """Run the kernel named by a launch request and report timing metrics.

        Greenlet mode (ADR-0020) is used when the context exposes a
        ``memory_store``; otherwise the legacy command-list replay runs.
        Afterwards ``txn.result_data`` receives ``pe_exec_ns`` (simulated time
        spent in execution) plus ``dma_ns`` and ``compute_ns`` summed over the
        per-command results, and a response is sent on the reverse path.
        """
        from kernbench.triton_emu.registry import get_kernel

        request = txn.request
        yield from self.run(env, 0)

        kernel_fn = get_kernel(request.kernel_ref.name)
        num_programs = self._derive_num_programs(request)
        kernel_args = self._unpack_kernel_args(request)
        started_at = env.now
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"

        # Pick the execution mode from the presence of a memory store.
        store = None
        if self.ctx:
            store = getattr(self.ctx, "memory_store", None)
        if store is None:
            per_command = yield from self._execute_legacy(
                env, kernel_fn, kernel_args, num_programs, scheduler_id,
            )
        else:
            per_command = yield from self._execute_greenlet(
                env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
            )

        # PE-internal execution time and summed per-command metrics.
        txn.result_data["pe_exec_ns"] = env.now - started_at
        dma_total = 0.0
        compute_total = 0.0
        for entry in per_command:
            dma_total += entry.get("dma_ns", 0.0)
            compute_total += entry.get("compute_ns", 0.0)
        txn.result_data["dma_ns"] = dma_total
        txn.result_data["compute_ns"] = compute_total

        yield from self._send_response(env, txn, request)
    def _derive_num_programs(self, request: Any) -> int:
        num_programs = 1
        for arg in request.args:
            if arg.arg_kind == "tensor":
                cube_pe_count = sum(
                    1 for s in arg.shards
                    if s.sip == self._sip_idx and s.cube == self._cube_idx
                )
                if cube_pe_count > num_programs:
                    num_programs = cube_pe_count
        return num_programs
    def _unpack_kernel_args(self, request: Any) -> list:
        kernel_args: list = []
        for arg in request.args:
            if arg.arg_kind == "tensor":
                if arg.va_base:
                    kernel_args.append(arg.va_base)
                else:
                    shard = self._find_shard(arg.shards)
                    kernel_args.append(shard.pa)
            elif arg.arg_kind == "scalar":
                kernel_args.append(arg.value)
        return kernel_args
    def _execute_greenlet(
        self, env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
    ) -> Generator:
        """Greenlet-based execution (ADR-0020 D3) through a ``KernelRunner``.

        Returns the runner's ``_composite_results`` attribute, or an empty
        list when the runner does not define one.
        """
        from kernbench.triton_emu.kernel_runner import KernelRunner

        kernel_runner = KernelRunner(
            pe_prefix=self._pe_prefix,
            pe_idx=self._pe_idx,
            sip_idx=self._sip_idx,
            cube_idx=self._cube_idx,
            scheduler_id=scheduler_id,
            out_ports=self.out_ports,
            store=store,
        )
        yield from kernel_runner.run(env, kernel_fn, kernel_args, num_programs)
        results = getattr(kernel_runner, "_composite_results", [])
        return results
    def _execute_legacy(
        self, env, kernel_fn, kernel_args, num_programs, scheduler_id,
    ) -> Generator:
        """Legacy Phase 0 + replay: generate command list, then dispatch.

        Phase 0 runs the kernel against a ``TLContext`` to record a command
        list. Replay then walks that list in order:

        * ``PeCpuOverheadCmd`` — wait ``cmd.cycles`` via ``env.timeout``.
        * ``WaitCmd`` — with a handle, wait on the matching outstanding
          composite (if any); without one, wait on every outstanding
          composite and clear the table.
        * ``CompositeCmd`` — send to the scheduler port and record its done
          event under ``cmd.completion.id`` without waiting.
        * any other command — send to the scheduler port and wait for it.

        Composites still outstanding at the end are awaited before returning.

        Returns:
            The ``result_data`` objects of every dispatched composite, in
            dispatch order.
        """
        from kernbench.common.pe_commands import (
            CompositeCmd, PeCpuOverheadCmd, PeInternalTxn, WaitCmd,
        )
        from kernbench.triton_emu.tl_context import TLContext, run_kernel
        # Phase 0: run the kernel once to record its commands.
        tl = TLContext(pe_id=self._pe_idx, num_programs=num_programs, dispatch_cycles=0)
        run_kernel(kernel_fn, tl, *kernel_args)
        commands = tl.commands
        # Outstanding composites: completion-handle id → done event.
        pending: dict[str, simpy.Event] = {}
        composite_results: list[dict] = []
        for cmd in commands:
            if isinstance(cmd, PeCpuOverheadCmd):
                yield env.timeout(cmd.cycles)
            elif isinstance(cmd, WaitCmd):
                if cmd.handle is not None:
                    # Targeted wait; an unknown handle is silently skipped.
                    evt = pending.pop(cmd.handle.id, None)
                    if evt:
                        yield evt
                else:
                    # Barrier: wait for everything issued so far.
                    for evt in pending.values():
                        yield evt
                    pending.clear()
            elif isinstance(cmd, CompositeCmd):
                done_evt = env.event()
                pe_txn = PeInternalTxn(
                    command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
                )
                # Keep a reference to result_data now; it is read by the
                # caller after execution finishes.
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                pending[cmd.completion.id] = done_evt
            else:
                # Blocking command: issue it and wait for completion.
                done_evt = env.event()
                pe_txn = PeInternalTxn(
                    command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
                )
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield done_evt
        # Drain composites that were never explicitly waited on.
        for evt in pending.values():
            yield evt
        return composite_results
    def _send_response(self, env, txn, request) -> Generator:
        """Send a ``ResponseMsg`` for ``request`` back along the reversed path.

        When the reversed path has fewer than two nodes there is no next hop
        to send to, so ``txn.done`` is triggered directly instead.
        """
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) < 2:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        reply = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request.request_id,
            src_cube=self._cube_idx, src_pe=self._pe_idx,
            success=True,
        )
        reply_txn = Transaction(
            request=reply, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(reply_txn.advance())
@@ -0,0 +1,138 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeDmaComponent(PeEngineBase):
    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.
    Each channel has capacity=1 (ADR-0014 D4):
      - DMA_READ and DMA_WRITE may execute concurrently.
      - Multiple READs cannot overlap; multiple WRITEs cannot overlap.
    Handles two message types:
      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the engine with no DMA channels and no MMU attached."""
        super().__init__(node, ctx)
        # Both channels stay unset until init_resources() creates them.
        self._dma_read: simpy.Resource | None = None
        self._dma_write: simpy.Resource | None = None
        # PeMMU instance, set by engine wiring.
        self._mmu = None
    def init_resources(self, env: simpy.Environment) -> None:
        """Create the READ and WRITE channel resources, each with capacity 1."""
        read_channel = simpy.Resource(env, capacity=1)
        write_channel = simpy.Resource(env, capacity=1)
        self._dma_read, self._dma_write = read_channel, write_channel
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        yield env.timeout(0)
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Handle PE-internal DMA command: resolve PA → HBM path → transfer.

        Steps:
          1. Pick the channel and address from the command type
             (``DmaReadCmd`` → read channel / ``src_addr``,
             ``DmaWriteCmd`` → write channel / ``dst_addr``). Any other
             command is completed immediately.
          2. Translate the address through the MMU when one is attached,
             charging ``overhead_ns`` on a successful translation. On a
             ``PageFault``, or with no MMU, the raw address is used as the
             physical address.
          3. Resolve the destination node, path and drain time.
          4. Hold the channel only while issuing the sub-transaction, then
             wait for its completion and finish ``pe_txn``.

        A resolved path with fewer than two nodes has no next hop, so nothing
        can be sent. The sub-completion is then triggered locally; otherwise
        the final wait would block forever.
        """
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg
        cmd = pe_txn.command
        assert self._dma_read is not None and self._dma_write is not None
        # Determine direction and target address (VA → PA via MMU)
        if isinstance(cmd, DmaReadCmd):
            dma_res = self._dma_read
            raw_addr = cmd.src_addr
            is_write = False
        elif isinstance(cmd, DmaWriteCmd):
            dma_res = self._dma_write
            raw_addr = cmd.dst_addr
            is_write = True
        else:
            pe_txn.done.succeed()
            return
        # Translate VA → PA via MMU (if available), then resolve HBM node
        # If MMU has no mapping for this address (PageFault), treat as PA directly
        # (backward-compatible with PA-only mode)
        if self._mmu is not None:
            from kernbench.policy.address.pe_mmu import PageFault
            try:
                target_pa = self._mmu.translate(raw_addr)
                if self._mmu.overhead_ns > 0:
                    yield env.timeout(self._mmu.overhead_ns)
            except PageFault:
                target_pa = raw_addr
        else:
            target_pa = raw_addr  # fallback: treat as PA directly
        pa = PhysAddr.decode(target_pa)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
        # Acquire DMA channel (command issue serialization)
        with dma_res.request() as req:
            yield req
            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
            sub_done = env.event()
            sub_request = PeDmaMsg(
                correlation_id="pe_internal",
                request_id=f"dma_{id(pe_txn)}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_pa, nbytes=cmd.nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )
            # Send to next hop (path[0] is pe_dma itself, path[1] is router)
            if len(path) > 1:
                yield self.out_ports[path[1]].put(sub_txn.advance())
            else:
                # No hop to traverse: complete locally so the wait below
                # returns instead of deadlocking.
                sub_done.succeed()
        # DMA channel released after issue
        # Wait for HBM transfer completion
        yield sub_done
        pe_txn.done.succeed()
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
        # Response transactions bypass DMA channel (no outbound resource needed)
        if getattr(txn, "is_response", False):
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                txn.done.succeed()
            return
        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                drain = getattr(txn, "drain_ns", 0.0)
                if drain > 0:
                    yield env.timeout(drain)
                txn.done.succeed()
    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Return the WRITE channel for ``MemoryWriteMsg`` requests, else READ."""
        from kernbench.runtime_api.kernel import MemoryWriteMsg

        assert self._dma_read is not None and self._dma_write is not None
        is_write = isinstance(txn.request, MemoryWriteMsg)
        return self._dma_write if is_write else self._dma_read
@@ -0,0 +1,90 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 # dtype name → element width in bits (for TFLOPS scaling).
 # Names missing from this table are treated as 16-bit by the lookup in
 # ``_compute_ns``, so every dtype string that can reach the GEMM engine
 # should be listed. The unsigned names mirror the signed integer entries so
 # that e.g. "u8" and "u32" are not silently modelled at f16 throughput.
 _DTYPE_BITS: dict[str, int] = {
    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
    "f32": 32, "fp32": 32, "float32": 32,
    "i8": 8, "int8": 8, "u8": 8, "uint8": 8,
    "i16": 16, "int16": 16, "u16": 16, "uint16": 16,
    "i32": 32, "int32": 32, "u32": 32, "uint32": 32,
 }
 class PeGemmComponent(PeEngineBase):
    """PE_GEMM: matrix-multiply engine of a PE (ADR-0014 D4).

    When the node declares a ``shared_resource`` attribute, the engine takes
    that shared resource (PE_ACCEL, capacity=1, mutually exclusive with
    PE_MATH inside the same PE) before executing a command or forwarding a
    transaction.

    Latency model for a ``GemmCmd``:
      FLOPs            = 2 * M * K * N
      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
      compute_ns       = FLOPs / (effective_tflops * 1e3)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Filled in by init_resources(); stays None without a shared slot.
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accelerator slot named by the node attributes."""
        slot_name = self.node.attrs.get("shared_resource")
        if not slot_name or not self.ctx:
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{slot_name}"
        )

    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
        """Return the modelled GEMM latency in nanoseconds.

        Falls back to the node's ``overhead_ns`` when no positive peak
        throughput is configured. Unknown dtype names count as 16-bit.
        """
        peak = self._peak_tflops_f16
        if peak <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        bits = _DTYPE_BITS.get(dtype, 16)
        # 1 TFLOPS == 1e3 FLOPs per nanosecond.
        flops_per_ns = peak * (16.0 / bits) * 1e3
        return (2.0 * m * k * n) / flops_per_ns

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Charge the fixed ``overhead_ns`` of this node."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute one PE-internal command, then trigger ``pe_txn.done``.

        The shared slot, when present, is held for the whole execution.
        """
        from kernbench.common.pe_commands import GemmCmd

        cmd = pe_txn.command
        is_gemm = isinstance(cmd, GemmCmd)
        if self._accel:
            with self._accel.request() as grant:
                yield grant
                yield from self._execute(env, cmd, is_gemm)
        else:
            yield from self._execute(env, cmd, is_gemm)
        pe_txn.done.succeed()

    def _execute(self, env: simpy.Environment, cmd: Any, is_gemm: bool) -> Generator:
        """Spend the time for ``cmd``: GEMM latency, or plain overhead otherwise."""
        if is_gemm:
            yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
        else:
            yield from self.run(env, 0)

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward a transaction, holding the shared slot while doing so."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as grant:
            yield grant
            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,54 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeMathComponent(PeEngineBase):
    """PE_MATH: element-wise compute engine of a PE (ADR-0014 D4).

    When the node declares a ``shared_resource`` attribute, the engine takes
    that shared resource (PE_ACCEL, capacity=1, mutually exclusive with
    PE_GEMM inside the same PE) around every command and forwarded
    transaction.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Filled in by init_resources(); stays None without a shared slot.
        self._accel: simpy.Resource | None = None

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accelerator slot named by the node attributes."""
        slot_name = self.node.attrs.get("shared_resource")
        if not slot_name or not self.ctx:
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{slot_name}"
        )

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Charge the fixed ``overhead_ns`` of this node."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Run one command (overhead only) and trigger ``pe_txn.done``."""
        if not self._accel:
            yield from self.run(env, 0)
        else:
            with self._accel.request() as grant:
                yield grant
                yield from self.run(env, 0)
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward a transaction, holding the shared slot while doing so."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as grant:
            yield grant
            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,66 @@
 """PE_MMU component: address translation unit.
 Component role: receives MmuMapMsg/MmuUnmapMsg via inbox (independent of PE_CPU).
 Utility role: PE_DMA/PE_GEMM call mmu.translate() directly (no SimPy overhead).
 """
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase, ComponentRegistry
 from kernbench.policy.address.pe_mmu import PeMMU
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeMmuComponent(ComponentBase):
    """PE_MMU: per-PE virtual-to-physical address translation.

    Acts as a component by consuming ``MmuMapMsg`` / ``MmuUnmapMsg``
    transactions from its inbox and applying them to an internal page
    table. The page table itself is exposed through the ``mmu`` property so
    other engines can translate addresses synchronously.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        attrs = node.attrs
        # Page size defaults to 2 MiB when the node does not configure one.
        self._mmu = PeMMU(
            page_size=int(attrs.get("page_size", 2 * 1024 * 1024)),
            overhead_ns=float(attrs.get("tlb_overhead_ns", 0.0)),
        )

    @property
    def mmu(self) -> PeMMU:
        """The wrapped ``PeMMU`` object, for direct ``translate()`` calls."""
        return self._mmu

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay processing step."""
        yield env.timeout(0)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Serve the inbox forever: apply map/unmap requests, forward the rest."""
        from kernbench.runtime_api.kernel import MmuMapMsg, MmuUnmapMsg

        while True:
            txn: Any = yield self._inbox.get()
            # Messages without a ``request`` attribute fall through to forwarding.
            request = getattr(txn, "request", None)
            if isinstance(request, MmuMapMsg):
                for entry in request.entries:
                    self._mmu.map(
                        va=entry["va"], pa=entry["pa"], size=entry["size"],
                    )
                txn.done.succeed()
            elif isinstance(request, MmuUnmapMsg):
                for entry in request.entries:
                    self._mmu.unmap(va=entry["va"], size=entry["size"])
                txn.done.succeed()
            else:
                # Not an MMU request: behave like an ordinary hop.
                yield from self._forward_txn(env, txn)
@@ -0,0 +1,245 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeSchedulerComponent(ComponentBase):
    """PE_SCHEDULER: the single dispatch point inside a PE (ADR-0014 D1).

    Every ``PeInternalTxn`` taken from the inbox is charged the scheduler
    ``overhead_ns`` and then routed by command type:
      - DmaReadCmd / DmaWriteCmd → PE_DMA
      - GemmCmd                  → PE_GEMM
      - MathCmd                  → PE_MATH
      - CompositeCmd             → expanded here into a tiled pipeline
                                   (Stage 3: ADR-0014 D3.2)

    Composite GEMM pipeline (32x64x32 tiles):
      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
      with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)

    Messages that are not ``PeInternalTxn`` are passed to the inherited
    ``_forward_txn()``.
    """

    # Scheduler tile dimensions (ADR-0014 D3.2)
    TILE_M = 32
    TILE_K = 64
    TILE_N = 32

    # Command type → engine node suffix. Filled lazily (see
    # _ensure_dispatch_table) so the command classes are imported on first use.
    # New engines: add a single entry there (e.g. ConvCmd: "pe_conv").
    _CMD_DISPATCH: dict[type, str] = {}

    @classmethod
    def _ensure_dispatch_table(cls) -> None:
        """Populate ``_CMD_DISPATCH`` once; later calls are no-ops."""
        if not cls._CMD_DISPATCH:
            from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd

            cls._CMD_DISPATCH = {
                DmaReadCmd: "pe_dma",
                DmaWriteCmd: "pe_dma",
                GemmCmd: "pe_gemm",
                MathCmd: "pe_math",
            }

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Node id minus its last dotted segment, used to name sibling engines.
        self._pe_prefix = node.id.rsplit(".", maxsplit=1)[0]
        self._ensure_dispatch_table()

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Charge the scheduler's fixed ``overhead_ns``."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def _worker(self, env: simpy.Environment) -> Generator:
        """Serve the inbox forever.

        PE-internal transactions are dispatched in their own process so the
        worker can keep draining the inbox; everything else is forwarded
        inline.
        """
        from kernbench.common.pe_commands import PeInternalTxn

        while True:
            msg: Any = yield self._inbox.get()
            if not isinstance(msg, PeInternalTxn):
                yield from self._forward_txn(env, msg)
                continue
            env.process(self._dispatch(env, msg))

    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Apply scheduler overhead, then route ``pe_txn`` by command type."""
        from kernbench.common.pe_commands import CompositeCmd

        yield from self.run(env, 0)

        command = pe_txn.command
        target = self._CMD_DISPATCH.get(type(command))
        if target is not None:
            # Plain command: hand the transaction to the owning engine.
            yield self.out_ports[f"{self._pe_prefix}.{target}"].put(pe_txn)
        elif isinstance(command, CompositeCmd):
            # Composite: expanded into a pipeline rather than forwarded.
            yield from self._dispatch_composite(env, pe_txn)
        else:
            # Unknown command: complete immediately.
            pe_txn.done.succeed()

    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Pick the composite pipeline (ADR-0014 D3.2).

        ``op == "gemm"`` with a ``b`` operand uses the tiled GEMM pipeline;
        every other composite uses the sequential math pipeline.
        """
        from kernbench.common.pe_commands import CompositeCmd

        composite = pe_txn.command
        assert isinstance(composite, CompositeCmd)
        use_gemm = composite.op == "gemm" and composite.b is not None
        pipeline = self._pipeline_gemm if use_gemm else self._pipeline_math
        yield from pipeline(env, pe_txn, composite)

    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Tiled GEMM: read each b tile, compute on it, write the out tile.

        Tensor ``a`` is in TCM (loaded via tl.load); tensor ``b`` is in HBM
        (via tl.ref) and is read one ``TILE_K x TILE_N`` tile at a time. On
        completion ``pe_txn.result_data`` receives ``dma_ns`` and
        ``compute_ns``.

        NOTE(review): as written, each iteration waits for its own read and
        its own compute before issuing the write, so only the DMA_WRITE of
        tile t is left in flight while tile t+1 is read and computed.
        """
        from kernbench.common.pe_commands import (
            DmaReadCmd,
            DmaWriteCmd,
            GemmCmd,
            PeInternalTxn as PeTxn,
            TensorHandle,
        )

        prefix = self._pe_prefix
        dma_port = f"{prefix}.pe_dma"
        gemm_port = f"{prefix}.pe_gemm"

        lhs = cmd.a   # already in TCM
        rhs = cmd.b   # HBM reference (via tl.ref)
        rows, k_lhs = lhs.shape[-2], lhs.shape[-1]
        k_rhs, cols = rhs.shape[-2], rhs.shape[-1]
        dtype = lhs.dtype
        # Element size is derived from b's byte count; 2 bytes if b is empty.
        rhs_elems = k_rhs * cols
        elem_bytes = rhs.nbytes // rhs_elems if rhs_elems > 0 else 2

        # Tile grid over (K, N); at least one tile per axis.
        k_tile_count = max(1, (k_lhs + self.TILE_K - 1) // self.TILE_K)
        n_tile_count = max(1, (cols + self.TILE_N - 1) // self.TILE_N)

        prev_compute_done = None
        prev_write_done = None
        dma_ns = 0.0
        compute_ns = 0.0

        for tile_idx in range(k_tile_count * n_tile_count):
            tk, tn = divmod(tile_idx, n_tile_count)
            k_off = tk * self.TILE_K
            n_off = tn * self.TILE_N
            # Edge tiles may be smaller than the nominal tile size.
            cur_k = min(self.TILE_K, k_lhs - k_off)
            cur_n = min(self.TILE_N, cols - n_off)
            tile_bytes = cur_k * cur_n * elem_bytes

            # --- Stage 1: DMA_READ of the b tile from HBM ---
            read_done = env.event()
            tile_addr = rhs.addr + (k_off * cols + n_off) * elem_bytes
            tile_handle = TensorHandle(
                id=f"b_tile_{tile_idx}", addr=tile_addr,
                shape=(cur_k, cur_n), dtype=dtype, nbytes=tile_bytes,
            )
            read_txn = PeTxn(
                command=DmaReadCmd(handle=tile_handle, src_addr=tile_addr, nbytes=tile_bytes),
                done=read_done, pe_prefix=prefix,
            )
            issued_at = env.now
            yield self.out_ports[dma_port].put(read_txn)
            # The previous tile's compute must be finished first.
            if prev_compute_done is not None:
                yield prev_compute_done
            yield read_done
            dma_ns += env.now - issued_at

            # --- Stage 2: COMPUTE (GEMM) on this tile ---
            compute_done = env.event()
            out_bytes = rows * cur_n * elem_bytes
            out_handle = TensorHandle(
                id=f"out_tile_{tile_idx}", addr=0,
                shape=(rows, cur_n), dtype=dtype,
                nbytes=out_bytes,
            )
            compute_txn = PeTxn(
                command=GemmCmd(a=lhs, b=tile_handle, out=out_handle,
                                m=rows, k=cur_k, n=cur_n),
                done=compute_done, pe_prefix=prefix,
            )
            issued_at = env.now
            yield self.out_ports[gemm_port].put(compute_txn)
            # Writes are serialized: the previous write must have landed.
            if prev_write_done is not None:
                yield prev_write_done
            yield compute_done
            compute_ns += env.now - issued_at
            prev_compute_done = compute_done

            # --- Stage 3: DMA_WRITE of the out tile to HBM (not awaited here) ---
            write_done = env.event()
            write_txn = PeTxn(
                command=DmaWriteCmd(
                    handle=out_handle,
                    dst_addr=cmd.out_addr + n_off * elem_bytes,
                    nbytes=out_bytes,
                ),
                done=write_done, pe_prefix=prefix,
            )
            yield self.out_ports[dma_port].put(write_txn)
            prev_write_done = write_done

        # Drain the last outstanding write and count its remaining time as DMA.
        if prev_write_done is not None:
            issued_at = env.now
            yield prev_write_done
            dma_ns += env.now - issued_at

        pe_txn.result_data["dma_ns"] = dma_ns
        pe_txn.result_data["compute_ns"] = compute_ns
        pe_txn.done.succeed()

    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Non-GEMM composite: one MATH command, then one DMA_WRITE (no tiling)."""
        from kernbench.common.pe_commands import (
            DmaWriteCmd,
            MathCmd,
            PeInternalTxn as PeTxn,
        )

        prefix = self._pe_prefix

        # Step 1: compute with tensor ``a`` as both input and output.
        math_done = env.event()
        math_txn = PeTxn(
            command=MathCmd(
                op=cmd.math_op or "identity",
                inputs=(cmd.a,), out=cmd.a,
            ),
            done=math_done, pe_prefix=prefix,
        )
        yield self.out_ports[f"{prefix}.pe_math"].put(math_txn)
        yield math_done

        # Step 2: write the result to HBM.
        store_done = env.event()
        store_txn = PeTxn(
            command=DmaWriteCmd(handle=cmd.a, dst_addr=cmd.out_addr, nbytes=cmd.out_nbytes),
            done=store_done, pe_prefix=prefix,
        )
        yield self.out_ports[f"{prefix}.pe_dma"].put(store_txn)
        yield store_done

        pe_txn.done.succeed()
@@ -0,0 +1,25 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeTcmComponent(ComponentBase):
    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.

    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
    Phase 0: applies overhead_ns and drain_ns at terminal; this class only
    supplies the ``overhead_ns`` step, the rest is inherited.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env, nbytes: int) -> Generator:
        """Charge the node's fixed ``overhead_ns``; ``nbytes`` is not used."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))
@@ -0,0 +1,59 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class SramComponent(ComponentBase):
    """Cube SRAM: terminal component modelling SRAM access latency.

    Each incoming transaction is charged ``overhead_ns`` (from
    ``node.attrs``) plus its own ``drain_ns``; a ``ResponseMsg`` is then
    sent back along the reversed path.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Charge the node's fixed ``overhead_ns``."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def _worker(self, env: simpy.Environment) -> Generator:
        """Terminal loop: process, apply drain, respond, one transaction at a time."""
        while True:
            txn: Any = yield self._inbox.get()
            yield from self.run(env, txn.nbytes)
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
            yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Send a ``ResponseMsg`` back along the reversed path.

        If there is no hop to return to, or no component context, the
        request's ``done`` event is triggered directly instead.
        """
        back_path = list(reversed(txn.path))
        if len(back_path) < 2 or not self.ctx:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # The second dotted segment of the node id is "cube<N>".
        cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
        response = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        response_txn = Transaction(
            request=response, path=back_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[back_path[1]].put(response_txn.advance())
@@ -1,3 +1,14 @@
 """Data-parallel placement policy (ADR-0026: intra-device only).
 ``DPPolicy`` describes how a tensor is sharded *within a single SIP* across
 that SIP's cubes and PEs. Crossing the SIP boundary is not a DPPolicy
 concern: ADR-0024's ``torch.ahbm.set_device(rank)`` picks the SIP, and
 Megatron-style TP (ADR-0027) expresses multi-SIP tensors when needed.
 ``ShardSpec`` is expressed in structural ``(sip, cube, pe)`` coordinates.
 The former flat ``pe_index`` field/property is fully removed — callers
 needing a flat integer key compute it explicitly at the call site.
 """
 from __future__ import annotations
 from dataclasses import dataclass
@@ -7,25 +18,58 @@ from typing import Literal
@dataclass(frozen=True)
 class DPPolicy:
-    """Three-level data-parallel policy: sip-level + cube-level + pe-level.
+    """Intra-device (cube × PE) data-parallel policy.
-    Policies:
+    SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
    (ADR-0024). For tensors that must cross SIP boundaries, use
    Megatron-style parallel layers (ADR-0027). DPPolicy itself never
    crosses a SIP boundary.
    Policies (per axis):
      - "replicate": full copy at each unit
      - "column_wise": split K (column) axis across units
      - "row_wise": split M (row) axis across units
-    Optional overrides (default None = use topology dimensions):
+    Optional overrides (``None`` = use topology dimensions):
-      - num_pes: override PEs per cube (e.g., 1 for single-PE test)
+      - num_pes: override PEs per cube
-      - num_cubes: override cubes per SIP (e.g., 1 for single-cube test)
+      - num_cubes: override cubes per SIP
      - num_sips: override SIP count
    """
    sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
    num_pes: int | None = None
    num_cubes: int | None = None
-    num_sips: int | None = None
+
@dataclass(frozen=True)
 class ShardSpec:
    """Structural shard placement — ``(sip, cube, pe)`` coord (ADR-0026).

    Immutable record describing where one shard lives and which byte range
    it covers.

    Global-flat ``pe_index`` was removed: callers must use structural
    coords directly. If a flat integer key is needed in a local context
    (e.g. internal dict lookup), compute it explicitly at the call site
    and do not expose it in any public API.
    """
    # SIP index the shard is placed on.
    sip: int
    # Cube index; presumably local to the SIP — confirm against resolve_dp_policy.
    cube: int
    # PE identifier; presumably local to the cube — confirm against resolve_dp_policy.
    pe: int
    # Byte offset of the shard (reference point not shown here — TODO confirm).
    offset_bytes: int
    # Size of the shard payload in bytes.
    nbytes: int
@dataclass(frozen=True)
 class _LocalPeShard:
    """Internal — PE resolver's return type (ADR-0026 D3).

    Holds a cube-local PE identifier (``local_pe``) plus the shard's
    byte payload. Lifted into ``ShardSpec`` with full ``(sip, cube, pe)``
    coordinates inside ``resolve_dp_policy``.
    """
    # PE identifier local to the cube being resolved.
    local_pe: int
    # Byte offset of this PE's shard (presumably relative to the cube's slice — TODO confirm).
    offset_bytes: int
    # Size of this PE's shard in bytes.
    nbytes: int
 def _split_shape(
@@ -52,14 +96,13 @@ def resolve_dp_policy(
    itemsize: int,
    num_pe: int,
    num_cubes: int = 1,
-    num_sips: int = 1,
+    target_sip: int,
 ) -> list[ShardSpec]:
-    """Resolve a DPPolicy into a list[ShardSpec] with three-level resolution.
+    """Resolve a DPPolicy into a list[ShardSpec] on a single SIP.
-    SIP-level → cube-level → pe-level.
+    Two-level resolution (cube × PE) within ``target_sip``. Each returned
-    num_cubes is cubes per SIP (not total).
+    ``ShardSpec`` carries ``sip=target_sip`` and cube/pe local to the SIP.
-    ShardSpec.pe_index uses flat indexing:
+    No SIP-level split — DPPolicy is intra-device only (ADR-0026).
      sip_id * num_cubes * num_pe + cube_id * num_pe + pe_id
    """
    _PE_RESOLVERS = {
        "replicate": replicate,
@@ -70,84 +113,61 @@ def resolve_dp_policy(
    if resolver is None:
        raise ValueError(f"Unknown pe-level policy: {policy.pe}")
    cubes_per_sip = num_cubes
    all_shards: list[ShardSpec] = []
-    # Level 1: SIP
+    # Level 1: cube within SIP
-    sip_splits = _split_shape(policy.sip, shape, num_sips, itemsize)
+    cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
-    for sip_id, (sip_shape, sip_offset) in enumerate(sip_splits):
+    for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
-        # Level 2: Cube within SIP
+        # Level 2: PE within cube — resolver returns _LocalPeShard
-        cube_splits = _split_shape(policy.cube, sip_shape, cubes_per_sip, itemsize)
+        local_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
-        for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
+        for ls in local_shards:
-            # Level 3: PE within cube
+            all_shards.append(ShardSpec(
-            pe_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
+                sip=target_sip,
-
+                cube=cube_id,
-            for ps in pe_shards:
+                pe=ls.local_pe,
-                flat_idx = (
+                offset_bytes=cube_offset + ls.offset_bytes,
-                    sip_id * cubes_per_sip * num_pe
+                nbytes=ls.nbytes,
-                    + cube_id * num_pe
+            ))
                    + ps.pe_index
                )
                all_shards.append(ShardSpec(
                    pe_index=flat_idx,
                    offset_bytes=sip_offset + cube_offset + ps.offset_bytes,
                    nbytes=ps.nbytes,
                ))
    return all_shards
@dataclass(frozen=True)
 class ShardSpec:
    pe_index: int
    offset_bytes: int
    nbytes: int
 def column_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
-) -> list[ShardSpec]:
+) -> list[_LocalPeShard]:
    """Split K axis into num_pe equal parts. Each PE gets (M, K/P)."""
    M, K = shape
    chunk_k = K // num_pe
    chunk_bytes = M * chunk_k * itemsize
-    shards = []
+    return [
-    for i in range(num_pe):
+        _LocalPeShard(local_pe=i, offset_bytes=i * chunk_bytes, nbytes=chunk_bytes)
-        shards.append(ShardSpec(
+        for i in range(num_pe)
-            pe_index=i,
+    ]
            offset_bytes=i * chunk_bytes,
            nbytes=chunk_bytes,
        ))
    return shards
 def row_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
-) -> list[ShardSpec]:
+) -> list[_LocalPeShard]:
    """Split M axis into num_pe equal parts. Each PE gets (M/P, K)."""
    M, K = shape
    chunk_m = M // num_pe
    chunk_bytes = chunk_m * K * itemsize
-    shards = []
+    return [
-    for i in range(num_pe):
+        _LocalPeShard(local_pe=i, offset_bytes=i * chunk_bytes, nbytes=chunk_bytes)
-        shards.append(ShardSpec(
+        for i in range(num_pe)
-            pe_index=i,
+    ]
            offset_bytes=i * chunk_bytes,
            nbytes=chunk_bytes,
        ))
    return shards
 def replicate(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
-) -> list[ShardSpec]:
+) -> list[_LocalPeShard]:
    """Full copy per PE. Each PE gets (M, K)."""
    M, K = shape
    full_bytes = M * K * itemsize
    return [
-        ShardSpec(pe_index=i, offset_bytes=0, nbytes=full_bytes)
+        _LocalPeShard(local_pe=i, offset_bytes=0, nbytes=full_bytes)
        for i in range(num_pe)
    ]
@@ -155,20 +175,20 @@ def replicate(
 def tiled_column_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
-) -> list[ShardSpec]:
+) -> list[_LocalPeShard]:
    """2D tiling, column-major order (K axis first), round-robin across PEs."""
    M, K = shape
    tiles_m = ceil(M / tile_m)
    tiles_k = ceil(K / tile_k)
    tile_bytes = tile_m * tile_k * itemsize
    row_bytes = K * itemsize
-    shards = []
+    shards: list[_LocalPeShard] = []
    idx = 0
    for mi in range(tiles_m):
        for ki in range(tiles_k):
            offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
-            shards.append(ShardSpec(
+            shards.append(_LocalPeShard(
-                pe_index=idx % num_pe,
+                local_pe=idx % num_pe,
                offset_bytes=offset,
                nbytes=tile_bytes,
            ))
@@ -179,20 +199,20 @@ def tiled_column_major(
 def tiled_row_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
-) -> list[ShardSpec]:
+) -> list[_LocalPeShard]:
    """2D tiling, row-major order (M axis first), round-robin across PEs."""
    M, K = shape
    tiles_m = ceil(M / tile_m)
    tiles_k = ceil(K / tile_k)
    tile_bytes = tile_m * tile_k * itemsize
    row_bytes = K * itemsize
-    shards = []
+    shards: list[_LocalPeShard] = []
    idx = 0
    for ki in range(tiles_k):
        for mi in range(tiles_m):
            offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
-            shards.append(ShardSpec(
+            shards.append(_LocalPeShard(
-                pe_index=idx % num_pe,
+                local_pe=idx % num_pe,
                offset_bytes=offset,
                nbytes=tile_bytes,
            ))
@@ -29,11 +29,10 @@ def run_bench(
    correlation_id: str = "bench0",
    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
 ) -> BenchResult:
-    """
+    """Minimal bench runner.
    Minimal bench runner.
    - topology: compiled topology object (opaque to runtime here)
-    - bench_fn: callable that receives RuntimeContext and submits requests
+    - bench_fn: callable ``run(torch)`` receiving a RuntimeContext
    - device: DeviceSelector ("all" or "sip:<N>")
    - engine_factory: builds sim_engine for given topology & device
    - completion_policy: how to determine overall completion/result
@@ -48,7 +47,6 @@ def run_bench(
    )
    bench_fn(ctx)
    ctx.wait_all()
    collected_traces = ctx._traces or None
@@ -62,6 +60,7 @@ def run_bench(
            correlation_id=correlation_id,
            trace=None,
            traces=collected_traces,
            engine=engine,
        )
    if completion_policy == CompletionPolicy.LAST_SUBMITTED:
@@ -69,7 +68,7 @@ def run_bench(
        completion, trace = engine.get_completion(last)
        return BenchResult(
            completion=completion, correlation_id=correlation_id,
-            trace=trace, traces=collected_traces,
+            trace=trace, traces=collected_traces, engine=engine,
        )
    if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
@@ -80,11 +79,11 @@ def run_bench(
            if not c.ok:
                return BenchResult(
                    completion=c, correlation_id=correlation_id,
-                    trace=last_trace, traces=collected_traces,
+                    trace=last_trace, traces=collected_traces, engine=engine,
                )
        return BenchResult(
            completion=Completion(ok=True), correlation_id=correlation_id,
-            trace=last_trace, traces=collected_traces,
+            trace=last_trace, traces=collected_traces, engine=engine,
        )
    # LAST_COMPLETED placeholder (needs engine support for timing). Fall back.
@@ -92,5 +91,5 @@ def run_bench(
    completion, trace = engine.get_completion(last)
    return BenchResult(
        completion=completion, correlation_id=correlation_id,
-        trace=trace, traces=collected_traces,
+        trace=trace, traces=collected_traces, engine=engine,
    )
@@ -9,6 +9,92 @@ from kernbench.common.types import Completion, RequestHandle, SimEngine
 from .types import DeviceSelector
 def _world_size_from_spec(spec: dict | None) -> int:
    """Derive world_size from topology spec: sips × cubes × pes_per_cube."""
    spec = spec or {}
    sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
    cm = spec.get("sip", {}).get("cube_mesh", {})
    cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1))
    pl = spec.get("cube", {}).get("pe_layout", {})
    corners = pl.get("corners", [])
    pe_per_corner = int(pl.get("pe_per_corner", 1))
    pes_per_cube = pe_per_corner * max(len(corners), 1)
    return sips * cubes_per_sip * pes_per_cube
 def _numpy_to_dtype_str(np_dtype) -> str:
    """Map numpy dtype → kernbench dtype string used by Tensor."""
    import numpy as np
    kind_map = {
        np.float16: "f16",
        np.float32: "f32",
        np.int8: "i8",
        np.int16: "i16",
        np.int32: "i32",
        np.uint8: "u8",
        np.uint16: "u16",
        np.uint32: "u32",
    }
    for np_type, s in kind_map.items():
        if np.dtype(np_dtype) == np.dtype(np_type):
            return s
    raise ValueError(f"unsupported numpy dtype: {np_dtype!r}")
 # ADR-0027 D3: weak registry of the currently-active RuntimeContext so
 # module-level helpers (e.g. ``kernbench.tp.parallel_state``) can resolve
 # the ctx without threading it through every call.
 import weakref as _weakref
 _ACTIVE_CTX_REF: _weakref.ref | None = None
 def _get_active_context():
    """Resolve the module-level active-context weak reference.

    Returns ``None`` when no reference is registered; otherwise returns
    whatever calling the stored reference yields.
    """
    ref = _ACTIVE_CTX_REF
    return ref() if ref is not None else None
 class _AhbmNamespace:
    """torch.ahbm — per-greenlet SIP device binding (ADR-0024 D10).
    Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. KernBench's
    backend is 'ahbm' (not CUDA), so this namespace avoids pretending to be
    a CUDA runtime.
    """
    def __init__(self) -> None:
        self._device_by_greenlet: dict = {}
    def set_device(self, device: int) -> None:
        from greenlet import getcurrent
        self._device_by_greenlet[getcurrent()] = int(device)
    def current_device(self) -> int | None:
        from greenlet import getcurrent
        return self._device_by_greenlet.get(getcurrent())
 class _AcceleratorNamespace:
    """torch.accelerator — device-agnostic alias (PyTorch 2.x style).
    Wraps _AhbmNamespace. Bench code can pick either:
        torch.ahbm.set_device(rank)                   # explicit backend
        torch.accelerator.set_device_index(rank)      # portable
    """
    def __init__(self, ahbm: "_AhbmNamespace") -> None:
        self._ahbm = ahbm
    def set_device_index(self, device: int) -> None:
        self._ahbm.set_device(device)
    def current_device_index(self) -> int | None:
        return self._ahbm.current_device()
@dataclass
 class RuntimeContext:
    engine: SimEngine
@@ -18,17 +104,94 @@ class RuntimeContext:
    _handles: list[RequestHandle] = field(default_factory=list, init=False)
    _completed: set[RequestHandle] = field(default_factory=set, init=False)
-    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
+    # ADR-0027 D0.1: worker-deferred wait queue. When a worker greenlet
    # calls ctx.wait(h), the handle is appended here and control yields to
    # main. Main's scheduler drain consumes this list.
    _pending_worker_waits: list[RequestHandle] = field(default_factory=list, init=False)
    _allocators: dict[tuple[int, int, int], Any] = field(default_factory=dict, init=False)
    _va_allocator: Any = field(default=None, init=False)
    _tensor_counter: int = field(default=0, init=False)
    _traces: list[dict] = field(default_factory=list, init=False)
    _tensors: list[Any] = field(default_factory=list, init=False)
    distributed: Any = field(default=None, init=False)  # DistributedContext for CCL benches
    _ipcq_plan: dict = field(default_factory=dict, init=False)  # ADR-0023 install plan
    def __post_init__(self) -> None:
        """Attach the torch-style helper namespaces to this context.

        Sets ``distributed``, ``ahbm``, ``accelerator`` and
        ``multiprocessing`` so bench code can reach them as attributes.
        """
        # torch.distributed: created eagerly so bench code can write
        # ``dist = torch.distributed`` + ``dist.init_process_group(...)``
        # without a separate launcher installing it first.
        from kernbench.runtime_api.distributed import DistributedContext
        dist_ctx = DistributedContext()
        # Back-reference that lets AhbmCCLBackend reach ctx.launch etc.
        dist_ctx._ctx_ref = self
        self.distributed = dist_ctx

        # ADR-0024 D10: per-greenlet device binding, exposed both as
        # torch.ahbm (KernBench-native) and torch.accelerator (PyTorch 2.x
        # portable); the latter wraps the former.
        ahbm_ns = _AhbmNamespace()
        self.ahbm = ahbm_ns
        self.accelerator = _AcceleratorNamespace(ahbm_ns)

        # ADR-0027 D1.3: torch.multiprocessing.spawn namespace.
        from kernbench.runtime_api.multiprocessing import _MultiprocessingNamespace
        self.multiprocessing = _MultiprocessingNamespace(self)
    def install_ipcq(
        self,
        algorithm: str | None = None,
        ccl_yaml: str | None = None,
        world_size_override: int | None = None,
        rank_to_pe: list[tuple[int, int, int]] | None = None,
    ) -> dict:
        """Install IPCQ neighbor tables on all participating PEs (ADR-0023 D10).

        Loads ``ccl.yaml`` (or the given path), resolves the requested
        algorithm (``defaults.algorithm`` when None), and hands the merged
        config to the installer, which pushes per-PE IpcqInitMsg into every
        PE_IPCQ component via the engine.

        Args:
            algorithm: name of the algorithm in ccl.yaml (or use defaults).
            ccl_yaml: optional path to ccl.yaml.
            world_size_override: if set, replaces the algorithm's world_size.
            rank_to_pe: optional rank → (sip, cube, pe) list, forwarded
                unchanged to the installer.

        Returns the install plan dict (rank → (sip,cube,pe), neighbor table).
        The plan and the merged config are also stored on the context as
        ``_ipcq_plan`` and ``_ipcq_config``.
        """
        import importlib
        from kernbench.ccl.install import (
            install_ipcq as _install,
            load_ccl_config,
            resolve_algorithm_config,
        )

        algo_cfg = resolve_algorithm_config(load_ccl_config(ccl_yaml), algorithm)

        if world_size_override is None:
            if "world_size" not in algo_cfg:
                # Neither the algorithm entry nor ``defaults`` carries a
                # world_size: derive it from topology.yaml (matches pytorch
                # DDP, where env vars determine ranks, not the ccl config).
                algo_cfg["world_size"] = _world_size_from_spec(self.spec)
        else:
            algo_cfg["world_size"] = world_size_override

        # The algorithm module is optional: a missing module is passed to
        # the installer as None rather than treated as an error.
        try:
            algo_module = importlib.import_module(algo_cfg["module"])
        except ModuleNotFoundError:
            algo_module = None

        plan = _install(
            self.engine,
            self.spec,
            algo_cfg,
            algo_module=algo_module,
            rank_to_pe=rank_to_pe,
        )
        self._ipcq_plan = plan
        self._ipcq_config = algo_cfg
        return plan
    def __enter__(self):
        """Register this context as the module-level active context.

        Stores a weak reference to ``self`` in ``_ACTIVE_CTX_REF`` and
        returns ``self`` so it can be bound by ``with ... as ctx``.
        """
        global _ACTIVE_CTX_REF
        # Weak reference: the registry itself does not keep the context alive.
        _ACTIVE_CTX_REF = _weakref.ref(self)
        return self
    def __exit__(self, *exc):
        """Run ``cleanup()`` and deregister this context if it is the active one.

        Always returns False, so an exception raised inside the ``with``
        body is not suppressed.
        """
        global _ACTIVE_CTX_REF
        self.cleanup()
        registered = _ACTIVE_CTX_REF
        # Only clear the registry when it still points at this context; a
        # different registered context is left untouched.
        if registered is not None and registered() is self:
            _ACTIVE_CTX_REF = None
        return False
    def submit(self, request: Any) -> RequestHandle:
@@ -43,10 +206,24 @@ class RuntimeContext:
        return handle in self._completed
    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
        # ADR-0027 D0.2: fast-path for already-completed handles (avoid
        # redundant worker→main→worker round-trip).
        if handle in self._completed:
            completion, trace = self.engine.get_completion(handle)
            return completion
        # ADR-0027 D0.2: if called from a worker greenlet (parent is main,
        # not dead), defer the wait to the main scheduler — enqueue and
        # yield. Main drains env.run, then switches back. On resume the
        # handle must be in _completed (D0.3 resume invariant).
        from greenlet import getcurrent
        g = getcurrent()
        if g.parent is not None and not g.parent.dead:
            self._pending_worker_waits.append(handle)
            g.parent.switch()
            # Resume: main drained. Fall through to completion/trace assembly.
        # Main context (or single-driver): drive engine directly.
        wait_fn = getattr(self.engine, "wait", None)
        if wait_fn is not None:
            wait_fn(handle)  # type: ignore[misc]
@@ -135,12 +312,7 @@ class RuntimeContext:
        # Return PA space
        if self._allocators:
            for shard in handle.shards:
-                flat_idx = (
+                alloc = self._allocators.get((shard.sip, shard.cube, shard.pe))
                    shard.sip * self._num_cubes * self._pes_per_cube
                    + shard.cube * self._pes_per_cube
                    + shard.pe
                )
                alloc = self._allocators.get(flat_idx)
                if alloc is not None:
                    from kernbench.policy.address.phyaddr import PhysAddr
                    alloc.free_hbm(PhysAddr.decode(shard.pa), shard.nbytes)
@@ -204,17 +376,15 @@ class RuntimeContext:
            tcm_scheduler_reserved_bytes=4 * (1 << 20),
            sram_bytes_per_cube=32 * (1 << 20),
        )
-        # Create allocators scoped to target SIP(s) only
+        # Create allocators scoped to target SIP(s) only.
-        # Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id
+        # ADR-0026 D5: dict key is the structural (sip, cube, pe) tuple.
        self._pes_per_cube = pes_per_cube
        self._num_cubes = cubes_per_sip
        self._num_sips = sip_count
        cubes_x_pes = cubes_per_sip * pes_per_cube
        for sip_id in sip_range:
            for cube_id in range(cubes_per_sip):
                for pe_id in range(pes_per_cube):
-                    flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id
+                    self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
                    self._allocators[flat_idx] = PEMemAllocator(
                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
                    )
@@ -258,6 +428,24 @@ class RuntimeContext:
        """Allocate a tensor in HBM without initialization (like torch.empty)."""
        return self._create_tensor(shape, dtype, name, pattern=None, dp=dp)
    def from_numpy(self, arr: Any):
        """Wrap a numpy array in a host-side tensor (mirrors ``torch.from_numpy``).

        The returned tensor is NOT deployed to any PE — its data lives in an
        in-memory host staging buffer (a C-contiguous view or copy of
        ``arr``). Use ``target.copy_(host_tensor)`` to scatter the contents
        into a sharded, deployed tensor.
        """
        import numpy as np
        from kernbench.runtime_api.tensor import Tensor

        staged = np.ascontiguousarray(arr)
        host = Tensor(
            shape=tuple(staged.shape),
            dtype=_numpy_to_dtype_str(staged.dtype),
            name="host",
        )
        host._host_buffer = staged
        # The engine may not expose a memory store; None is stored then.
        host._memory_store = getattr(self.engine, "_memory_store", None)
        return host
    def _create_tensor(
        self,
        shape: tuple[int, ...],
@@ -283,16 +471,23 @@ class RuntimeContext:
        # DPPolicy overrides take precedence over topology dimensions
        eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
        eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
-        eff_num_sips = dp.num_sips if dp.num_sips is not None else self._num_sips
+        # ADR-0026 D4: resolve structural coords directly at resolve time.
        # ``torch.ahbm.set_device(rank)`` (ADR-0024 D10) selects the target
        # SIP; if unset, fall back to SIP 0 for single-driver compatibility.
        current_sip = (
            self.ahbm.current_device() if hasattr(self, "ahbm") else None
        )
        if current_sip is None:
            current_sip = 0
        placement = resolve_dp_policy(
            dp, shape=shape_2d, itemsize=itemsize,
            num_pe=eff_num_pe, num_cubes=eff_num_cubes,
-            num_sips=eff_num_sips,
+            target_sip=int(current_sip),
        )
        # Infer target_pe from placement using local (within-cube) PE IDs.
        # This ensures M_CPU only fans out to PEs that own shards, not all PEs.
-        local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement})
+        local_pe_ids = sorted({s.pe for s in placement})
        if len(local_pe_ids) == 1:
            target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
        elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
@@ -314,6 +509,7 @@ class RuntimeContext:
        t._handle = handle
        import weakref
        t._ctx_ref = weakref.ref(self)
        t._memory_store = getattr(self.engine, "_memory_store", None)
        self._tensors.append(weakref.ref(t))
        # Install VA→PA mappings via fabric MmuMapMsg
@@ -389,6 +585,21 @@ class RuntimeContext:
                    "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
                    "nbytes": shard.nbytes,
                })
            # ADR-0027: also populate MemoryStore at VA keys so kernels
            # reading via VA (the common ``tl.load`` path) see the init
            # data. Phase 1 MemoryWriteMsg writes via PA; kernels read via
            # VA; Phase 2 DataExecutor reads via the addresses captured in
            # op_log (VA for tl.load). Without this, zero-init tensors are
            # invisible to kernels in Phase 2.
            store = getattr(self.engine, "_memory_store", None)
            if store is not None and pattern == "zero" and handle.va_base:
                import numpy as np
                from kernbench.runtime_api.tensor import _numpy_dtype
                np_dtype = _numpy_dtype(dtype)
                for shard in handle.shards:
                    count = shard.nbytes // itemsize
                    addr = handle.va_base + shard.offset_bytes
                    store.write("hbm", addr, np.zeros(count, dtype=np_dtype))
        return t
@@ -397,6 +608,7 @@ class RuntimeContext:
        kernel_name: str,
        kernel_fn: Any,
        *args: Any,
        _defer_wait: bool = False,
        **kwargs: Any,
    ) -> RequestHandle:
        """Register and launch a kernel (like a fused torch op).
@@ -406,6 +618,11 @@ class RuntimeContext:
        Creates per-SIP KernelLaunchMsg with local va_base per tensor
        (like host driver sending per-rank launch commands).
        When ``_defer_wait=True`` (ADR-0024 D7), returns the list of
        ``(handle, sip_id, meta)`` tuples instead of waiting. Caller is
        responsible for waiting — used by collective ops to yield between
        submit and wait so all sibling ranks can submit first.
        """
        from collections import defaultdict
@@ -417,13 +634,12 @@ class RuntimeContext:
            TensorArgShard,
        )
        from kernbench.runtime_api.tensor import Tensor
-        from kernbench.triton_emu.registry import register_kernel
+        from kernbench.triton_emu.registry import _kernels, register_kernel
-        # Register kernel (idempotent)
+        # Register kernel (idempotent overwrite — last call wins).
-        try:
+        # Tests can re-register the same kernel_name with a different
-            register_kernel(kernel_name, kernel_fn)
+        # function; the user's most recent launch must use the latest fn.
-        except ValueError:
+        _kernels[kernel_name] = kernel_fn
            pass
        # Collect tensors and scalars
        tensor_args: list[Tensor] = []
@@ -482,11 +698,8 @@ class RuntimeContext:
            dp = t._dp_metadata.dp_policy if t._dp_metadata else None
            if dp is None:
                return t.shape
-            if dp.sip != "replicate":
+            # ADR-0026: DPPolicy no longer crosses SIP boundaries; cube + PE
-                if dp.sip == "column_wise":
+            # are the only axes that shrink the local shape.
                    K = K // self._num_sips
                elif dp.sip == "row_wise":
                    M = M // self._num_sips
            if dp.cube != "replicate":
                if dp.cube == "column_wise":
                    K = K // self._num_cubes
@@ -505,6 +718,7 @@ class RuntimeContext:
        # Per-SIP kernel launch: each SIP gets TensorArgs with local va_base
        last_handle = None
        _pending_handles: list[tuple[Any, int]] = []
        for sip_id in sorted(sip_set):
            sip_kernel_args: list = []
            sip_cube_set: set[int] = set()
@@ -565,10 +779,29 @@ class RuntimeContext:
                target_cubes=target_cubes,
                target_pe=target_pe,
            ))
            # Defer wait until all SIPs are submitted (multi-SIP CCL needs
            # all participating PEs to be live concurrently — waiting
            # per-SIP would deadlock when ranks span SIP boundaries).
            _pending_handles.append((h, sip_id))
            last_handle = h
        if _defer_wait:
            # ADR-0024 D7: return the pending-list so the caller can yield
            # between submit and drain. Used by collective ops that need
            # all sibling ranks to submit before any rank waits.
            return [
                (h, sip_id, {
                    "phase": "kernel", "name": kernel_name,
                    "sip": sip_id, "target_pe": target_pe,
                })
                for h, sip_id in _pending_handles
            ]
        # Drain pending handles now that every SIP has a launch posted.
        for h, sip_id in _pending_handles:
            self.wait(h, _meta={
                "phase": "kernel", "name": kernel_name,
                "sip": sip_id, "target_pe": target_pe,
            })
            last_handle = h
        return last_handle
@@ -0,0 +1,242 @@
 """PyTorch-compatible distributed communication shim (ADR-0023 D11).
 Provides a ``torch.distributed``-like API whose public surface matches
 real PyTorch so that bench code looks identical to a DDP training script.
 Only the ``ahbm`` backend is implemented. It:
 1. Reads ``ccl.yaml`` to decide which collective algorithm to run.
 2. Derives world_size from the algorithm entry, the defaults section, or
   the topology spec (``system.sips.count`` — one rank per SIP).
 3. At ``init_process_group`` time, eagerly installs the IPCQ neighbor
   table once (one-time comm setup — mirrors NCCL communicator creation).
 4. On each ``all_reduce(tensor)`` call, reads per-shard metadata from the
   tensor handle and dispatches ``torch.launch`` with the registered
   kernel. The kernel performs intra-PE ring/tree/mesh CCL via IPCQ,
   and Phase 2 DataExecutor replays math + copies from op_log so
   MemoryStore is correct when ``all_reduce`` returns.
 Host bench code uses only real-PyTorch names:
    dist.init_process_group, dist.is_initialized, dist.get_world_size,
    dist.get_rank, dist.get_backend, dist.all_reduce, dist.barrier
 """
 from __future__ import annotations
 import importlib
 import math
 from typing import Any
 class AhbmCCLBackend:
    """Ahbm CCL backend: runs kernel-level collectives over IPCQ."""

    def __init__(self, torch_ctx: Any) -> None:
        """Load the CCL config, cache topology parameters, and wire SFRs.

        ``torch_ctx`` is the owning runtime context; its ``spec`` supplies
        the SIP count / SIP topology / cube mesh, and its ``engine`` (when
        present) is handed to ``configure_sfr_intercube_multisip``.
        """
        from kernbench.ccl.install import (
            load_ccl_config,
            resolve_algorithm_config,
        )

        self.ctx = torch_ctx
        self._cfg_all = load_ccl_config()
        self._merged = resolve_algorithm_config(self._cfg_all)
        self._algo_module = importlib.import_module(self._merged["module"])
        self._world_size = self._resolve_world_size()
        # Filled by all_reduce in multi-greenlet mode; drained by main.
        self._pending_collective_handles: list = []
        # Set by DistributedContext.init_process_group after construction.
        self._dist_ctx: Any = None

        spec = self.ctx.spec or {}
        sips_cfg = spec.get("system", {}).get("sips", {})
        self._n_sips = int(sips_cfg.get("count", 1))
        self._sip_topo = str(sips_cfg.get("topology", "ring_1d"))
        mesh_cfg = spec.get("sip", {}).get("cube_mesh", {})
        self._cube_w = int(mesh_cfg.get("w", 4))
        self._cube_h = int(mesh_cfg.get("h", 4))

        # Translate the SIP topology name into the kernel's integer kind;
        # unknown names (or a module without the table) map to 0.
        kind_by_name = getattr(self._algo_module, "TOPO_NAME_TO_KIND", None)
        self._sip_topo_kind = (
            0 if kind_by_name is None else kind_by_name.get(self._sip_topo, 0)
        )

        # ring_1d carries no 2D dims; every other topology is treated as a
        # square grid. NOTE(review): a non-square SIP count rounds to the
        # nearest side without any check — confirm this is intended.
        if self._sip_topo != "ring_1d":
            side = int(round(math.sqrt(self._n_sips)))
            dims = (side, side)
        else:
            dims = (0, 0)
        self._sip_topo_w, self._sip_topo_h = dims

        # IPCQ install: wire all pe0s across all cubes and SIPs.
        engine = getattr(self.ctx, "engine", None)
        if engine is not None:
            from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
            configure_sfr_intercube_multisip(engine, spec, self._merged)

    def _resolve_world_size(self) -> int:
        """Derive world_size (priority: algorithm override > defaults > topology).

        ADR-0024 D1: the topology fallback is the SIP count. Each rank
        represents one SIP (TP dimension); intra-SIP parallelism is expressed
        via DPPolicy inside each worker and is independent of world_size.

        An explicit ``ccl.yaml`` override is still respected — legacy
        "rank = flat PE index" tests use this path.
        """
        for source in (self._merged, self._cfg_all.get("defaults", {})):
            if "world_size" in source:
                return int(source["world_size"])
        spec = self.ctx.spec or {}
        sips_cfg = spec.get("system", {}).get("sips", {})
        return int(sips_cfg.get("count", 1))

    @property
    def world_size(self) -> int:
        """World size resolved at construction time."""
        return self._world_size

    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
        """Dispatch the configured CCL algorithm as a single kernel launch.

        The element count is taken from the first shard of ``tensor``; the
        shard count itself is not validated against world_size here.

        Raises:
            NotImplementedError: ``op`` is not ``"sum"`` (current kernels
                only implement add reduction).
            RuntimeError: the tensor is not deployed or has no shards.
        """
        if op != "sum":
            raise NotImplementedError(f"all_reduce op={op!r} not supported")
        handle = tensor._handle
        if handle is None:
            raise RuntimeError(
                f"Tensor '{tensor.name}' is not deployed (call torch.zeros "
                "with a DPPolicy first)"
            )
        shards = handle.shards
        if not shards:
            raise RuntimeError(
                f"all_reduce tensor '{tensor.name}' has no shards"
            )

        n_elem = shards[0].nbytes // tensor.itemsize
        kernel_fn = self._algo_module.kernel
        kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)

        # The calling greenlet identifies both the SIP rank (via the
        # distributed context's registry) and the scheduler to yield to.
        from greenlet import getcurrent
        me = getcurrent()
        dist_ctx = getattr(self, "_dist_ctx", None)
        sip_rank = (
            0 if dist_ctx is None
            else int(dist_ctx._rank_by_greenlet.get(me, 0))
        )

        pending = self.ctx.launch(
            self._merged["algorithm"], kernel_fn, tensor,
            *kernel_args,
            sip_rank,
            self._sip_topo_kind,
            self._sip_topo_w,
            self._sip_topo_h,
            _defer_wait=True,
        )

        parent = me.parent
        if parent is None or parent.dead:
            # Single-driver (no bench scheduler): drain inline.
            for h, _sip_id, meta in pending:
                self.ctx.wait(h, _meta=meta)
            return
        # Multi-greenlet mode: hand pending to the backend-level queue so
        # the main scheduler drains it, then yield. On resume, all pending
        # handles have been drained by main.
        self._pending_collective_handles.extend(pending)
        parent.switch()

    def barrier(self) -> None:
        """No-op kept so ``dist.barrier()`` is callable (pytorch-compat surface)."""
        # Single-driver model → no cross-process sync needed.
        return None
 class DistributedContext:
    """torch.distributed-compat facade.
    Public surface matches real PyTorch so bench code reads identically
    to a DDP training script. Single-driver semantics: ``get_rank()``
    always returns 0 because kernbench runs as one Python process;
    ``get_world_size()`` returns the CCL group size (number of PEs
    participating in the collective).
    """
    def __init__(self) -> None:
        self._backend: AhbmCCLBackend | None = None
        # ADR-0024 D9: greenlet-local rank registry. Bench launcher calls
        # _bind_rank(g, rank) when spawning workers; get_rank() resolves the
        # current greenlet to its rank. Unbound greenlets fall back to 0 for
        # single-driver test compat.
        self._rank_by_greenlet: dict = {}
    def init_process_group(
        self,
        backend: str = "ahbm",
        world_size: int | None = None,
        rank: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Create the default process group.
        ``world_size`` and ``rank`` are accepted for API parity with
        ``torch.distributed.init_process_group`` but ignored — the ahbm
        backend derives both from ``ccl.yaml`` + topology automatically
        (like reading ``RANK``/``WORLD_SIZE`` env vars in real DDP).
        """
        if backend != "ahbm":
            raise ValueError(
                f"Unsupported backend '{backend}'. Only 'ahbm' is supported."
            )
        ctx = getattr(self, "_ctx_ref", None)
        if ctx is None:
            raise RuntimeError(
                "DistributedContext not bound to a RuntimeContext"
            )
        self._backend = AhbmCCLBackend(torch_ctx=ctx)
        self._backend._dist_ctx = self
    def is_initialized(self) -> bool:
        return self._backend is not None
    def get_world_size(self) -> int:
        self._ensure_initialized()
        return self._backend.world_size
    def get_rank(self) -> int:
        """Return the rank bound to the current greenlet (default 0).
        ADR-0024 D9: workers spawned by the bench launcher each get a rank
        registered via ``_bind_rank``. Callers outside any bound greenlet
        fall back to rank 0 for single-driver test compat.
        """
        self._ensure_initialized()
        from greenlet import getcurrent
        g = getcurrent()
        return int(self._rank_by_greenlet.get(g, 0))
    def _bind_rank(self, g: Any, rank: int) -> None:
        """Bind a greenlet to a rank so ``get_rank()`` returns it (ADR-0024 D9)."""
        self._rank_by_greenlet[g] = int(rank)
    def get_backend(self) -> str:
        self._ensure_initialized()
        return "ahbm"
    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
        self._ensure_initialized()
        self._backend.all_reduce(tensor, op=op)
    def barrier(self) -> None:
        self._ensure_initialized()
        self._backend.barrier()
    def _ensure_initialized(self) -> None:
        if self._backend is None:
            raise RuntimeError(
                "Default process group has not been initialized. "
                "Call init_process_group(backend='ahbm') first."
            )
@@ -152,3 +152,30 @@ class MmuUnmapMsg:
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    msg_type: Literal["mmu_unmap"] = "mmu_unmap"
@dataclass(frozen=True)
 class IpcqInitMsg:
    """IPCQ neighbor table install (sideband fan-out, ADR-0023 D10/D12).

    Backend issues this at ``init_process_group`` time to install per-PE
    IPCQ neighbor tables. Each entry covers one direction (N/S/E/W) and
    carries the peer's IpcqEndpoint plus this PE's own rx_buffer base
    and a pre-wired SimPy Store for credit return fast path (D9).

    Routing is similar to MmuMapMsg.
    """
    # Identifiers without defaults; both must be supplied by the sender.
    correlation_id: str
    request_id: str
    # Routing selectors; each defaults to "all".
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | tuple[int, ...] | Literal["all"] = "all"
    # entries: tuple[IpcqInitEntry, ...] — kept as tuple of plain objects to
    # avoid a runtime import cycle (IpcqInitEntry lives in
    # kernbench.common.ipcq_types).
    entries: tuple = ()
    backpressure_mode: str = "sleep"  # "poll" | "sleep"
    buffer_kind: str = "tcm"          # "tcm" | "hbm" | "sram"
    # NOTE(review): presumably the size in bytes of one credit message — confirm.
    credit_size_bytes: int = 16
    # Discriminator tag; fixed to "ipcq_init" for this message class.
    msg_type: Literal["ipcq_init"] = "ipcq_init"
@@ -0,0 +1,152 @@
 """``torch.multiprocessing.spawn``-compatible namespace (ADR-0027 D1).
 Real-PyTorch API *signature* parity only — execution model is a cooperative
 greenlet scheduler in a single Python process (D1.0). Non-goals: process
 isolation, independent address space, failure isolation, OS-level scheduler
 fairness, mp.Queue/Lock.
 Attached to ``RuntimeContext`` as ``ctx.multiprocessing`` in
 ``__post_init__`` (D1.3).
 """
 from __future__ import annotations
 from typing import Any, Callable
 class SpawnException(RuntimeError):
    """Raised from ``_MultiprocessingNamespace.spawn`` on worker failure.

    ``errors`` maps rank → exception and contains only root-cause ranks —
    the rank(s) whose body raised. Sibling greenlets terminated via
    ``throw(SystemExit)`` during cleanup are NOT recorded (SystemExit does
    not satisfy ``except Exception`` in the entry wrapper).
    """

    def __init__(self, errors: dict[int, Exception]):
        self.errors = errors
        # Headline lists every failed rank in sorted order; the detail
        # suffix names the first entry in the dict's own iteration order.
        msg = f"spawn failed on ranks {sorted(errors.keys())}"
        first = next(iter(errors.items()), None)
        if first is not None:
            msg += f": rank {first[0]} raised {first[1]!r}"
        super().__init__(msg)
 def _drain_pending(ctx: Any) -> None:
    """Drain worker-wait + collective-pending queues in main context (D0.4/D0.5).
    Loop-until-empty: runs until both queues are simultaneously empty. Safe
    under the current model where main-context ``ctx.wait`` never re-enqueues
    (D0.5 main-context non-reentrance invariant); also safe under future
    extensions where drain can add sub-handles (SimPy causality gives finite
    depth).
    """
    distributed = getattr(ctx, "distributed", None)
    backend = getattr(distributed, "_backend", None) if distributed else None
    def _collective_nonempty() -> bool:
        if backend is None:
            return False
        pending = getattr(backend, "_pending_collective_handles", None)
        return bool(pending)
    while ctx._pending_worker_waits or _collective_nonempty():
        # (a) Worker-driven waits (D0.1). FIFO.
        while ctx._pending_worker_waits:
            h = ctx._pending_worker_waits.pop(0)
            if h not in ctx._completed:
                wait_fn = getattr(ctx.engine, "wait", None)
                if wait_fn is not None:
                    wait_fn(h)
                # Populate _completed so fast-path in ctx.wait short-circuits
                # on the return leg.
                ctx._completed.add(h)
        # (b) Collective backend queue (ADR-0024 D7 + D0.4-(2)).
        if backend is not None:
            pending_list = getattr(backend, "_pending_collective_handles", None)
            if pending_list is not None:
                while pending_list:
                    h, _sip_id, meta = pending_list.pop(0)
                    # Main context: ctx.wait drives engine directly and does
                    # NOT re-enqueue (D0.5 invariant).
                    ctx.wait(h, _meta=meta)
 class _MultiprocessingNamespace:
    """torch.multiprocessing-compat facade bound to a RuntimeContext."""
    def __init__(self, ctx: Any) -> None:
        self._ctx = ctx
    def spawn(
        self,
        fn: Callable,
        args: tuple = (),
        nprocs: int = 1,
        join: bool = True,
    ) -> None:
        """Spawn ``nprocs`` worker greenlets, each calling ``fn(rank, *args)``.
        Mirrors ``torch.multiprocessing.spawn`` signature (minus ``daemon``).
        Runs the D0.4 round-robin scheduler loop until all workers finish,
        draining pending queues between rounds.
        """
        from greenlet import greenlet
        ctx = self._ctx
        dist = ctx.distributed
        gs: list = []
        errors: dict[int, Exception] = {}
        for rank in range(nprocs):
            def _entry(r: int = rank) -> None:
                try:
                    fn(r, *args)
                except Exception as e:
                    errors[r] = e
                    raise
            g = greenlet(_entry)
            if dist is not None and hasattr(dist, "_bind_rank"):
                dist._bind_rank(g, rank)
            gs.append(g)
        try:
            while True:
                alive = [g for g in gs if not g.dead]
                if not alive:
                    break
                for g in alive:
                    if not g.dead:
                        g.switch()
                _drain_pending(ctx)
        except Exception as outer:
            # D0.4-(4) sibling cleanup. Abort live greenlets, clear state.
            for other in gs:
                if not other.dead:
                    try:
                        other.throw(SystemExit)
                    except BaseException:
                        # SystemExit inherits BaseException; greenlet.throw
                        # re-raises in caller if target doesn't catch it.
                        # Silent — we're already in cleanup.
                        pass
            backend = getattr(dist, "_backend", None)
            if backend is not None:
                if hasattr(backend, "_barrier") and hasattr(backend._barrier, "reset"):
                    try:
                        backend._barrier.reset()
                    except Exception:
                        pass
                pending_collective = getattr(
                    backend, "_pending_collective_handles", None,
                )
                if pending_collective is not None:
                    pending_collective.clear()
            ctx._pending_worker_waits.clear()
            raise SpawnException(errors) from outer
        # join=True: we already waited for all workers above.
@@ -5,6 +5,8 @@ import weakref
 from dataclasses import dataclass
 from typing import Literal
 import numpy as np
 from kernbench.policy.address.allocator import PEMemAllocator
 from kernbench.policy.placement.dp import DPPolicy, ShardSpec
 from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
@@ -50,13 +52,78 @@ def dtype_itemsize(dtype: str) -> int:
    return _DTYPE_ITEMSIZE[dtype]
 _NUMPY_DTYPE = {
    "f16": np.float16, "fp16": np.float16, "float16": np.float16,
    "f32": np.float32, "fp32": np.float32, "float32": np.float32,
    "bf16": np.float16,
    "i8": np.int8, "int8": np.int8,
    "i16": np.int16, "int16": np.int16,
    "i32": np.int32, "int32": np.int32,
 }
 def _numpy_dtype(dtype: str) -> np.dtype:
    return np.dtype(_NUMPY_DTYPE.get(dtype, np.float16))
 # ADR-0027 T5.g: closed-set registry of host-read barrier entry-points.
 # Any new Tensor API with host-observable read semantics must be added here
 # AND implement the barrier call. Code review + this registry keep the set
 # consistent (Python introspection-based auto-detection is a non-goal).
 # Note on ``copy_``: the source read is barriered via ``source.numpy()``.
 # A target-side write barrier was specified in an earlier revision of
 # ADR-0027 D0.5 but is intentionally not applied (global-pending target
 # barrier can prematurely drain cross-rank collectives → deadlock).
 _HOST_READ_BARRIERS: frozenset[str] = frozenset({
    "numpy",  # calls _host_read_barrier before gathering shards
    "data",  # property; delegates to numpy()
    "__getitem__",  # calls _host_read_barrier before resolving the index
    "__repr__",  # barriered only on the data-containing repr path
    "copy_",  # source-side via source.numpy(); target-side not barriered
 })
 def _host_read_barrier(tensor: "Tensor") -> None:
    """ADR-0027 D0.5: drain pending worker-wait queue before a host-observable
    read/write.
    Scope: the barrier yields to main when ``ctx._pending_worker_waits`` is
    non-empty AND the caller is a worker greenlet. Collective pending
    (``backend._pending_collective_handles``) is **deliberately excluded**
    from this check — collective handles represent cross-rank protocol that
    must be drained only at scheduler synchronisation points (all workers
    yielded). A collective's own yield (inside ``all_reduce``) already
    ensures that once the collective call returns to the worker, post-drain
    values are visible, so subsequent host reads see materialised data
    without needing to trigger drain themselves. Including collective
    pending here would cause an unrelated rank's barrier to prematurely
    request drain of a cross-rank operation → deadlock.
    No-op when called from main context or when the worker-wait queue is
    empty (fast-path avoids needless context switches).
    """
    ctx = None
    if tensor._ctx_ref is not None:
        ctx = tensor._ctx_ref()
    if ctx is None:
        return
    worker_pending = getattr(ctx, "_pending_worker_waits", None)
    if not worker_pending:
        return  # fast-path
    from greenlet import getcurrent
    g = getcurrent()
    if g.parent is None or g.parent.dead:
        return  # main context: caller drains directly when needed
    g.parent.switch()
 def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
-    allocators: dict[int, PEMemAllocator],
+    allocators: dict[tuple[int, int, int], PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
    va_allocator=None,
 ) -> TensorHandle:
@@ -70,15 +137,15 @@ def deploy_tensor(
    shards: list[TensorShard] = []
    for spec in placement:
-        alloc = allocators[spec.pe_index]
+        alloc = allocators[(spec.sip, spec.cube, spec.pe)]
        if mem_kind == "hbm":
            pa = alloc.alloc_hbm(spec.nbytes)
        else:
            pa = alloc.alloc_tcm(spec.nbytes)
        shards.append(TensorShard(
-            sip=alloc._sip_id,
+            sip=spec.sip,
-            cube=alloc._cube_id,
+            cube=spec.cube,
-            pe=alloc._pe_id,
+            pe=spec.pe,
            pa=pa.encode(),
            nbytes=spec.nbytes,
            offset_bytes=spec.offset_bytes,
@@ -129,6 +196,12 @@ class Tensor:
        self._dp_metadata: DPMetadata | None = None
        self._handle: TensorHandle | None = None
        self._ctx_ref: weakref.ref | None = None  # set by RuntimeContext
        self._memory_store = None  # set by RuntimeContext when enable_data=True
        # Host-side staging buffer for torch.from_numpy() results. A tensor
        # with a non-None _host_buffer is NOT deployed to any PE — it lives
        # only on the host. Use `target.copy_(host_tensor)` to scatter the
        # data into a deployed, sharded target tensor.
        self._host_buffer: np.ndarray | None = None
    def __del__(self) -> None:
        if self._ctx_ref is None or self._handle is None:
@@ -137,6 +210,222 @@ class Tensor:
        if ctx is not None:
            ctx._free_tensor(self)
    # ── Indexing (shard-aligned slices) ────────────────────────────
    def _resolve_shard_index(self, key) -> tuple[int, int | None]:
        """Map a numpy-style index key to (flat_start_elem, flat_stop_elem).
        Only shard-aligned slices on the last dimension are supported.
        Returns (start, stop) in element units from the flat layout, or
        raises IndexError / NotImplementedError for unsupported keys.
        """
        if self._handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed")
        ndim = len(self.shape)
        if not isinstance(key, tuple):
            key = (key,)
        if len(key) != ndim:
            raise IndexError(
                f"expected {ndim} indices, got {len(key)}"
            )
        # All leading dims must be int (selecting a single row/plane).
        for i, k in enumerate(key[:-1]):
            if not isinstance(k, int):
                raise NotImplementedError(
                    "only integer indices are supported for leading dims"
                )
        last = key[-1]
        total_elems = math.prod(self.shape)
        if isinstance(last, int):
            # Single element
            return (last, last + 1)
        if isinstance(last, slice):
            start, stop, step = last.indices(self.shape[-1])
            if step != 1:
                raise NotImplementedError("step != 1 not supported")
            return (start, stop)
        raise NotImplementedError(f"unsupported index type: {type(last)}")
    def _shard_for_range(self, start_elem: int, stop_elem: int) -> TensorShard:
        """Return the single shard that fully covers [start_elem, stop_elem).
        Raises NotImplementedError if the range spans multiple shards.
        """
        isize = self.itemsize
        start_byte = start_elem * isize
        stop_byte = stop_elem * isize
        for shard in self._handle.shards:
            s_start = shard.offset_bytes
            s_end = shard.offset_bytes + shard.nbytes
            if start_byte >= s_start and stop_byte <= s_end:
                return shard
        raise NotImplementedError(
            f"slice [{start_elem}:{stop_elem}] spans multiple shards "
            f"(only shard-aligned slices are supported)"
        )
    def __getitem__(self, key):
        """Read a shard-aligned slice and return it as a flat numpy array.

        Mirrors ``torch.Tensor.__getitem__`` for the shard-aligned case.
        Returns zeros of the requested length when no memory store is
        attached or when the store holds no data for the owning shard.
        ADR-0027 D0.5: host-read barrier.
        """
        _host_read_barrier(self)
        first, last = self._resolve_shard_index(key)
        owner = self._shard_for_range(first, last)
        n_elems = last - first
        if self._memory_store is None:
            return np.zeros(n_elems, dtype=_numpy_dtype(self.dtype))
        item_bytes = self.itemsize
        # Position of the requested range inside the owning shard, in elements.
        offset_in_shard = (first * item_bytes - owner.offset_bytes) // item_bytes
        try:
            stored = self._memory_store.read(
                "hbm", self._shard_store_addr(owner),
            )
        except KeyError:
            return np.zeros(n_elems, dtype=_numpy_dtype(self.dtype))
        elems = np.asarray(stored, dtype=_numpy_dtype(self.dtype)).reshape(-1)
        return elems[offset_in_shard : offset_in_shard + n_elems]
    def __setitem__(self, key, value):
        """Write a shard-aligned slice.
        Mirrors ``torch.Tensor.__setitem__``. Scalar broadcast and
        numpy array assignment are both supported.
        """
        if self._handle is None or self._memory_store is None:
            raise RuntimeError(
                f"Tensor '{self.name}' must be deployed before assignment"
            )
        start, stop = self._resolve_shard_index(key)
        shard = self._shard_for_range(start, stop)
        np_dtype = _numpy_dtype(self.dtype)
        isize = self.itemsize
        local_start = (start * isize - shard.offset_bytes) // isize
        local_count = stop - start
        shard_elems = shard.nbytes // isize
        addr = self._shard_store_addr(shard)
        # Read current shard data (or zeros if uninitialized)
        try:
            arr = self._memory_store.read("hbm", addr)
            arr = np.array(arr, dtype=np_dtype).reshape(-1).copy()
        except KeyError:
            arr = np.zeros(shard_elems, dtype=np_dtype)
        # Write the slice
        if isinstance(value, (int, float)):
            arr[local_start : local_start + local_count] = np_dtype.type(value)
        else:
            v = np.asarray(value, dtype=np_dtype).reshape(-1)
            arr[local_start : local_start + local_count] = v[:local_count]
        self._memory_store.write("hbm", addr, arr)
    def __repr__(self) -> str:
        parts = [f"tensor(name={self.name}, shape={self.shape}, dtype={self.dtype}"]
        if self._memory_store is not None and self._handle is not None:
            # ADR-0027 D0.5: barrier on data-containing repr path.
            _host_read_barrier(self)
            arr = self.data
            parts.append(f", mean={float(arr.mean()):.4g}, norm={float(np.linalg.norm(arr)):.4g}")
        else:
            parts.append(", data=N/A (placeholder)")
        parts.append(")")
        return "".join(parts)
    @property
    def data(self) -> np.ndarray:
        """Tensor data as numpy array.
        Gathers all shards into a single full-shape array. Returns actual
        values when enable_data=True, zeros placeholder otherwise (like an
        uninitialized tensor). Alias of ``numpy()``.

        Because this simply delegates to ``numpy()``, every access performs
        the same gather (and the same host-read barrier) as calling
        ``numpy()`` directly; nothing is cached here.
        """
        return self.numpy()
    def _shard_store_addr(self, shard: TensorShard) -> int:
        """MemoryStore key for a shard.
        Kernels read tensors via VA (translated to PA by PE_DMA's MMU when
        a mapping exists, otherwise the addr is treated as a PA-equivalent
        key). Tensor I/O therefore writes/reads at ``va_base + offset_bytes``
        when ``va_base`` is set, falling back to ``shard.pa`` for the
        VA-less mode used by some legacy paths.
        """
        if self._handle and self._handle.va_base:
            return self._handle.va_base + shard.offset_bytes
        return shard.pa
    def numpy(self) -> np.ndarray:
        """Gather all shards into one full-shape numpy array.

        Mirrors ``torch.Tensor.numpy()``. Each shard is placed into a flat
        buffer at ``offset_bytes // itemsize`` and the buffer is reshaped to
        ``self.shape``. Shards with no data in the store are left as zeros.
        A host-side tensor returns a copy of its staging buffer; a tensor
        without a handle or memory store returns zeros.

        ADR-0027 D0.5: acts as a host-read barrier — ``_host_read_barrier``
        runs before anything is read.
        """
        _host_read_barrier(self)
        target_dtype = _numpy_dtype(self.dtype)
        # Host-side tensor (created via torch.from_numpy) has no shards.
        host = self._host_buffer
        if host is not None:
            return host.copy()
        if self._handle is None or self._memory_store is None:
            return np.zeros(self.shape, dtype=target_dtype)
        gathered = np.zeros(math.prod(self.shape), dtype=target_dtype)
        for shard in self._handle.shards:
            first = shard.offset_bytes // self.itemsize
            n_elems = shard.nbytes // self.itemsize
            try:
                raw = self._memory_store.read(
                    "hbm", self._shard_store_addr(shard),
                )
            except KeyError:
                # Nothing written for this shard yet: keep the zero fill.
                continue
            shard_values = np.asarray(raw, dtype=target_dtype).reshape(-1)
            gathered[first : first + n_elems] = shard_values[:n_elems]
        return gathered.reshape(self.shape)
    def copy_(self, source: "Tensor") -> "Tensor":
        """Copy ``source`` into self in place and return self.

        Mirrors ``torch.Tensor.copy_()``. ``source.numpy()`` supplies the
        full array (a host tensor returns its staging buffer, a deployed
        tensor is gathered first); it is cast to self's dtype, flattened,
        and split across self's shards by each shard's byte range.

        ADR-0027 D0.5: the source-side read barrier is triggered inside
        ``source.numpy()``. No target-side write barrier is applied because
        it would require cross-rank coordination when other ranks have
        pending collectives.

        Raises:
            RuntimeError: self has no handle or no memory store.
            ValueError: ``source.shape`` differs from ``self.shape``.
        """
        if self._handle is None or self._memory_store is None:
            raise RuntimeError(
                f"Tensor '{self.name}' must be deployed before copy_()"
            )
        if source.shape != self.shape:
            raise ValueError(
                f"copy_ shape mismatch: self={self.shape} source={source.shape}"
            )
        target_dtype = _numpy_dtype(self.dtype)
        values = np.ascontiguousarray(
            source.numpy().astype(target_dtype, copy=False)
        ).reshape(-1)
        for shard in self._handle.shards:
            first = shard.offset_bytes // self.itemsize
            n_elems = shard.nbytes // self.itemsize
            # Each shard gets its own copy so stored arrays do not alias
            # the gathered source buffer.
            self._memory_store.write(
                "hbm",
                self._shard_store_addr(shard),
                values[first : first + n_elems].copy(),
            )
        return self
    @property
    def itemsize(self) -> int:
        """Per-element size for ``self.dtype``, as returned by ``dtype_itemsize``.

        Used throughout this class to convert between element indices and
        shard byte offsets (``offset_bytes`` / ``nbytes``).
        """
        return dtype_itemsize(self.dtype)
@@ -170,7 +459,8 @@ class Tensor:
    ) -> Tensor:
        """Set DP placement metadata (like torch.Tensor.to())."""
        if placement is None:
-            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)]
+            placement = [ShardSpec(sip=0, cube=0, pe=0,
                                   offset_bytes=0, nbytes=self.nbytes)]
        self._dp_metadata = DPMetadata(
            placement=placement, dp_policy=dp_policy,
            sip=sip, cube=cube, target_pe=target_pe,
@@ -12,6 +12,7 @@ class BenchResult:
    correlation_id: str
    trace: Trace | None = None
    traces: list[dict] | None = None
    engine: object | None = None  # GraphEngine ref for Phase 2 data access
    def summary_text(self) -> str:
        if self.completion.ok:
@@ -0,0 +1,249 @@
 """DataExecutor: Phase 2 op_log-based data execution (ADR-0020 D6).
 Executes GEMM/Math operations from the op_log using numpy.
 dma_read ops are skipped (already handled in Phase 1 via MemoryStore);
 dma_write and ipcq_copy are replayed as copies between MemoryStore addresses.
 Same-timestamp independent ops can be batched for efficiency.
 """
 from __future__ import annotations
 from typing import Any
 import numpy as np
 from kernbench.sim_engine.memory_store import MemoryStore, _resolve_dtype
 from kernbench.sim_engine.op_log import OpRecord
 class DataExecutor:
    """Phase 2 executor: replay op_log with actual numpy computation.

    Args:
        op_log: list of OpRecords from Phase 1.
        store: MemoryStore snapshot from Phase 1 (contains tensor data).
    """
    def __init__(self, op_log: list[OpRecord], store: MemoryStore) -> None:
        self._op_log = op_log
        self.store = store
    # Ordering priority within the same t_start: memory copies must run
    # before math/gemm so that slot data is populated before a consumer
    # PE's math op reads it. With 0-ns PE_MATH overhead and tight SimPy
    # scheduling, ipcq_copy and math ops from different PEs can collide
    # at the exact same t_start.
    _KIND_ORDER = {"memory": 0, "gemm": 1, "math": 2, "unknown": 3}
    def run(self) -> None:
        """Execute all ops in op_log order.

        Primary sort: t_start (ascending).
        Secondary sort: op_kind priority — memory (ipcq_copy/dma_write)
        before gemm before math. This ensures IPCQ slot data arrives
        before a consumer PE's math op tries to read it, even when both
        share the same SimPy timestamp. ``dma_read`` memory records are
        not dispatched at all.
        """
        ops = sorted(
            self._op_log,
            key=lambda r: (r.t_start, self._KIND_ORDER.get(r.op_kind, 3)),
        )
        for op in ops:
            if op.op_kind == "memory" and op.op_name == "dma_read":
                continue
            self._execute_op(op)
    def _execute_op(self, op: OpRecord) -> None:
        """Dispatch one record by ``op_kind``; unknown kinds are ignored."""
        if op.op_kind == "memory":
            self._execute_memory(op)
        elif op.op_kind == "gemm":
            self._execute_gemm(op)
        elif op.op_kind == "math":
            self._execute_math(op)
    def _execute_memory(self, op: OpRecord) -> None:
        """Replay memory copy ops in Phase 2 (ADR-0020 + ADR-0023).

        - dma_read: no-op (handle already references HBM source).
        - dma_write: copy (src_space, src_addr) → (dst_space, dst_addr).
          Required because Phase 2 may have just produced new data at the
          source addr (e.g. PE_MATH scratch output).
        - ipcq_copy: copy across PEs — sender's source → receiver's slot.
          Required because the source may be a Phase 2 math output, and
          a downstream math op on the receiver reads from the slot.

        Legacy entries without src/dst metadata are silently skipped, as is
        a copy whose source has neither a snapshot nor data in the store.
        """
        p = op.params
        if op.op_name not in ("dma_write", "ipcq_copy"):
            return
        src_space = p.get("src_space")
        src_addr = p.get("src_addr")
        dst_space = p.get("dst_space")
        dst_addr = p.get("dst_addr")
        if (src_space is None or src_addr is None
                or dst_space is None or dst_addr is None):
            return
        # Prefer the Phase-1-time snapshot (captured at record_end /
        # outbound) so we don't read from a source that has since been
        # mutated by another op. Fall back to MemoryStore for sources
        # that had no Phase 1 data (e.g. math scratch outputs that
        # only get populated by Phase 2's math replay).
        data = p.get("snapshot")
        if data is None:
            try:
                data = self.store.read(
                    src_space, src_addr,
                    shape=p.get("shape"), dtype=p.get("dtype"),
                )
            except KeyError:
                return
        self.store.write(dst_space, dst_addr, data)
    def _execute_gemm(self, op: OpRecord) -> None:
        """Execute GEMM: out = a @ b.

        Operands are read from the store, cast to ``dtype_acc`` (default
        "f32") for the matmul, and the product is cast to ``dtype_out``
        before being written to ``dst_addr``. Records without
        ``src_a_addr`` are skipped.
        """
        p = op.params
        if "src_a_addr" not in p:
            return  # composite record without full params
        default_space = p.get("addr_space", "tcm")
        # ADR-0027: per-operand + output spaces (fall back to single space
        # for legacy records without explicit space keys).
        src_a_space = p.get("src_a_space", default_space)
        src_b_space = p.get("src_b_space", default_space)
        dst_space = p.get("dst_space", default_space)
        dtype_in = p.get("dtype_in", "f16")
        dtype_out = p.get("dtype_out", dtype_in)
        a = self.store.read(src_a_space, p["src_a_addr"],
                            shape=p.get("shape_a"), dtype=dtype_in)
        b = self.store.read(src_b_space, p["src_b_addr"],
                            shape=p.get("shape_b"), dtype=dtype_in)
        # Compute in higher precision if specified
        dtype_acc = p.get("dtype_acc", "f32")
        a_f = a.astype(_resolve_dtype(dtype_acc))
        b_f = b.astype(_resolve_dtype(dtype_acc))
        result = np.matmul(a_f, b_f).astype(_resolve_dtype(dtype_out))
        self.store.write(dst_space, p["dst_addr"], result)
    def _execute_math(self, op: OpRecord) -> None:
        """Execute math op: unary, binary, or reduction.

        Inputs come from per-input snapshots when present, otherwise from
        the store. The result of ``_compute_math`` is written to
        ``dst_addr`` unless it is ``None`` (no inputs / unrecognised op).
        """
        p = op.params
        math_op = p.get("op", op.op_name)
        dtype = p.get("dtype", "f32")
        input_addrs = p.get("input_addrs", [])
        n_inputs = len(input_addrs)
        # Pad shapes with None (= "use the stored shape") so a record with
        # no or too few ``input_shapes`` does not make the zip() below drop
        # inputs and silently skip the op.
        input_shapes = list(p.get("input_shapes") or [])
        input_shapes += [None] * (n_inputs - len(input_shapes))
        # Per-input space/dtype (ADR-0023 CCL accumulation): math ops can
        # mix inputs from different MemoryStore spaces (e.g. acc in "hbm",
        # recv slot in "tcm"). Fall back to legacy single-space mode when
        # the per-input lists are absent.
        input_spaces = p.get("input_spaces") or [p.get("addr_space", "tcm")] * n_inputs
        input_dtypes = p.get("input_dtypes") or [dtype] * n_inputs
        # Per-input data snapshots (ADR-0020 D6): captured at op_log
        # record time. Phase 1 has correct values for slot/HBM addrs at
        # that moment, which lets Phase 2 sidestep the slot-wraparound
        # races where a later round overwrites a slot before this op
        # runs in t_start order.
        snapshots = p.get("input_snapshots") or [None] * n_inputs
        dst_space = p.get("dst_space", p.get("addr_space", "tcm"))
        inputs = []
        for addr, shape, space, idtype, snap in zip(
            input_addrs, input_shapes, input_spaces, input_dtypes, snapshots
        ):
            if snap is not None:
                inputs.append(snap)
            else:
                inputs.append(self.store.read(space, addr, shape=shape, dtype=idtype))
        result = _compute_math(math_op, inputs, p.get("axis"))
        if result is not None:
            self.store.write(dst_space, p["dst_addr"], result)
    def verify(self, expected: dict[tuple[str, int], np.ndarray],
               rtol: float = 1e-3, atol: float = 1e-3) -> dict[str, bool]:
        """Compare MemoryStore contents against expected tensors.

        Integer tensors are compared exactly; everything else with
        ``np.allclose``. A missing address, or shapes that cannot be
        compared, count as a failed entry rather than raising.

        Args:
            expected: {(space, addr): expected_ndarray}
            rtol, atol: tolerance for floating-point comparison.
        Returns:
            {key_str: passed} dict, keyed as ``"<space>:0x<addr hex>"``.
        """
        results = {}
        for (space, addr), exp in expected.items():
            key = f"{space}:0x{addr:x}"
            try:
                actual = self.store.read(space, addr)
                if np.issubdtype(actual.dtype, np.integer):
                    passed = bool(np.array_equal(actual, exp))
                else:
                    passed = bool(np.allclose(actual, exp, rtol=rtol, atol=atol))
            except (KeyError, ValueError):
                # KeyError: nothing stored at this address.
                # ValueError: np.allclose could not broadcast the shapes.
                passed = False
            results[key] = passed
        return results
 def _compute_math(op: str, inputs: list[np.ndarray], axis: int | None) -> np.ndarray | None:
    """Execute a math operation on numpy arrays."""
    if not inputs:
        return None
    x = inputs[0]
    # Unary
    if op == "exp":
        return np.exp(x)
    if op == "log":
        return np.log(x)
    if op == "sqrt":
        return np.sqrt(x)
    if op == "abs":
        return np.abs(x)
    if op == "sigmoid":
        return 1.0 / (1.0 + np.exp(-x))
    if op == "cos":
        return np.cos(x)
    if op == "sin":
        return np.sin(x)
    # Reduction
    if op == "sum":
        return np.sum(x, axis=axis, keepdims=True)
    if op == "max":
        return np.max(x, axis=axis, keepdims=True)
    if op == "min":
        return np.min(x, axis=axis, keepdims=True)
    # Softmax (numerically stable)
    if op == "softmax":
        ax = axis if axis is not None else -1
        x_max = np.max(x, axis=ax, keepdims=True)
        e = np.exp(x - x_max)
        s = np.sum(e, axis=ax, keepdims=True)
        return e / s
    # Binary
    if len(inputs) >= 2:
        y = inputs[1]
        if op == "add":
            return x + y
        if op == "sub":
            return x - y
        if op == "mul":
            return x * y
        if op == "div":
            return x / y
        if op == "maximum":
            return np.maximum(x, y)
        if op == "minimum":
            return np.minimum(x, y)
    # Ternary
    if len(inputs) >= 3:
        if op == "where":
            return np.where(inputs[0], inputs[1], inputs[2])
        if op == "fma":
            return inputs[0] * inputs[1] + inputs[2]
        if op == "clamp":
            return np.minimum(np.maximum(inputs[0], inputs[1]), inputs[2])
    return None
@@ -31,6 +31,7 @@ class GraphEngine:
        graph: TopologyGraph,
        *,
        component_overrides: dict[str, type[ComponentBase]] | None = None,
        enable_data: bool = False,
    ) -> None:
        self._env = simpy.Environment()
        self._resolver = AddressResolver(graph)
@@ -44,6 +45,19 @@ class GraphEngine:
        self._events: dict[str, simpy.Event] = {}
        self._counter = 0
        overrides = component_overrides or {}
        # ADR-0020: optional data execution support
        self._op_logger = None
        self._memory_store = None
        if enable_data:
            from kernbench.sim_engine.memory_store import MemoryStore
            from kernbench.sim_engine.op_log import OpLogger
            self._memory_store = MemoryStore()
            self._op_logger = OpLogger(memory_store=self._memory_store)
        # Cursor for incremental Phase 2 replay (ADR-0020 D6).
        # SimPy env.now is monotonic so newly logged records always sort
        # to the tail; the cursor remains valid across waits.
        self._data_cursor = 0
        ctx = ComponentContext(
            router=self._router,
            resolver=self._resolver,
@@ -51,6 +65,8 @@ class GraphEngine:
            ns_per_mm=self._ns_per_mm,
            edge_map=self._edge_map,
            spec=graph.spec,
            memory_store=self._memory_store,
            op_logger=self._op_logger,
        )
        self._components: dict[str, ComponentBase] = {
            node_id: ComponentRegistry.create(node, overrides, ctx)
@@ -108,10 +124,25 @@ class GraphEngine:
                if mmu_comp is not None and hasattr(mmu_comp, "mmu"):
                    self._components[node_id]._mmu = mmu_comp.mmu
        # Inject op_logger into all components (ADR-0020 D2)
        if self._op_logger:
            for comp in self._components.values():
                comp._op_logger = self._op_logger
        # Start components after all ports are wired (ADR-0015 D3)
        for comp in self._components.values():
            comp.start(self._env)
    @property
    def op_log(self):
        """Phase 1 op log records (ADR-0020); an empty list without a logger."""
        logger = self._op_logger
        if not logger:
            return []
        return logger.records
    @property
    def memory_store(self):
        """MemoryStore from Phase 1 (ADR-0020).

        ``None`` unless the engine was constructed with ``enable_data=True``.
        """
        return self._memory_store
    def submit(self, request: Any) -> RequestHandle:
        self._counter += 1
        handle = RequestHandle(f"h{self._counter}")
@@ -120,11 +151,60 @@ class GraphEngine:
        self._env.process(self._process(str(handle), request, event))
        return handle
    def _flush_data_phase(self) -> None:
        """Replay newly recorded op_log entries through DataExecutor.
        ADR-0020 D6 Phase 2: when data tracking is enabled, run DataExecutor
        on records added since the last flush so that callers reading
        MemoryStore between launches observe correct (compute-replayed)
        tensor data.
        Cursor-based incremental replay is necessary because Phase 2 is
        NOT idempotent across full re-runs: a math op writes a TCM scratch
        addr, a later dma_write copies that scratch into HBM[X], and an
        even-later math op may then read HBM[X]. Re-running everything
        from scratch would let the second pass's first math op read the
        already-overwritten HBM[X] instead of the original input.
        """
        if self._op_logger is None or self._memory_store is None:
            return
        records = self._op_logger.records  # sorted by t_start (stable)
        if self._data_cursor >= len(records):
            return
        new_records = records[self._data_cursor:]
        from kernbench.sim_engine.data_executor import DataExecutor
        DataExecutor(new_records, self._memory_store).run()
        self._data_cursor = len(records)
    def wait(self, handle: RequestHandle) -> None:
        key = str(handle)
        event = self._events[key]
        if not event.triggered:
-            self._env.run(until=event)
+            try:
                self._env.run(until=event)
            except (simpy.core.EmptySchedule, RuntimeError) as exc:
                # SimPy raises EmptySchedule directly OR (in newer simpy)
                # wraps it as a RuntimeError("No scheduled events left ...").
                # Either case while our event is still pending → IPCQ deadlock.
                msg = str(exc)
                is_deadlock = (
                    isinstance(exc, simpy.core.EmptySchedule)
                    or "No scheduled events left" in msg
                )
                if not is_deadlock:
                    raise
                from kernbench.ccl.diagnostics import IpcqDeadlock, pointer_dump
                dump = pointer_dump(self)
                if dump.strip():
                    raise IpcqDeadlock(
                        "IPCQ deadlock: simulation schedule empty while "
                        f"request {handle!r} is still pending.\n"
                        f"Pointer state:\n{dump}"
                    ) from None
                raise
        # ADR-0020: replay newly logged ops so the caller observes
        # post-Phase-2 tensor state from MemoryStore.
        self._flush_data_phase()
    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the stored ``(Completion, Trace | None)`` result for ``handle``.

        The lookup key is ``str(handle)``.
        NOTE(review): this is a plain subscript on ``self._results`` —
        presumably raises KeyError when no result has been recorded for the
        handle yet; confirm against where ``_results`` is populated.
        """
        return self._results[str(handle)]
@@ -0,0 +1,84 @@
 """MemoryStore: tensor-granular storage for Phase 1 and Phase 2 (ADR-0020 D7).
 Logically byte-addressable, implemented as addr → numpy ndarray mapping.
 Read/write are reference-based (no copy) for Phase 1 performance.
 """
 from __future__ import annotations
 import numpy as np
 # numpy dtype string → numpy dtype mapping
 _DTYPE_MAP = {
    "f16": np.float16,
    "f32": np.float32,
    "f64": np.float64,
    "bf16": np.float16,  # numpy has no bfloat16; use float16 as proxy
    "i8": np.int8,
    "i16": np.int16,
    "i32": np.int32,
    "i64": np.int64,
    "u8": np.uint8,
    "u16": np.uint16,
    "u32": np.uint32,
 }
 def _resolve_dtype(dtype: str) -> np.dtype:
    if dtype in _DTYPE_MAP:
        return np.dtype(_DTYPE_MAP[dtype])
    return np.dtype(dtype)
 class MemoryStore:
    """Tensor-granular memory storage (ADR-0020 D7).

    Holds numpy ndarrays keyed by (space, addr). Writes store the given
    array object and reads hand back the stored object (or a view of it),
    so no data is copied. Writing to an existing addr replaces the whole
    tensor.
    """
    def __init__(self) -> None:
        # {space: {addr: ndarray}}
        self._storage: dict[str, dict[int, np.ndarray]] = {}
    def write(self, space: str, addr: int, data: np.ndarray) -> None:
        """Store ``data`` at (space, addr) by reference."""
        self._storage.setdefault(space, {})[addr] = data
    def read(self, space: str, addr: int, shape: tuple[int, ...] | None = None,
             dtype: str | None = None) -> np.ndarray:
        """Return the tensor stored at (space, addr) without copying.

        When ``dtype`` names a different dtype than the stored one, the
        array is reinterpreted with ``view``. When ``shape`` differs from
        the (possibly reinterpreted) array's shape, the array is reshaped,
        provided the byte counts agree.

        Raises:
            KeyError: nothing is stored at (space, addr).
            ValueError: ``shape`` does not match the stored byte count.
        """
        bucket = self._storage.get(space)
        if bucket is None or addr not in bucket:
            raise KeyError(f"No data at ({space}, 0x{addr:x})")
        arr = bucket[addr]
        if dtype is not None:
            wanted = _resolve_dtype(dtype)
            if arr.dtype != wanted:
                arr = arr.view(wanted)
        if shape is None or arr.shape == shape:
            return arr
        requested_bytes = np.prod(shape) * arr.dtype.itemsize
        if arr.nbytes != requested_bytes:
            raise ValueError(
                f"Shape mismatch: stored {arr.shape} ({arr.nbytes}B) "
                f"vs requested {shape} ({requested_bytes}B)"
            )
        return arr.reshape(shape)
    def has(self, space: str, addr: int) -> bool:
        """Report whether anything is stored at (space, addr)."""
        bucket = self._storage.get(space)
        return bucket is not None and addr in bucket
    def snapshot(self) -> MemoryStore:
        """Create a shallow copy for Phase 2 initialization.

        The per-space addr→ndarray maps are copied; the ndarrays themselves
        are shared with this store.
        """
        clone = MemoryStore()
        clone._storage = {
            space: dict(addrs) for space, addrs in self._storage.items()
        }
        return clone
@@ -0,0 +1,222 @@
 """Op log infrastructure for 2-pass data execution (ADR-0020 D2, D5).
 OpRecord: single data operation with timing, params, and dependencies.
 OpLogger: collects OpRecords from ComponentBase hooks during Phase 1.
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Any
@dataclass
 class OpRecord:
    """Single data operation record (ADR-0020 D5).

    One instance is appended by ``OpLogger`` for every completed data op
    (``record_end``) and for every explicit copy (``record_copy``).
    """
    # Timestamp passed to OpLogger when the op started.
    t_start: float
    # Timestamp passed to OpLogger when the op finished.
    t_end: float
    # Identifier of the component that processed the op.
    component_id: str
    op_kind: str               # "memory" | "gemm" | "math"
    op_name: str               # e.g. "dma_read", "gemm_f16", "exp"
    # Op-specific payload (addresses, shapes, dtypes, optional snapshots).
    params: dict[str, Any]
    # Defaults to a fresh empty list per record; OpLogger, as visible in this
    # file, never fills it in.
    dependency_ids: list[int] = field(default_factory=list)
 class OpLogger:
    """Accumulate OpRecords while Phase 1 simulation runs (ADR-0020 D2).

    No locking is performed: thread safety is not required because SimPy
    is single-threaded. Records are appended as ops finish and are sorted
    by ``t_start`` (stable sort, so ties keep insertion order) each time
    ``records`` is read.
    """
    def __init__(self, memory_store: Any | None = None) -> None:
        """Start with no records; ``memory_store`` enables data snapshots."""
        # In-flight ops keyed by id(msg); each entry also holds msg itself.
        self._pending: dict[int, dict[str, Any]] = {}
        # Completed ops, in the order they finished.
        self._records: list[OpRecord] = []
        # Optional MemoryStore reference. When set, math op records capture
        # input data snapshots at record_end time so Phase 2 replay does
        # not depend on slot/scratch addrs surviving until math runs.
        self._memory_store = memory_store
    @property
    def records(self) -> list[OpRecord]:
        """Records sorted by t_start (stable ordering per ADR-0020 D5).

        The internal list is sorted in place and returned directly (not a copy).
        """
        self._records.sort(key=lambda rec: rec.t_start)
        return self._records
    def _snapshot_or_none(self, space: Any, addr: Any, shape: Any, dtype: Any) -> Any:
        """Best-effort copy of the tensor at ``(space, addr)``.

        Returns None when the read (or the copy) raises; objects without a
        ``copy`` attribute are returned uncopied.
        """
        try:
            tensor = self._memory_store.read(
                space, addr, shape=shape, dtype=dtype,
            )
            return tensor.copy() if hasattr(tensor, "copy") else tensor
        except Exception:
            return None
    def record_start(self, t: float, component_id: str, msg: Any) -> None:
        """Called by ComponentBase._on_process_start.

        Remembers the start time and component for ``msg`` until the
        matching ``record_end`` arrives.
        """
        self._pending[id(msg)] = dict(
            t_start=t, component_id=component_id, msg=msg,
        )
    def record_end(self, t: float, component_id: str, msg: Any) -> None:
        """Called by ComponentBase._on_process_end.

        Completes the pending entry for ``msg`` into an OpRecord. A ``msg``
        with no pending entry is ignored. The component id stored in the
        record is the one given to ``record_start``.
        """
        entry = self._pending.pop(id(msg), None)
        if entry is None:
            return
        op_kind, op_name, params = _extract_op_info(msg)
        # Snapshot data at record time so Phase 2 replay sidesteps
        # downstream mutations of source addrs (e.g. a tl.store that
        # overwrites HBM after a load handle was sent, or a slot that
        # gets reused on the next ring round).
        if self._memory_store is not None:
            if op_kind == "math":
                params["input_snapshots"] = [
                    self._snapshot_or_none(space, addr, shape, idtype)
                    for addr, shape, space, idtype in zip(
                        params.get("input_addrs", []),
                        params.get("input_shapes", []),
                        params.get("input_spaces", []),
                        params.get("input_dtypes", []),
                    )
                ]
            elif op_name == "dma_write" and params.get("src_space") == "hbm":
                # ADR-0027 fix: only snapshot HBM sources. TCM (PE scratch)
                # sources are repopulated by Phase 2 math/gemm replay —
                # capturing a Phase-1-time snapshot here would pick up stale
                # data from a PRIOR kernel's Phase 2 output that aliased the
                # same scratch address, causing the later kernel's replay
                # to write that stale value instead of the fresh math
                # result. See ADR-0027 postmortem (TP gemm → all_reduce).
                params["snapshot"] = self._snapshot_or_none(
                    params["src_space"], params["src_addr"],
                    params.get("shape"), params.get("dtype"),
                )
        finished = OpRecord(
            t_start=entry["t_start"],
            t_end=t,
            component_id=entry["component_id"],
            op_kind=op_kind,
            op_name=op_name,
            params=params,
        )
        self._records.append(finished)
    def record_copy(
        self, t_start: float, t_end: float, component_id: str,
        src_space: str, src_addr: int,
        dst_space: str, dst_addr: int,
        shape: tuple[int, ...], dtype: str, nbytes: int,
        snapshot: Any = None,
    ) -> None:
        """Record a memory copy op for Phase 2 replay (ADR-0023 + ADR-0020).

        ``snapshot``: if provided (e.g. token.data from in-flight DMA), it
        is stored as given. Otherwise, when a MemoryStore is attached, a
        fresh best-effort copy of the source tensor is taken. The snapshot
        is what Phase 2 writes into dst_addr, avoiding stale-source races
        from cross-PE mutations.
        """
        snap = snapshot
        if snap is None and self._memory_store is not None:
            snap = self._snapshot_or_none(src_space, src_addr, shape, dtype)
        copy_params = {
            "src_space": src_space, "src_addr": src_addr,
            "dst_space": dst_space, "dst_addr": dst_addr,
            "shape": shape, "dtype": dtype, "nbytes": nbytes,
            "snapshot": snap,
        }
        self._records.append(OpRecord(
            t_start=t_start, t_end=t_end,
            component_id=component_id,
            op_kind="memory", op_name="ipcq_copy",
            params=copy_params,
        ))
 def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
    """Translate a data_op message into ``(op_kind, op_name, params)``.

    Command types are checked in this order: DmaReadCmd, DmaWriteCmd,
    GemmCmd, MathCmd, CompositeCmd. Any other message yields
    ``("unknown", <class name>, {})``.
    """
    from kernbench.common.pe_commands import (
        DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd, CompositeCmd,
    )

    def space_of(handle: Any, fallback: str = "tcm") -> str:
        # Handles lacking a ``space`` attribute use the supplied fallback.
        return getattr(handle, "space", fallback)

    if isinstance(msg, DmaReadCmd):
        read_params = {
            "src_addr": msg.src_addr,
            "nbytes": msg.nbytes,
            "handle_id": msg.handle.id,
        }
        return "memory", "dma_read", read_params

    if isinstance(msg, DmaWriteCmd):
        src = msg.handle
        write_params = {
            "src_space": space_of(src),
            "src_addr": src.addr,
            "shape": src.shape,
            "dtype": src.dtype,
            "dst_space": "hbm",
            "dst_addr": msg.dst_addr,
            "nbytes": msg.nbytes,
            "handle_id": src.id,
        }
        return "memory", "dma_write", write_params

    if isinstance(msg, GemmCmd):
        lhs, rhs, result = msg.a, msg.b, msg.out
        gemm_params = {
            "src_a_addr": lhs.addr,
            "src_b_addr": rhs.addr,
            "dst_addr": result.addr,
            "shape_a": lhs.shape,
            "shape_b": rhs.shape,
            "shape_out": result.shape,
            "dtype_in": lhs.dtype,
            "dtype_out": result.dtype,
            "m": msg.m, "k": msg.k, "n": msg.n,
            # ADR-0027: preserve per-operand + output MemoryStore spaces so
            # Phase 2 replay can resolve HBM-resident operands (e.g. tl.load
            # results keep space="hbm"). Absent → DataExecutor falls back
            # to the legacy single-space mode via ``addr_space``.
            "src_a_space": space_of(lhs),
            "src_b_space": space_of(rhs),
            "dst_space": space_of(result),
        }
        return "gemm", f"gemm_{lhs.dtype}", gemm_params

    if isinstance(msg, MathCmd):
        operands = msg.inputs
        math_params = {
            "input_addrs": [h.addr for h in operands],
            "input_shapes": [h.shape for h in operands],
            "input_spaces": [space_of(h) for h in operands],
            "input_dtypes": [h.dtype for h in operands],
            "dst_addr": msg.out.addr,
            "dst_space": space_of(msg.out),
            "shape_out": msg.out.shape,
            "dtype": msg.out.dtype,
            "axis": msg.axis,
        }
        return "math", msg.op, math_params

    if isinstance(msg, CompositeCmd):
        is_gemm = msg.op == "gemm"
        params: dict[str, Any] = {
            "op": msg.op,
            "out_addr": msg.out_addr,
            "out_nbytes": msg.out_nbytes,
        }
        # ADR-0027: preserve operand info so Phase 2 DataExecutor can replay
        # the composite's numerical effect (treat it like a GemmCmd).
        if is_gemm and msg.a is not None and msg.b is not None:
            params.update({
                "src_a_addr": msg.a.addr,
                "src_b_addr": msg.b.addr,
                "shape_a": msg.a.shape,
                "shape_b": msg.b.shape,
                "dtype_in": msg.a.dtype,
                # Output dtype is reported as operand a's dtype here.
                "dtype_out": msg.a.dtype,
                "src_a_space": space_of(msg.a, "hbm"),
                "src_b_space": space_of(msg.b, "hbm"),
                "dst_space": "hbm",
                # dst_addr alias so DataExecutor._execute_gemm picks it up.
                "dst_addr": msg.out_addr,
            })
        kind = "gemm" if is_gemm else "math"
        return kind, f"composite_{msg.op}", params

    # Fallback for unknown data_op messages
    return "unknown", type(msg).__name__, {}
@@ -20,10 +20,12 @@ _PE_COMP_OFFSETS = {
    "pe_cpu": (-0.3, 0.0),
    "pe_scheduler": (-0.15, 0.0),
    "pe_dma": (0.0, -0.15),
    "pe_fetch_store": (0.15, 0.0),
    "pe_gemm": (0.0, 0.0),
    "pe_math": (0.0, 0.15),
    "pe_mmu": (0.15, -0.15),
    "pe_tcm": (0.3, 0.0),
    "pe_ipcq": (-0.15, 0.15),
 }
@@ -276,7 +278,7 @@ def _instantiate_io_chiplets(
        for phy in inst["ucie"]["phys"]:
            phy_id = f"{prefix}.ucie-{phy}"
            nodes[phy_id] = Node(
-                id=phy_id, kind="io_ucie", impl="ucie_v1",
+                id=phy_id, kind="io_ucie", impl="builtin.ucie",
                attrs={"overhead_ns": io_ucie_ns},
                pos_mm=(cx, noc_y), label=f"IO UCIe-{phy}",
            )
@@ -284,7 +286,7 @@ def _instantiate_io_chiplets(
            for ci in range(io_n_conn):
                conn_id = f"{phy_id}.conn{ci}"
                nodes[conn_id] = Node(
-                    id=conn_id, kind="io_ucie_conn", impl="ucie_v1",
+                    id=conn_id, kind="io_ucie_conn", impl="builtin.ucie",
                    attrs={"overhead_ns": 0.0},
                    pos_mm=(cx, noc_y), label=f"IO UCIe-{phy} C{ci}",
                )
@@ -378,14 +380,14 @@ def _instantiate_cube(
        pid = f"{cp}.ucie-{port}"
        lx, ly = local_pos[f"ucie-{port}"]
        nodes[pid] = Node(
-            id=pid, kind="ucie_port", impl="ucie_v1",
+            id=pid, kind="ucie_port", impl="builtin.ucie",
            attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
            label=f"UCIe-{port}",
        )
        for ci in range(ucie_n_conn):
            conn_id = f"{cp}.ucie-{port}.conn{ci}"
            nodes[conn_id] = Node(
-                id=conn_id, kind="ucie_conn", impl="ucie_v1",
+                id=conn_id, kind="ucie_conn", impl="builtin.ucie",
                attrs={"overhead_ns": 0.0},
                pos_mm=(ox + lx, oy + ly),
                label=f"UCIe-{port} C{ci}",
@@ -637,12 +639,13 @@ def _instantiate_cube(
 def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
-    """Add PE-internal edges for a single PE instance."""
+    """Add PE-internal edges for a single PE instance (ADR-0021)."""
    edges.append(Edge(
        src=f"{pp}.pe_cpu", dst=f"{pp}.pe_scheduler",
        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
        kind="pe_internal",
    ))
    # Scheduler → engines (initial dispatch)
    for eng, key in [("pe_dma", "scheduler_to_dma_mm"),
                     ("pe_gemm", "scheduler_to_gemm_mm"),
                     ("pe_math", "scheduler_to_math_mm")]:
@@ -651,6 +654,15 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
            distance_mm=pe_links[key],
            kind="pe_internal",
        ))
    # Scheduler → fetch_store (initial dispatch)
    if "scheduler_to_fetch_store_mm" in pe_links:
        edges.append(Edge(
            src=f"{pp}.pe_scheduler", dst=f"{pp}.pe_fetch_store",
            distance_mm=pe_links["scheduler_to_fetch_store_mm"],
            kind="pe_internal",
        ))
    # Engine → TCM (legacy BW edges)
    for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
                                 ("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
                                 ("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
@@ -661,6 +673,46 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
            kind="pe_internal",
        ))
    # Fetch/Store → TCM (ADR-0021 D5)
    if "fetch_store_to_tcm_mm" in pe_links:
        edges.append(Edge(
            src=f"{pp}.pe_fetch_store", dst=f"{pp}.pe_tcm",
            distance_mm=pe_links["fetch_store_to_tcm_mm"],
            bw_gbs=pe_links.get("fetch_store_to_tcm_bw_gbs", 512.0),
            kind="pe_internal",
        ))
    # Chaining edges (ADR-0021 D4 — token self-routing)
    chaining = [
        ("pe_dma", "pe_fetch_store", "dma_to_fetch_store_mm"),
        ("pe_fetch_store", "pe_gemm", "fetch_store_to_gemm_mm"),
        ("pe_fetch_store", "pe_math", "fetch_store_to_math_mm"),
        ("pe_gemm", "pe_fetch_store", "gemm_to_fetch_store_mm"),
        ("pe_math", "pe_fetch_store", "math_to_fetch_store_mm"),
        ("pe_fetch_store", "pe_dma", "fetch_store_to_dma_mm"),
    ]
    for src_eng, dst_eng, mm_key in chaining:
        if mm_key in pe_links:
            edges.append(Edge(
                src=f"{pp}.{src_eng}", dst=f"{pp}.{dst_eng}",
                distance_mm=pe_links[mm_key],
                kind="pe_internal",
            ))
    # PE_IPCQ edges (ADR-0023 D1, D9 D10)
    ipcq_edges = [
        ("pe_cpu",  "pe_ipcq", "cpu_to_ipcq_mm"),  # IpcqRequest
        ("pe_ipcq", "pe_dma",  "ipcq_to_dma_mm"),  # IpcqDmaToken outbound
        ("pe_dma",  "pe_ipcq", "dma_to_ipcq_mm"),  # IpcqMetaArrival inbound
    ]
    for src_c, dst_c, mm_key in ipcq_edges:
        if mm_key in pe_links:
            edges.append(Edge(
                src=f"{pp}.{src_c}", dst=f"{pp}.{dst_c}",
                distance_mm=pe_links[mm_key],
                kind="pe_internal",
            ))
 # ── Inter-cube / IO / system edges ──────────────────────────────────
@@ -728,7 +780,13 @@ def _add_io_to_cube_edges(
 def _add_system_to_io_edges(
    edges: list[Edge], sp: str, sip_spec: dict, system: dict,
 ) -> None:
-    """Add fabric switch → IO chiplet PCIe edges."""
+    """Add bidirectional fabric switch ↔ IO chiplet PCIe edges.
    Both directions are needed:
      switch → pcie_ep   for host→device traffic (memory writes, kernel launch)
      pcie_ep → switch   for device-side outbound traffic (cross-SIP IPCQ
                          send between PE_DMAs through the system switch).
    """
    sw_id = "fabric.switch0"
    sys_link = system["links"]["io_ep_to_switch"]
    for inst in sip_spec["iochiplet"]["instances"]:
@@ -739,6 +797,12 @@ def _add_system_to_io_edges(
            bw_gbs=sys_link["bw_gbs_per_ep"],
            kind="pcie",
        ))
        edges.append(Edge(
            src=pcie_ep_id, dst=sw_id,
            distance_mm=sys_link["distance_mm"],
            bw_gbs=sys_link["bw_gbs_per_ep"],
            kind="pcie",
        ))
 # ── View builders ────────────────────────────────────────────────────
@@ -900,13 +964,13 @@ def _build_cube_view(spec: dict) -> ViewGraph:
        pid = f"ucie-{port}"
        lx, ly = local_pos[pid]
        nodes[pid] = Node(
-            id=pid, kind="ucie_port", impl="ucie_v1",
+            id=pid, kind="ucie_port", impl="builtin.ucie",
            attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
        )
        for ci in range(ucie_n_conn):
            conn_id = f"ucie-{port}.conn{ci}"
            nodes[conn_id] = Node(
-                id=conn_id, kind="ucie_conn", impl="ucie_v1",
+                id=conn_id, kind="ucie_conn", impl="builtin.ucie",
                attrs={"overhead_ns": 0.0}, pos_mm=(lx, ly),
                label=f"UCIe-{port} C{ci}",
            )
@@ -1071,17 +1135,19 @@ def _build_pe_view(spec: dict) -> ViewGraph:
        "pe_cpu": (1.5, 4.0),
        "pe_scheduler": (4.0, 4.0),
        "pe_dma": (7.0, 1.5),
        "pe_fetch_store": (8.5, 4.0),
        "pe_gemm": (7.0, 4.0),
        "pe_math": (7.0, 6.5),
        "pe_mmu": (4.0, 1.5),
        "pe_tcm": (10.0, 4.0),
        "pe_ipcq": (4.0, 6.5),
    }
    nodes: dict[str, Node] = {}
    view_edges: list[Edge] = []
    for comp_name, comp_spec in pe_tmpl["components"].items():
-        px, py = positions[comp_name]
+        px, py = positions.get(comp_name, (1.0, 1.0))
        nodes[comp_name] = Node(
            id=comp_name, kind=comp_spec["kind"], impl=comp_spec["impl"],
            attrs=comp_spec["attrs"], pos_mm=(px, py),
@@ -1101,6 +1167,12 @@ def _build_pe_view(spec: dict) -> ViewGraph:
            distance_mm=pe_links[key],
            kind="pe_internal",
        ))
    if "scheduler_to_fetch_store_mm" in pe_links:
        view_edges.append(Edge(
            src="pe_scheduler", dst="pe_fetch_store",
            distance_mm=pe_links["scheduler_to_fetch_store_mm"],
            kind="pe_internal",
        ))
    for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
                                 ("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
                                 ("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
@@ -1110,6 +1182,13 @@ def _build_pe_view(spec: dict) -> ViewGraph:
            bw_gbs=pe_links[bw_key],
            kind="pe_internal",
        ))
    if "fetch_store_to_tcm_mm" in pe_links:
        view_edges.append(Edge(
            src="pe_fetch_store", dst="pe_tcm",
            distance_mm=pe_links["fetch_store_to_tcm_mm"],
            bw_gbs=pe_links.get("fetch_store_to_tcm_bw_gbs", 512.0),
            kind="pe_internal",
        ))
    return ViewGraph(
        name="pe", nodes=nodes, edges=view_edges,
@@ -0,0 +1,21 @@
 """kernbench.tp — Megatron-style Tensor Parallelism (ADR-0027).
 Public API re-exports.
 """
 from kernbench.tp.layers import (
    ColumnParallelLinear,
    RowParallelLinear,
 )
 from kernbench.tp.parallel_state import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
    initialize_model_parallel,
 )
 __all__ = [
    "ColumnParallelLinear",
    "RowParallelLinear",
    "get_tensor_model_parallel_rank",
    "get_tensor_model_parallel_world_size",
    "initialize_model_parallel",
 ]
@@ -0,0 +1,23 @@
 """Kernel used by ``kernbench.tp`` layers (ADR-0027 D4/D5).
 Intentionally self-contained inside the ``tp`` package — the ``tp`` package
 must not import from ``benches/``. Future work: move to a shared
 ``kernbench.kernels`` module so benches and TP can share.
 """
 from __future__ import annotations
 def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE: str = "f16") -> None:
    """Single-PE GEMM: out = a @ b via load → dot → store.
    Uses the ``tl.load + tl.dot + tl.store`` path. Unlike ``tl.composite``
    (which is absorbed by the PE scheduler into TileTokens that don't reach
    the op_log), this path emits explicit ``DmaReadCmd`` / ``GemmCmd`` /
    ``DmaWriteCmd`` records, which DataExecutor replays numerically in
    Phase 2.
    """
    M, K, N = int(M), int(K), int(N)
    a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
    b = tl.load(int(b_ptr), shape=(K, N), dtype=DTYPE)
    out = tl.dot(a, b)
    tl.store(int(out_ptr), out)
@@ -0,0 +1,150 @@
 """Megatron-style parallel layers (ADR-0027 D4/D5).
 - ``ColumnParallelLinear``: weight's out_features axis split across TP ranks.
  forward(x) is local gemm; no collective.
 - ``RowParallelLinear``: weight's in_features axis split across TP ranks.
  forward(x) ends with ``dist.all_reduce`` to sum partial products.
 Both layers use the intra-device ``DPPolicy`` (ADR-0026). TP shard
 ownership is determined by ``torch.ahbm.set_device(rank)`` (ADR-0024 D10).
 Yield-safety contract (ADR-0027 D4/D5): every forward path contains at
 least one ``ctx.wait`` (via ``torch.launch``) or one collective; this
 keeps the scheduler loop making progress.
 """
 from __future__ import annotations
 from typing import Any
 from kernbench.policy.placement.dp import DPPolicy
 from kernbench.tp.kernels import _gemm_kernel
 from kernbench.tp.parallel_state import (
    get_tensor_model_parallel_world_size,
 )
 class ColumnParallelLinear:
    """Weight's K (out_features) axis distributed across TP ranks.

    forward(x):
        x: (M, N) — full-replicated across ranks
        W_k: (N, K / world_size) — this rank's slice (on its SIP)
        y_k = x @ W_k → (M, K / world_size)

    No collective is issued by this layer; each rank keeps its own slice.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype: str = "f16",
        torch: Any = None,
    ) -> None:
        """Validate the configuration, then allocate this rank's weight slice.

        Raises:
            TypeError: ``torch`` (the RuntimeContext) was not supplied.
            ValueError: ``out_features`` is not divisible by the TP world size.
            NotImplementedError: ``bias=True`` (deferred, ADR-0027 D9).
        """
        if torch is None:
            raise TypeError("ColumnParallelLinear requires torch=<RuntimeContext>")
        ws = get_tensor_model_parallel_world_size()
        if out_features % ws != 0:
            raise ValueError(
                f"out_features ({out_features}) must be divisible by TP world "
                f"size ({ws})"
            )
        # Reject unsupported options *before* allocating anything through the
        # runtime, so a refused construction leaves no weight tensor behind.
        # Bias omitted in initial scope (ADR-0027 D9).
        if bias:
            raise NotImplementedError(
                "bias=True is deferred (ADR-0027 D9 initial scope)"
            )
        self.in_features = in_features
        self.out_features = out_features
        # Number of output columns owned by this rank.
        self.k_local = out_features // ws
        self.dtype = dtype
        self._torch = torch
        self.bias = None
        # Per-rank weight slice. ``set_device(rank)`` (ADR-0024 D10) places
        # it on SIP ``rank``. Intra-SIP layout comes from DPPolicy (ADR-0026).
        self.weight = torch.zeros(
            (in_features, self.k_local),
            dtype=dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="col_parallel_w",
        )
    def forward(self, x):
        """Compute this rank's output slice ``x @ W_k``.

        Allocates an ``(M, k_local)`` output with ``x``'s dtype (``M`` is
        ``x.shape[0]``), launches the GEMM kernel and returns the output.
        """
        M = int(x.shape[0])
        out = self._torch.empty(
            (M, self.k_local),
            dtype=x.dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="col_parallel_out",
        )
        self._torch.launch(
            "col_parallel_gemm",
            _gemm_kernel,
            x, self.weight, out,
            M, self.in_features, self.k_local,
        )
        return out
 class RowParallelLinear:
    """Weight's N (in_features) axis distributed across TP ranks.

    forward(x):
        x: (M, N / world_size) — rank-local slice (ColumnParallel output)
        W_k: (N / world_size, K) — this rank's slice
        y_k = x @ W_k → (M, K) — partial sum
        y = all_reduce(y_k, op="sum") → (M, K) on every rank
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype: str = "f16",
        torch: Any = None,
    ) -> None:
        """Validate the configuration, then allocate this rank's weight slice.

        Raises:
            TypeError: ``torch`` (the RuntimeContext) was not supplied.
            ValueError: ``in_features`` is not divisible by the TP world size.
            NotImplementedError: ``bias=True`` (deferred, ADR-0027 D9).
        """
        if torch is None:
            raise TypeError("RowParallelLinear requires torch=<RuntimeContext>")
        ws = get_tensor_model_parallel_world_size()
        if in_features % ws != 0:
            raise ValueError(
                f"in_features ({in_features}) must be divisible by TP world "
                f"size ({ws})"
            )
        # Reject unsupported options *before* allocating anything through the
        # runtime, so a refused construction leaves no weight tensor behind.
        if bias:
            raise NotImplementedError(
                "bias=True is deferred (ADR-0027 D9 initial scope)"
            )
        self.in_features = in_features
        self.out_features = out_features
        # Number of input rows (of the weight) owned by this rank.
        self.n_local = in_features // ws
        self.dtype = dtype
        self._torch = torch
        self.bias = None
        self.weight = torch.zeros(
            (self.n_local, out_features),
            dtype=dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="row_parallel_w",
        )
    def forward(self, x):
        """Compute the partial product, then sum it across ranks in place.

        Allocates an ``(M, out_features)`` buffer with ``x``'s dtype (``M``
        is ``x.shape[0]``), launches the GEMM kernel into it, all-reduces
        the buffer with ``op="sum"`` and returns that same buffer.
        """
        M = int(x.shape[0])
        y_partial = self._torch.empty(
            (M, self.out_features),
            dtype=x.dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="row_parallel_partial",
        )
        self._torch.launch(
            "row_parallel_gemm",
            _gemm_kernel,
            x, self.weight, y_partial,
            M, self.n_local, self.out_features,
        )
        self._torch.distributed.all_reduce(y_partial, op="sum")
        return y_partial
@@ -0,0 +1,5 @@
 """Forward/backward mappings stub (ADR-0027 — future backward work).
 Inference-only initial scope. Backward hooks land when training simulation
 arrives.
 """
@@ -0,0 +1,83 @@
 """TP group state (ADR-0027 D3).
 Single global TP group. Initial scope: TP size == world_size (pure TP;
 mixed DP+TP is future work).
 """
 from __future__ import annotations
 _TP_WORLD_SIZE: int | None = None
 def initialize_model_parallel(tensor_model_parallel_size: int) -> None:
    """Initialize the (single, global) TP process group.

    Must be called after ``torch.distributed.init_process_group``. In the
    initial scope the TP size has to equal the distributed world size.

    Raises:
        NotImplementedError: ``tensor_model_parallel_size`` differs from the
            current world size.
    """
    global _TP_WORLD_SIZE
    # World size comes from the active RuntimeContext via the module helper.
    world_size = _current_world_size()
    if tensor_model_parallel_size == world_size:
        _TP_WORLD_SIZE = tensor_model_parallel_size
        return
    raise NotImplementedError(
        f"Only TP == world_size supported; got TP={tensor_model_parallel_size}, "
        f"world_size={world_size}"
    )
 def get_tensor_model_parallel_world_size() -> int:
    """Return the TP group's world size.

    Raises:
        RuntimeError: the group has not been initialised; callers must
            call :func:`initialize_model_parallel` first.
    """
    size = _TP_WORLD_SIZE
    if size is not None:
        return size
    raise RuntimeError(
        "TP group not initialised; call initialize_model_parallel() first"
    )
 def get_tensor_model_parallel_rank() -> int:
    """Return this worker's rank within the TP group.

    Delegates to ``_current_rank()``, which asks the active
    RuntimeContext's ``distributed.get_rank()``. Per the original note this
    is the greenlet-local rank registered by the spawn launcher
    (ADR-0024 D9) — that registration is not visible from this module.
    """
    # Resolve via the global torch.distributed facade on the active ctx.
    return _current_rank()
 def _reset_for_tests() -> None:
    """Clear _TP_WORLD_SIZE so ordering-sensitive tests can re-init.

    After this call ``get_tensor_model_parallel_world_size`` raises again
    until ``initialize_model_parallel`` is invoked.
    """
    global _TP_WORLD_SIZE
    _TP_WORLD_SIZE = None
 # ── helpers (resolve current ctx) ────────────────────────────────────
 def _current_ctx():
    """Return the currently-active RuntimeContext, or fail loudly.

    In KernBench the ``ctx`` is handed to bench/worker code as the ``torch``
    positional. This module-global helper has no such argument, so it asks
    the registry behind ``_get_active_context`` instead.

    Raises:
        RuntimeError: no context is active.
    """
    # Imported at call time rather than at module import.
    from kernbench.runtime_api.context import _get_active_context

    active = _get_active_context()
    if active is not None:
        return active
    raise RuntimeError(
        "No active RuntimeContext; kernbench.tp requires one "
        "(call init_process_group / spawn under a live ctx)"
    )
 def _current_world_size() -> int:
    """Return ``distributed.get_world_size()`` of the active RuntimeContext."""
    return _current_ctx().distributed.get_world_size()
 def _current_rank() -> int:
    """Return ``distributed.get_rank()`` of the active RuntimeContext."""
    return _current_ctx().distributed.get_rank()
@@ -0,0 +1,34 @@
 """TP primitive ops (ADR-0027 D6).
 ``copy_to_tp_region`` / ``reduce_from_tp_region`` are forward-only in the
 initial scope (backward pass is future work). ``scatter`` / ``gather`` are
 not implemented — they require an all-gather kernel that is not yet
 available in KernBench (see ADR-0027 D9).
 """
 from __future__ import annotations
 from typing import Any
 def copy_to_tp_region(x: Any) -> Any:
    """Forward: identity. Backward: all-reduce. (Training is future.)

    Only the forward direction exists here: ``x`` is returned unchanged
    (the same object), and no backward hook is registered.
    """
    return x
 def reduce_from_tp_region(x: Any, torch: Any) -> Any:
    """Forward: sum-all-reduce ``x`` across ranks. Backward: identity.

    The reduction is issued through ``torch.distributed.all_reduce`` with
    ``op="sum"``; the same ``x`` object is returned afterwards.
    """
    dist = torch.distributed
    dist.all_reduce(x, op="sum")
    return x
 def scatter_to_tp_region(x: Any) -> Any:
    """Not implemented: always raises NotImplementedError (ADR-0027 D9).

    Callers are expected to create the sharded tensor directly instead.
    """
    reason = (
        "scatter_to_tp_region deferred — caller should create the sharded "
        "tensor directly (ADR-0027 D9)"
    )
    raise NotImplementedError(reason)
 def gather_from_tp_region(x: Any) -> Any:
    """Not implemented: always raises NotImplementedError (ADR-0027 D9).

    A gather needs an all-gather kernel, which is not available yet.
    """
    reason = (
        "gather_from_tp_region deferred — requires all-gather kernel (ADR-0027 D9)"
    )
    raise NotImplementedError(reason)
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`
							`# Legacy component backups — not actively used.`
							`# Kept for reference during ADR-0021 migration.`