Compare commits
28 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1d8b9401e5 | |||
| cfc2d74ec4 | |||
| 105f1dc09e | |||
| e7f376ebaa | |||
| 357cab525b | |||
| 787409ced1 | |||
| 79124daab1 | |||
| 4ba0a83e71 | |||
| 32536daf2e | |||
| e1084800ab | |||
| b2c52f0e34 | |||
| 10b33b44ba | |||
| 1c8ddc2d03 | |||
| 74f5f5cf08 | |||
| 372c987995 | |||
| bcf941dcee | |||
| 998cc85762 | |||
| ff2c677a9c | |||
| dc3fb02aed | |||
| 59e36f0c34 | |||
| 81ce55571d | |||
| 1d95df4bee | |||
| 95d583ef9f | |||
| f5d1606f9d | |||
| b6eb97c49a | |||
| 161132cdcb | |||
| 51004c311c | |||
| 140b85436a |
@@ -29,3 +29,4 @@ build/
|
|||||||
|
|
||||||
# Logs
|
# Logs
|
||||||
*.log
|
*.log
|
||||||
|
.claude/
|
||||||
|
|||||||
@@ -0,0 +1,103 @@
|
|||||||
|
"""CCL all-reduce bench (ADR-0024 + ADR-0027).
|
||||||
|
|
||||||
|
Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
|
||||||
|
tensor sharded row-wise across the cube mesh (pe0 per cube). After
|
||||||
|
``dist.all_reduce(op="sum")`` every cube on every rank must hold
|
||||||
|
``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
|
||||||
|
|
||||||
|
Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
|
||||||
|
(SIP count → world_size, cube_mesh → N_CUBES).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
|
DEFAULT_N_ELEM = 8
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _BenchCfg:
    """Immutable settings for one all-reduce bench run, resolved once on the host."""

    # Name of the CCL algorithm under test (echoed in the report line).
    algorithm: str
    # Number of elements in each row of the per-rank tensor.
    n_elem: int
    # Number of cubes per SIP; the per-rank tensor has one row per cube.
    n_cubes: int
    # Number of ranks taking part in the all-reduce.
    world_size: int
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_cfg(torch) -> _BenchCfg:
    """Build the bench configuration from ccl.yaml and the topology spec.

    Raises:
        RuntimeError: if the process-group world size differs from the SIP
            count declared in the topology.
    """
    algo_cfg = resolve_algorithm_config(load_ccl_config())
    ws = torch.distributed.get_world_size()
    topology = torch.spec or {}

    # One rank per SIP; a topology without ``system.sips.count`` counts as 1 SIP.
    n_sips = int(topology.get("system", {}).get("sips", {}).get("count", 1))
    if ws != n_sips:
        raise RuntimeError(
            f"ccl_allreduce bench requires world_size == topology SIP count "
            f"(world_size={ws}, n_sips={n_sips})."
        )

    # Cube mesh dimensions fall back to 4 x 4 when the topology omits them.
    mesh = topology.get("sip", {}).get("cube_mesh", {})
    mesh_w = int(mesh.get("w", 4))
    mesh_h = int(mesh.get("h", 4))
    cube_total = mesh_w * mesh_h

    return _BenchCfg(
        algorithm=algo_cfg["algorithm"],
        n_elem=int(algo_cfg.get("n_elem", DEFAULT_N_ELEM)),
        n_cubes=cube_total,
        world_size=ws,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_dp(n_cubes: int) -> DPPolicy:
    """Return the intra-SIP placement policy used for a rank's tensor.

    The policy is ``row_wise`` over ``n_cubes`` cubes and ``replicate`` over
    a single PE per cube.
    """
    placement = DPPolicy(
        cube="row_wise",
        pe="replicate",
        num_cubes=n_cubes,
        num_pes=1,
    )
    return placement
|
||||||
|
|
||||||
|
|
||||||
|
def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
    """Create the zero-filled f16 ``(n_cubes, n_elem)`` input tensor for ``rank``.

    The tensor is named ``ccl_in_r<rank>`` and placed with the policy
    returned by ``_rank_dp``.
    """
    shape = (cfg.n_cubes, cfg.n_elem)
    placement = _rank_dp(cfg.n_cubes)
    return torch.zeros(shape, dtype="f16", dp=placement, name=f"ccl_in_r{rank}")
|
||||||
|
|
||||||
|
|
||||||
|
def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
    """Overwrite ``tensor`` so that every element equals ``rank + 1``.

    The fill data is built on the host as a float16 ``(n_cubes, n_elem)``
    array and copied into ``tensor`` in place.
    """
    fill_value = float(rank + 1)
    host_data = np.full((cfg.n_cubes, cfg.n_elem), fill_value, dtype=np.float16)
    tensor.copy_(torch.from_numpy(host_data))
|
||||||
|
|
||||||
|
|
||||||
|
def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
    """Print one line stating whether the all-reduce result is as expected.

    Each of the first ``cfg.n_cubes`` rows of ``result`` is compared with
    ``n_cubes * (1 + 2 + ... + world_size)`` using a relative and absolute
    tolerance of 0.1. On mismatch the mean of all elements is printed next
    to the expected value.
    """
    expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))

    # Lazy generator: stops at the first row that is out of tolerance.
    all_match = all(
        np.allclose(result[row], expected, rtol=1e-1, atol=1e-1)
        for row in range(cfg.n_cubes)
    )

    if not all_match:
        got = float(result.reshape(-1).mean())
        print(
            f" [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
            f"got mean={got:.3f}, expected={expected:.3f}"
        )
        return

    total = cfg.world_size * cfg.n_cubes
    print(f" {cfg.algorithm} (ws={cfg.world_size}): {total} OK")
|
||||||
|
|
||||||
|
|
||||||
|
def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
    """Per-rank entry point: fill the tensor, all-reduce it, report on rank 0."""
    torch.ahbm.set_device(rank)

    rank_tensor = _allocate_rank_tensor(torch, rank, cfg)
    _init_with_rank_value(torch, rank_tensor, rank, cfg)
    torch.distributed.all_reduce(rank_tensor, op="sum")

    if rank != 0:
        return
    # Only rank 0 prints, so a run produces a single pass/fail line.
    _report(rank_tensor.numpy(), cfg)
|
||||||
|
|
||||||
|
|
||||||
|
def run(torch) -> None:
    """Bench entry point: init the ``ahbm`` process group and spawn the workers."""
    torch.distributed.init_process_group(backend="ahbm")
    bench_cfg = _resolve_cfg(torch)

    # NOTE(review): spawn() is expected to pass the rank as the first
    # argument, matching ``_worker(rank, cfg, torch)`` — confirm against
    # the runtime's multiprocessing shim.
    worker_args = (bench_cfg, torch)
    torch.multiprocessing.spawn(
        _worker,
        args=worker_args,
        nprocs=bench_cfg.world_size,
    )
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
Full host-to-PE pipeline:
|
Full host-to-PE pipeline:
|
||||||
Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
|
Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
|
||||||
|
|
||||||
Single PE: num_sips=1, num_cubes=1, num_pes=1 via DPPolicy override.
|
Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
|
||||||
Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
|
Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
|
||||||
per-tile DMA internally.
|
per-tile DMA internally.
|
||||||
|
|
||||||
@@ -30,7 +30,7 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
|||||||
def run(torch):
|
def run(torch):
|
||||||
"""Run the single-PE GEMM benchmark."""
|
"""Run the single-PE GEMM benchmark."""
|
||||||
dp = DPPolicy(cube="replicate", pe="replicate",
|
dp = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=1, num_cubes=1, num_pes=1)
|
num_cubes=1, num_pes=1)
|
||||||
|
|
||||||
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
|
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
|
||||||
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
|
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
|
||||||
|
|||||||
+8
-4
@@ -72,12 +72,16 @@ def run(torch):
|
|||||||
K = GPT3_D_MODEL
|
K = GPT3_D_MODEL
|
||||||
N = COLS_PER_PE
|
N = COLS_PER_PE
|
||||||
|
|
||||||
# X: replicated across all PEs
|
# ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
|
||||||
|
# ADR-0024 launcher calls this bench once per SIP (each worker via
|
||||||
|
# torch.ahbm.set_device(rank)); here the policy describes only the
|
||||||
|
# cube × PE layout within a single SIP.
|
||||||
|
# X: replicated across all PEs within the SIP
|
||||||
dp_replicate = DPPolicy(cube="replicate", pe="replicate",
|
dp_replicate = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
||||||
# W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs
|
# W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
|
||||||
dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
|
dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
|
||||||
num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
||||||
|
|
||||||
x = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
|
x = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
|
||||||
wq = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
|
wq = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
|
||||||
|
|||||||
+19
-16
@@ -9,29 +9,32 @@ from kernbench.runtime_api.context import RuntimeContext
|
|||||||
BenchFn = Callable[[RuntimeContext], Any]
|
BenchFn = Callable[[RuntimeContext], Any]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_module(bench_id: str):
    """Import and return the ``benches.<bench_id>`` module.

    Args:
        bench_id: Bench name; surrounding whitespace is ignored.

    Returns:
        The imported module object.

    Raises:
        ValueError: if ``bench_id`` is blank or no such bench module exists.
        ModuleNotFoundError: if the bench module exists but one of its own
            imports cannot be found.
    """
    bench_id = bench_id.strip()
    if not bench_id:
        raise ValueError("Bench id is empty.")

    module_path = f"benches.{bench_id}"
    try:
        return importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        # Only report "unknown bench" when the module that could not be found
        # is the bench itself (or one of its parent packages). A missing
        # dependency *inside* an existing bench must surface unchanged,
        # otherwise the user is told the bench does not exist.
        missing = e.name or ""
        is_bench_itself = (
            module_path == missing or module_path.startswith(missing + ".")
        )
        if missing and not is_bench_itself:
            raise
        raise ValueError(
            f"Unknown bench '{bench_id}'. Expected module {module_path}.py"
        ) from e
|
||||||
|
|
||||||
|
|
||||||
def resolve_bench(bench_id: str) -> BenchFn:
|
def resolve_bench(bench_id: str) -> BenchFn:
|
||||||
"""
|
"""Resolve a bench id into its ``run(torch)`` callable.
|
||||||
Resolve a bench id into a callable bench function.
|
|
||||||
|
|
||||||
Expected layout (repo root):
|
Expected layout (repo root):
|
||||||
benches/<bench_id>.py
|
benches/<bench_id>.py
|
||||||
def run(torch: RuntimeContext) -> Any
|
def run(torch: RuntimeContext) -> Any
|
||||||
"""
|
"""
|
||||||
bench_id = bench_id.strip()
|
mod = _load_module(bench_id)
|
||||||
if not bench_id:
|
|
||||||
raise ValueError("Bench id is empty.")
|
|
||||||
|
|
||||||
module_path = f"benches.{bench_id}"
|
|
||||||
|
|
||||||
try:
|
|
||||||
mod = importlib.import_module(module_path)
|
|
||||||
except ModuleNotFoundError as e:
|
|
||||||
raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e
|
|
||||||
|
|
||||||
run_fn = getattr(mod, "run", None)
|
run_fn = getattr(mod, "run", None)
|
||||||
if run_fn is None:
|
if run_fn is None:
|
||||||
raise ValueError(f"Bench module {module_path} must define a 'run(torch)' function.")
|
raise ValueError(
|
||||||
|
f"Bench module benches.{bench_id} must define 'run(torch)'."
|
||||||
|
)
|
||||||
if not callable(run_fn):
|
if not callable(run_fn):
|
||||||
raise ValueError(f"'run' in {module_path} is not callable.")
|
raise ValueError(f"'run' in benches.{bench_id} is not callable.")
|
||||||
|
|
||||||
return run_fn
|
return run_fn
|
||||||
|
|||||||
+3
-1
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
|
|||||||
from kernbench.policy.placement.dp import DPPolicy
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
# GEMM dimensions: (M, K) x (K, N) → (M, N)
|
# GEMM dimensions: (M, K) x (K, N) → (M, N)
|
||||||
M, K, N = 128, 256, 128
|
# Small dims (1 tile) for fast regression. The test verifies the full
|
||||||
|
# host→PE pipeline, not large-matrix throughput.
|
||||||
|
M, K, N = 32, 64, 32
|
||||||
DTYPE = "f16"
|
DTYPE = "f16"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,9 @@ Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
|
|||||||
from kernbench.policy.placement.dp import DPPolicy
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
# GEMM dimensions: (M, K) x (K, N) -> (M, N)
|
# GEMM dimensions: (M, K) x (K, N) -> (M, N)
|
||||||
M, K, N = 128, 256, 128
|
# Small dims (1 tile) for fast regression. The test verifies the multi-PE
|
||||||
|
# fan-out pipeline, not large-matrix throughput.
|
||||||
|
M, K, N = 32, 64, 32
|
||||||
DTYPE = "f16"
|
DTYPE = "f16"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
"""VA offset verification benchmark.
|
"""VA offset verification benchmark.
|
||||||
|
|
||||||
Verifies that Triton-style base_ptr + pid * stride addressing works correctly
|
Verifies that Triton-style base_ptr + pid * stride addressing works correctly
|
||||||
with full TP sharding (sip/cube/pe all column_wise). Each PE loads its own
|
with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
|
||||||
block from a sharded tensor and stores it back.
|
block from a sharded tensor and stores it back.
|
||||||
|
|
||||||
The kernel uses standard Triton patterns:
|
The kernel uses standard Triton patterns:
|
||||||
@@ -28,7 +28,7 @@ def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):
|
|||||||
|
|
||||||
def run(torch):
|
def run(torch):
|
||||||
"""Run the VA offset verification benchmark with full TP sharding."""
|
"""Run the VA offset verification benchmark with full TP sharding."""
|
||||||
dp = DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
|
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||||
src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
|
src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
|
||||||
dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
|
dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,45 @@
|
|||||||
|
# ccl.yaml — CCL backend (ahbm) configuration (ADR-0023 D11)
|
||||||
|
#
|
||||||
|
# Loaded by AhbmCCLBackend at init_process_group time.
|
||||||
|
# defaults.algorithm chooses which kernel + topology is installed
|
||||||
|
# into PE_IPCQ neighbor tables. Host code is unaware of these settings.
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
# Algorithm to run for this benchmark execution.
|
||||||
|
algorithm: intercube_allreduce
|
||||||
|
|
||||||
|
# IPCQ ring buffer location.
|
||||||
|
# tcm — PE-local TCM (fast, small, conflicts with compute TCM access)
|
||||||
|
# hbm — PE-local HBM (large, slower DMA latency)
|
||||||
|
# sram — Cube-shared SRAM (medium, cube-internal contention)
|
||||||
|
buffer_kind: tcm
|
||||||
|
|
||||||
|
# Backpressure mode.
|
||||||
|
# poll — spin-loop polling of cached peer pointers
|
||||||
|
# sleep — yield SimPy event, wake on credit return
|
||||||
|
backpressure: sleep
|
||||||
|
|
||||||
|
# Ring depth: number of slots per (direction, tx|rx) buffer.
|
||||||
|
n_slots: 4
|
||||||
|
|
||||||
|
# Slot size in bytes (must hold one tile worth of data).
|
||||||
|
slot_size: 4096
|
||||||
|
|
||||||
|
# PE_DMA virtual channel chunk size (D8).
|
||||||
|
vc_chunk_size: 256
|
||||||
|
|
||||||
|
# Credit return fast path message size (D9).
|
||||||
|
ipcq_credit_size_bytes: 16
|
||||||
|
|
||||||
|
algorithms:
|
||||||
|
# ── intercube all-reduce (pe0-only, cube mesh + inter-SIP) ──
|
||||||
|
# Reduces across the 4×4 cube mesh within each SIP, then inter-SIP
|
||||||
|
# exchange on root cube, then broadcast back. SIP topology is read
|
||||||
|
# from topology.yaml → system.sips.topology. Kernel auto-selects
|
||||||
|
# ring / torus / mesh inter-SIP exchange pattern.
|
||||||
|
intercube_allreduce:
|
||||||
|
module: kernbench.ccl.algorithms.intercube_allreduce
|
||||||
|
topology: none
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 8
|
||||||
|
root_cube: 15
|
||||||
+26
-20
@@ -2,6 +2,10 @@
|
|||||||
# Maps impl names (used in topology.yaml) to Python class paths.
|
# Maps impl names (used in topology.yaml) to Python class paths.
|
||||||
# Format: impl_name: module.path:ClassName
|
# Format: impl_name: module.path:ClassName
|
||||||
#
|
#
|
||||||
|
# Naming convention:
|
||||||
|
# builtin.<name> — built-in implementations
|
||||||
|
# custom.<name> — user-defined implementations
|
||||||
|
#
|
||||||
# ── Adding custom components ──────────────────────────────────────────
|
# ── Adding custom components ──────────────────────────────────────────
|
||||||
#
|
#
|
||||||
# 1. Create your implementation in:
|
# 1. Create your implementation in:
|
||||||
@@ -10,41 +14,43 @@
|
|||||||
# Your class must inherit from ComponentBase (or PeEngineBase for PE engines).
|
# Your class must inherit from ComponentBase (or PeEngineBase for PE engines).
|
||||||
#
|
#
|
||||||
# 2. Register it below under "Custom" with a unique impl name:
|
# 2. Register it below under "Custom" with a unique impl name:
|
||||||
# my_pe_cpu_v2: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
|
# custom.my_pe_cpu: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
|
||||||
#
|
#
|
||||||
# 3. Reference it in topology.yaml:
|
# 3. Reference it in topology.yaml:
|
||||||
# pe_cpu: { kind: pe_cpu, impl: my_pe_cpu_v2, attrs: { ... } }
|
# pe_cpu: { kind: pe_cpu, impl: custom.my_pe_cpu, attrs: { ... } }
|
||||||
#
|
#
|
||||||
# 4. Add unit tests in:
|
# 4. Add unit tests in:
|
||||||
# tests/custom/test_<your_component>.py
|
# tests/custom/test_<your_component>.py
|
||||||
#
|
#
|
||||||
# External packages also work — use the full module path:
|
# External packages also work — use the full module path:
|
||||||
# fast_gemm_v1: my_team.accel.fast_gemm:FastGemmComponent
|
# custom.fast_gemm: my_team.accel.fast_gemm:FastGemmComponent
|
||||||
# ──────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
components:
|
components:
|
||||||
# Infrastructure
|
# Infrastructure
|
||||||
forwarding_v1: kernbench.components.builtin.forwarding:TransitComponent
|
builtin.forwarding: kernbench.components.builtin.forwarding:TransitComponent
|
||||||
switch_v1: kernbench.components.builtin.forwarding:TransitComponent
|
builtin.switch: kernbench.components.builtin.forwarding:TransitComponent
|
||||||
noc_v1: kernbench.components.builtin.forwarding:TransitComponent
|
builtin.noc: kernbench.components.builtin.forwarding:TransitComponent
|
||||||
ucie_v1: kernbench.components.builtin.forwarding:TransitComponent
|
builtin.ucie: kernbench.components.builtin.forwarding:TransitComponent
|
||||||
|
|
||||||
# IO / Host interface
|
# IO / Host interface
|
||||||
pcie_ep_v1: kernbench.components.builtin.pcie_ep:PcieEpComponent
|
builtin.pcie_ep: kernbench.components.builtin.pcie_ep:PcieEpComponent
|
||||||
io_cpu_v1: kernbench.components.builtin.io_cpu:IoCpuComponent
|
builtin.io_cpu: kernbench.components.builtin.io_cpu:IoCpuComponent
|
||||||
|
|
||||||
# Cube-level
|
# Cube-level
|
||||||
m_cpu_v1: kernbench.components.builtin.m_cpu:MCpuComponent
|
builtin.m_cpu: kernbench.components.builtin.m_cpu:MCpuComponent
|
||||||
hbm_ctrl_v1: kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
|
builtin.hbm_ctrl: kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
|
||||||
sram_v1: kernbench.components.builtin.sram:SramComponent
|
builtin.sram: kernbench.components.builtin.sram:SramComponent
|
||||||
|
|
||||||
# PE-level
|
# PE-level
|
||||||
pe_cpu_v1: kernbench.components.builtin.pe_cpu:PeCpuComponent
|
builtin.pe_cpu: kernbench.components.builtin.pe_cpu:PeCpuComponent
|
||||||
pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
|
builtin.pe_scheduler: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
|
||||||
pe_dma_v1: kernbench.components.builtin.pe_dma:PeDmaComponent
|
builtin.pe_dma: kernbench.components.builtin.pe_dma:PeDmaComponent
|
||||||
pe_gemm_v1: kernbench.components.builtin.pe_gemm:PeGemmComponent
|
builtin.pe_gemm: kernbench.components.builtin.pe_gemm:PeGemmComponent
|
||||||
pe_math_v1: kernbench.components.builtin.pe_math:PeMathComponent
|
builtin.pe_math: kernbench.components.builtin.pe_math:PeMathComponent
|
||||||
pe_mmu_v1: kernbench.components.builtin.pe_mmu:PeMmuComponent
|
builtin.pe_fetch_store: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
|
||||||
pe_tcm_v1: kernbench.components.builtin.pe_tcm:PeTcmComponent
|
builtin.pe_mmu: kernbench.components.builtin.pe_mmu:PeMmuComponent
|
||||||
|
builtin.pe_tcm: kernbench.components.builtin.pe_tcm:PeTcmComponent
|
||||||
|
builtin.pe_ipcq: kernbench.components.builtin.pe_ipcq:PeIpcqComponent
|
||||||
|
|
||||||
# Custom — add your implementations here
|
# Custom — add your implementations here
|
||||||
pe_scheduler_v2: kernbench.components.custom.pe_accel.scheduler:SchedulerV2Component
|
|
||||||
|
|||||||
@@ -0,0 +1,441 @@
|
|||||||
|
# ADR-0018: LA-Based Memory Address Abstraction and HBM Channel Mapping Mode Introduction
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Kernbench simulates memory access between PE_DMA and Local-HBM within a CUBE.
|
||||||
|
Currently, a VA-based access path is used; however, the two channel mapping models described below (1:1 and n:1)
|
||||||
|
are difficult to represent consistently.
|
||||||
|
|
||||||
|
### Background: Local-HBM Pseudo Channel Structure
|
||||||
|
|
||||||
|
The HBM in a CUBE consists of 32 or 64 pseudo channels.
|
||||||
|
In the PE-Local-HBM model, each PE is responsible for an equal number of pseudo channels.
|
||||||
|
|
||||||
|
Example: 64 pseudo channels, 8 PEs per cube -> each PE accesses 8 pseudo channels as local HBM
|
||||||
|
|
||||||
|
Both the number of pseudo channels and the number of PEs are topology parameters.
|
||||||
|
`N = hbm_pseudo_channels / pes_per_cube` (= channels_per_pe) determines
|
||||||
|
the number of local channels per PE.
|
||||||
|
|
||||||
|
The routing path BW between DMA and each pseudo channel matches the BW of each pseudo channel
|
||||||
|
(e.g., 32 GB/s), so if a PE sends simultaneous requests to N channels, it can utilize the
|
||||||
|
maximum memory BW.
|
||||||
|
|
||||||
|
### Limitations of the Current VA Model
|
||||||
|
|
||||||
|
When a PE's local HBM is split into N channels (8 in the example above), a separate request must be generated for each channel and sent to DMA.
|
||||||
|
However, in the current architecture, the kernel generates requests with VA (`tl.load`)
|
||||||
|
and passes them directly to DMA, making it difficult for PE_CPU to generate per-channel DMA requests.
|
||||||
|
|
||||||
|
Therefore, instead of VA, we propose using **Logical Address (LA)**,
|
||||||
|
where the **BAAW (Logical-to-Physical Mapping Unit)** inside PE_DMA
|
||||||
|
converts LA to PA or a list of PAs based on segment-based mapping.
|
||||||
|
|
||||||
|
### Two Channel Mapping Modes
|
||||||
|
|
||||||
|
- **1:1 mode**: Creates and executes per-channel requests. Precise per-channel modeling.
|
||||||
|
- **n:1 mode (default)**: Assumes interleaving across local HBM channels. Aggregated BW modeling.
|
||||||
|
|
||||||
|
By supporting both modes, the overhead of the n:1 mode can be measured and evaluated.
|
||||||
|
|
||||||
|
### Core Requirements
|
||||||
|
|
||||||
|
- The effective bandwidth semantics of PE_DMA -> HBM_CTRL must be identical in both modes
|
||||||
|
- The difference must only be in the request representation and resource modeling approach
|
||||||
|
- The kernel programming model must not be changed
|
||||||
|
- Physical channel information must not be exposed to the kernel
|
||||||
|
|
||||||
|
### Existing Physical Address
|
||||||
|
|
||||||
|
The current system's 51-bit Physical Address is defined in `policy/address/phyaddr.py`:
|
||||||
|
|
||||||
|
```
|
||||||
|
[50:47] rack_id (4 bit)
|
||||||
|
[46:43] sip_id (4 bit)
|
||||||
|
[42:38] cube_id (5 bit, sip_seg)
|
||||||
|
[37] hbm_selector (1=HBM window)
|
||||||
|
[36:0] hbm_offset (37 bit, 128GB per cube)
|
||||||
|
```
|
||||||
|
|
||||||
|
PA is used to represent the final routable canonical physical destination,
|
||||||
|
and this role is preserved.
|
||||||
|
However, the timing and policy of logical access -> physical request conversion are not clearly separated.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Introduction of LA (Logical Address) — Replacing VA
|
||||||
|
|
||||||
|
The existing VA (Virtual Address) infrastructure is replaced with LA (Logical Address).
|
||||||
|
|
||||||
|
#### Characteristics of LA
|
||||||
|
|
||||||
|
- Like VA, tensors can be mapped to a contiguous memory space
|
||||||
|
- Represents logical buffer + offset
|
||||||
|
- Does not directly contain physical channel information
|
||||||
|
- An intermediate abstraction maintained until physical resolution
|
||||||
|
- The sole address scheme used by kernel code (`tl.load`, `tl.store`, `tl.composite`)
|
||||||
|
|
||||||
|
#### LA Space Definition
|
||||||
|
|
||||||
|
| Item | Value |
|
||||||
|
|------|-------|
|
||||||
|
| LA start address | `0x1_0000_0000` (4 GB, preserving the existing VA start point) |
|
||||||
|
| LA space size | 64 GB per PE |
|
||||||
|
| Alignment unit | Segment-based (see D3 below) |
|
||||||
|
|
||||||
|
LA is a PE-local address space.
|
||||||
|
Even if different PEs use the same LA value, they resolve to different PAs
|
||||||
|
because each PE has a different BAAW segment table.
|
||||||
|
|
||||||
|
#### VA Infrastructure Removal Scope
|
||||||
|
|
||||||
|
With the introduction of LA, the following existing code will be replaced/removed:
|
||||||
|
|
||||||
|
| Removal Target | Replacement |
|
||||||
|
|----------------|-------------|
|
||||||
|
| `policy/address/va_allocator.py` (VirtualAllocator) | LA allocator (same free-list approach, name/role changed) |
|
||||||
|
| `policy/address/pe_mmu.py` (PeMMU) | BAAW segment table (inside PE_DMA) |
|
||||||
|
| `components/builtin/pe_mmu.py` (PeMmuComponent) | Removed — BAAW is internal PE_DMA logic, not a separate component |
|
||||||
|
| `runtime_api/kernel.py`: MmuMapMsg, MmuUnmapMsg | Replaced with BaawSegmentInstallMsg |
|
||||||
|
| `runtime_api/context.py`: VA alloc + MMU mapping install | LA alloc + BAAW segment install |
|
||||||
|
| `runtime_api/tensor.py`: `va_base` field | `la_base` field |
|
||||||
|
| `topology.yaml`: pe_mmu component entry | Removed |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Mapping Mode Configuration
|
||||||
|
|
||||||
|
The mapping mode is configured at the cube level in topology.yaml:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
cube:
|
||||||
|
memory_map:
|
||||||
|
hbm_mapping_mode: n_to_one # one_to_one | n_to_one
|
||||||
|
hbm_pseudo_channels: 64 # total pseudo channel count
|
||||||
|
hbm_channels_per_pe: 8 # local channel count per PE
|
||||||
|
hbm_channel_bw_gbs: 32.0 # per-channel bandwidth
|
||||||
|
```
|
||||||
|
|
||||||
|
This configuration is referenced during graph compilation (topology builder) and BAAW initialization.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Segments and BAAW
|
||||||
|
|
||||||
|
#### Segment Definition
|
||||||
|
|
||||||
|
A segment is a logical allocation unit that partitions the LA space so that each segment
|
||||||
|
maps to a specific HBM channel or channel group.
|
||||||
|
|
||||||
|
Segments are created by the runtime allocator during tensor deployment,
|
||||||
|
and BAAW uses them to convert LA into physical requests.
|
||||||
|
|
||||||
|
#### BAAW Segment Table Entry
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
class BaawSegment:
    """One BAAW segment-table entry: an LA range plus its physical mapping."""

    # Segment start LA.
    la_base: int
    # Segment size in bytes.
    la_size: int
    # Mapping mode: "one_to_one" | "n_to_one".
    mode: str

    # --- 1:1 mode fields ---
    # Number of channels assigned to this segment (e.g., 8).
    channel_count: int
    # Per-channel PA start addresses (len = channel_count).
    pa_bases: list[int]
    # Per-channel logical IDs (e.g., [0,1,2,...,7]).
    channel_ids: list[int]
    # Per-channel size (la_size // channel_count).
    channel_size: int

    # --- n:1 mode fields ---
    # Aggregated PA start address.
    agg_pa_base: int
    # Aggregated router node_id (for routing).
    agg_node_id: str
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Segment Lifecycle
|
||||||
|
|
||||||
|
1. **Allocation time** (tensor deploy):
|
||||||
|
- RuntimeContext allocates LA space from the LA allocator
|
||||||
|
- PEMemAllocator allocates per-channel PA (1:1) or aggregated PA (n:1)
|
||||||
|
- Sends `BaawSegmentInstallMsg` to PE_DMA to register in the segment table
|
||||||
|
|
||||||
|
2. **Usage time** (kernel execution):
|
||||||
|
- Kernel issues `tl.load(la_ptr)` -> DmaReadCmd(src_addr=LA)
|
||||||
|
- PE_DMA looks up the segment corresponding to the LA in BAAW
|
||||||
|
- Converts to PA(s) according to the mode
|
||||||
|
|
||||||
|
3. **Deallocation time** (tensor free):
|
||||||
|
- Removed from the segment table
|
||||||
|
- LA space returned, PA deallocated
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. BAAW (Logical-to-Physical Mapping Unit)
|
||||||
|
|
||||||
|
#### Location
|
||||||
|
|
||||||
|
BAAW is placed as a front-end stage inside PE_DMA.
|
||||||
|
It is not a separate SimPy component; it is synchronous address resolution logic
|
||||||
|
executed at the beginning of PE_DMA's `handle_command()`.
|
||||||
|
|
||||||
|
#### Input
|
||||||
|
|
||||||
|
- LA (Logical Address) — DmaReadCmd.src_addr or DmaWriteCmd.dst_addr
|
||||||
|
- access size (bytes)
|
||||||
|
|
||||||
|
#### Output
|
||||||
|
|
||||||
|
- 1:1 mode: `list[PhysicalRequest]` — each request is (PA, nbytes, channel_node_id)
|
||||||
|
- n:1 mode: 1 `PhysicalRequest` — (agg_PA, nbytes, agg_node_id)
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
class PhysicalRequest:
    """A single physical request produced by BAAW address resolution."""

    # 51-bit Physical Address.
    pa: int
    # Transfer size for this request, in bytes.
    nbytes: int
    # Target node_id (channel router or aggregated router).
    dst_node: str
|
||||||
|
```
|
||||||
|
|
||||||
|
#### BAAW Resolve Logic
|
||||||
|
|
||||||
|
```python
|
||||||
|
def resolve(self, la: int, nbytes: int) -> list[PhysicalRequest]:
|
||||||
|
seg = self._find_segment(la) # la_base <= la < la_base + la_size
|
||||||
|
offset = la - seg.la_base
|
||||||
|
|
||||||
|
if seg.mode == "n_to_one":
|
||||||
|
pa = seg.agg_pa_base + offset
|
||||||
|
return [PhysicalRequest(pa=pa, nbytes=nbytes, dst_node=seg.agg_node_id)]
|
||||||
|
|
||||||
|
elif seg.mode == "one_to_one":
|
||||||
|
requests = []
|
||||||
|
per_ch_size = seg.channel_size
|
||||||
|
for i, (pa_base, ch_id) in enumerate(zip(seg.pa_bases, seg.channel_ids)):
|
||||||
|
ch_offset = offset % per_ch_size # interleaved or striped
|
||||||
|
ch_nbytes = nbytes // seg.channel_count
|
||||||
|
pa = pa_base + ch_offset
|
||||||
|
dst_node = f"{self._pe_prefix}.ch_r{ch_id}"
|
||||||
|
requests.append(PhysicalRequest(pa=pa, nbytes=ch_nbytes, dst_node=dst_node))
|
||||||
|
return requests
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Scope of Responsibility
|
||||||
|
|
||||||
|
BAAW is responsible for:
|
||||||
|
- Converting logical accesses into physical request units
|
||||||
|
- Performing fan-out (1:1) or pass-through (n:1) according to the mapping mode
|
||||||
|
- Generating Physical Addresses and determining target nodes
|
||||||
|
|
||||||
|
BAAW is NOT responsible for:
|
||||||
|
- Performing actual data movement
|
||||||
|
- Executing NOC routing
|
||||||
|
- Simulating bandwidth consumption (this is the role of downstream components)
|
||||||
|
|
||||||
|
#### Output Contract
|
||||||
|
|
||||||
|
The output of BAAW must be request units that can be directly used by the simulator's
|
||||||
|
routing and resource model without any additional address decoding.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D5. PE_DMA handle_command() Changes
|
||||||
|
|
||||||
|
#### Current Flow (VA-based)
|
||||||
|
|
||||||
|
```
|
||||||
|
DmaReadCmd.src_addr (VA)
|
||||||
|
-> MMU.translate(VA) -> PA
|
||||||
|
-> PhysAddr.decode(PA) -> PhysAddr object
|
||||||
|
-> resolver.resolve(PhysAddr) -> dst_node_id (e.g., "sip0.cube0.hbm_ctrl")
|
||||||
|
-> router.find_path(pe_prefix, dst_node_id) -> path
|
||||||
|
-> 1 sub-Transaction created -> fabric inject
|
||||||
|
```
|
||||||
|
|
||||||
|
#### New Flow (LA-based)
|
||||||
|
|
||||||
|
```
|
||||||
|
DmaReadCmd.src_addr (LA)
|
||||||
|
-> BAAW.resolve(LA, nbytes) -> list[PhysicalRequest]
|
||||||
|
-> For each PhysicalRequest:
|
||||||
|
-> router.find_path(pe_prefix, req.dst_node) -> path
|
||||||
|
-> compute_drain_ns(path, req.nbytes) -> drain
|
||||||
|
-> sub-Transaction created -> fabric inject
|
||||||
|
-> Wait for all sub-Transactions to complete
|
||||||
|
-> pe_txn.done.succeed()
|
||||||
|
```
|
||||||
|
|
||||||
|
Key changes:
|
||||||
|
- MMU reference removed -> replaced with BAAW resolve
|
||||||
|
- PhysAddr.decode() + resolver.resolve() -> BAAW directly returns dst_node
|
||||||
|
- 1 request -> N requests injected in parallel (1:1 mode)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D6. 1:1 Mode Details
|
||||||
|
|
||||||
|
- One logical access -> N (= `channels_per_pe`) physical requests
|
||||||
|
- N is a parameter determined by `hbm_pseudo_channels / pes_per_cube`
|
||||||
|
- Each request:
|
||||||
|
- Fully resolved 51-bit PA
|
||||||
|
- Targets a specific channel router (`{pe_prefix}.ch_r{channel_id}`)
|
||||||
|
- BW contention modeling via per-channel links
|
||||||
|
- PE_DMA injects N sub-transactions simultaneously
|
||||||
|
|
||||||
|
#### 1:1 Mode Example
|
||||||
|
|
||||||
|
Configuration: `hbm_pseudo_channels=64`, `pes_per_cube=8`
|
||||||
|
-> `channels_per_pe=8`, PE0 owns ch0-7
|
||||||
|
|
||||||
|
```text
|
||||||
|
Tensor A (4 KB) -> LA 0x1_0000_0000, size=4096 bytes
|
||||||
|
BAAW segment: {
|
||||||
|
la_base: 0x1_0000_0000, la_size: 4096,
|
||||||
|
mode: "one_to_one", channel_count: 8, # = channels_per_pe
|
||||||
|
pa_bases: [PA_ch0, PA_ch1, ..., PA_ch7],
|
||||||
|
channel_ids: [0, 1, 2, 3, 4, 5, 6, 7],
|
||||||
|
channel_size: 512, # = la_size / channel_count
|
||||||
|
}
|
||||||
|
|
||||||
|
BAAW resolve result (N=8 requests):
|
||||||
|
-> PhysicalRequest(pa=PA_ch0, nbytes=512, dst_node="sip0.cube0.pe0.ch_r0")
|
||||||
|
-> PhysicalRequest(pa=PA_ch1, nbytes=512, dst_node="sip0.cube0.pe0.ch_r1")
|
||||||
|
-> ...
|
||||||
|
-> PhysicalRequest(pa=PA_ch7, nbytes=512, dst_node="sip0.cube0.pe0.ch_r7")
|
||||||
|
|
||||||
|
PE_DMA: N sub-transactions injected in parallel
|
||||||
|
Each accesses HBM via channel router -> hbm_ctrl link (channel_bw_gbs)
|
||||||
|
Total effective BW = N x channel_bw_gbs
|
||||||
|
```
|
||||||
|
|
||||||
|
Examples with different N values:
|
||||||
|
- `hbm_pseudo_channels=32`, `pes_per_cube=8` -> `channels_per_pe=4`, 4 requests
|
||||||
|
- `hbm_pseudo_channels=64`, `pes_per_cube=4` -> `channels_per_pe=16`, 16 requests
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D7. n:1 Mode Details
|
||||||
|
|
||||||
|
- One logical access -> one aggregated request
|
||||||
|
- Target: aggregated router -> hbm_ctrl (see ADR-0019)
|
||||||
|
- Aggregated link BW = `channels_per_pe` x `channel_bw_gbs` (e.g., 8 x 32 = 256 GB/s)
|
||||||
|
- Modeled as a single queue / resource
|
||||||
|
- No per-channel PA decomposition
|
||||||
|
|
||||||
|
#### n:1 Mode Example
|
||||||
|
|
||||||
|
```text
|
||||||
|
Tensor A (4 KB) -> LA 0x1_0000_0000, size=4096 bytes
|
||||||
|
BAAW segment: {
|
||||||
|
la_base: 0x1_0000_0000, la_size: 4096,
|
||||||
|
mode: "n_to_one",
|
||||||
|
agg_pa_base: PA_agg,
|
||||||
|
agg_node_id: "sip0.cube0.pe0.agg_router",
|
||||||
|
}
|
||||||
|
|
||||||
|
BAAW resolve result:
|
||||||
|
-> PhysicalRequest(pa=PA_agg, nbytes=4096, dst_node="sip0.cube0.pe0.agg_router")
|
||||||
|
|
||||||
|
PE_DMA: 1 sub-transaction injected
|
||||||
|
Accesses HBM via aggregated router -> hbm_ctrl link (256 GB/s)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D8. Kernel Model Preservation
|
||||||
|
|
||||||
|
- The kernel still issues only single memory ops (`tl.load`, `tl.store`, `tl.composite`)
|
||||||
|
- LA is the address scheme passed to the kernel
|
||||||
|
- Channel decomposition/aggregation is performed by BAAW inside PE_DMA
|
||||||
|
- Physical channel information is not exposed to kernel code
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- 1:1 vs n:1 semantics are clearly separated at a single point: BAAW
|
||||||
|
- Kernel abstraction is preserved — no kernel code changes required
|
||||||
|
- Topology-based policy control is possible (mode switching via yaml)
|
||||||
|
- Improved simulation model consistency and debuggability
|
||||||
|
- Segment-based mapping is simpler and has lower overhead than page tables
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- Full refactoring of VA/MMU-based code is required
|
||||||
|
- Increased complexity in the request generation path (managing N requests in 1:1 mode)
|
||||||
|
- Reduced per-channel visibility in n:1 mode
|
||||||
|
- Existing VA-related tests must be rewritten
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Alternatives
|
||||||
|
|
||||||
|
### A1. Keep VA + Fan-out at MMU
|
||||||
|
|
||||||
|
- Extend MMU to return per-channel PAs
|
||||||
|
- Problem: MMU's role expands beyond address translation to include request decomposition
|
||||||
|
- Problem: Aggregation representation is difficult in n:1 mode
|
||||||
|
|
||||||
|
### A2. Kernel Generates Channel-Aware Requests
|
||||||
|
|
||||||
|
- Kernel directly calls per-channel load/store
|
||||||
|
- Problem: Abstraction leakage, reduced portability
|
||||||
|
- Problem: All benchmark code must be modified
|
||||||
|
|
||||||
|
### A3. Always Use PA (Without LA)
|
||||||
|
|
||||||
|
- Runtime directly passes per-channel PA to the kernel
|
||||||
|
- Problem: Conflicts with the aggregation model
|
||||||
|
- Problem: Conversion timing is unclear, channel information exposed to kernel
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
### Implementation Order
|
||||||
|
|
||||||
|
1. Introduce LA type (`policy/address/la_allocator.py`)
|
||||||
|
2. Implement BAAW segment table (`policy/address/baaw.py`)
|
||||||
|
3. Add `BaawSegmentInstallMsg` message type (`runtime_api/kernel.py`)
|
||||||
|
4. Integrate BAAW into PE_DMA (`components/builtin/pe_dma.py` handle_command changes)
|
||||||
|
5. Modify RuntimeContext: LA alloc + segment install (`runtime_api/context.py`)
|
||||||
|
6. Change Tensor.va_base -> la_base (`runtime_api/tensor.py`)
|
||||||
|
7. Remove VA/MMU code
|
||||||
|
8. Remove pe_mmu from topology.yaml, add mapping mode configuration
|
||||||
|
9. Test migration
|
||||||
|
|
||||||
|
### Affected Existing Tests
|
||||||
|
|
||||||
|
| Test File | Impact |
|
||||||
|
|-----------|--------|
|
||||||
|
| `tests/test_mmu_component.py` | Remove -> replace with BAAW segment install test |
|
||||||
|
| `tests/test_mmu_fabric.py` | Remove -> replace with BAAW + fabric integration test |
|
||||||
|
| `tests/test_pe_mmu.py` | Remove |
|
||||||
|
| `tests/test_va_allocator.py` | Replace with LA allocator test |
|
||||||
|
| `tests/test_va_integration.py` | Replace with LA + BAAW integration test |
|
||||||
|
| `tests/test_va_offset.py` | Replace with LA offset test |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Requirements
|
||||||
|
|
||||||
|
- For the same logical access:
|
||||||
|
- 1:1 -> verify N requests are generated
|
||||||
|
- n:1 -> verify 1 aggregated request is generated
|
||||||
|
- Verify effective bandwidth consistency across both modes
|
||||||
|
- 1:1 -> verify per-channel contention modeling
|
||||||
|
- n:1 -> verify aggregated bandwidth is reflected
|
||||||
|
- Verify operation without kernel code changes
|
||||||
|
- Verify correct BAAW segment install/uninstall operation
|
||||||
|
- Verify no conflicts when multiple tensors are assigned to different segments
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- ADR-0011 (Memory Addressing Simplification — PA-first, VA/MMU introduction) -> superseded by this ADR
|
||||||
|
- ADR-0019 (NOC Per-Channel HBM Connection Model) -> topology-side integration
|
||||||
|
- ADR-0014 (PE Internal Execution Model) -> PE_DMA change impact
|
||||||
@@ -0,0 +1,431 @@
|
|||||||
|
# ADR-0019: Per-Channel and Aggregated HBM Connection Models within CUBE NOC
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
ADR-0018 introduced LA-based address abstraction and BAAW,
|
||||||
|
defining how a logical memory access is translated into the following two forms of requests:
|
||||||
|
|
||||||
|
- 1:1 mode: one logical access → N per-channel requests
|
||||||
|
- n:1 mode: one logical access → one aggregated request
|
||||||
|
|
||||||
|
Here N = `hbm_pseudo_channels / pes_per_cube` (= `channels_per_pe`),
|
||||||
|
determined by topology parameters.
|
||||||
|
|
||||||
|
### Problems with the Existing Structure
|
||||||
|
|
||||||
|
In the current implementation (`topology/builder.py`):
|
||||||
|
|
||||||
|
- PE_DMA → NOC → xbar_top/xbar_bot → HBM_CTRL.slice{0-7} path is used
|
||||||
|
- HBM is modeled as 8 slice (= per-PE) nodes
|
||||||
|
- Local and remote accesses use different paths:
|
||||||
|
- local: NOC → xbar → HBM slice
|
||||||
|
- cross-half: NOC → xbar_top → bridge → xbar_bot → HBM slice
|
||||||
|
- remote cube: NOC → UCIe → remote NOC → remote xbar → remote HBM slice
|
||||||
|
|
||||||
|
Limitations of this structure:
|
||||||
|
|
||||||
|
- Cannot model HBM at pseudo-channel granularity (slice = per-PE granularity, not per-channel)
|
||||||
|
- xbar/bridge bifurcate local/remote paths
|
||||||
|
- Cannot express 1:1 / n:1 modes consistently
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. HBM Attaches to PE Routers
|
||||||
|
|
||||||
|
Consolidate the current `hbm_ctrl.slice{0-7}` (8 nodes) into a **single `hbm_ctrl` node**,
|
||||||
|
and attach the HBM access point to the same router where the PE is attached.
|
||||||
|
|
||||||
|
- n:1 mode: PE's local HBM access goes directly from its own router (switching overhead only, 0 hops)
|
||||||
|
- Remote PE's HBM access: reaches the target PE's router via mesh hops
|
||||||
|
- The read/write resource model within the HBM controller is preserved
|
||||||
|
|
||||||
|
Node naming changes:
|
||||||
|
|
||||||
|
| Current | After Change |
|
||||||
|
| ---- | ------- |
|
||||||
|
| `sip0.cube0.hbm_ctrl.slice0` ~ `slice7` | `sip0.cube0.hbm_ctrl` (single) |
|
||||||
|
|
||||||
|
In `mesh_gen.py`, add `pe{idx}.hbm` to the PE attachment so that
|
||||||
|
the builder generates an edge between that router and hbm_ctrl.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Complete Removal of xbar, bridge, and Single NOC Node
|
||||||
|
|
||||||
|
Remove all of the following nodes and related edges:
|
||||||
|
|
||||||
|
- `{cube}.xbar_top`, `{cube}.xbar_bot`
|
||||||
|
- `{cube}.bridge.left`, `{cube}.bridge.right`
|
||||||
|
- `{cube}.noc` (single TwoDMeshNocComponent node)
|
||||||
|
- Edges of type `noc_to_xbar`, `xbar_to_noc`, `xbar_to_hbm`, `hbm_to_xbar`
|
||||||
|
- Edges of type `xbar_to_bridge`, `bridge_to_xbar`
|
||||||
|
- Edges of type `pe_to_noc`, `noc_to_pe`, `noc_to_pe_cpu`, etc. referencing the single noc node
|
||||||
|
|
||||||
|
Their role is replaced by an **explicit router mesh based on cube_mesh.yaml**.
|
||||||
|
Each router (r0c0, r0c1, ...) from the 6x6 router grid generated by `mesh_gen.py`
|
||||||
|
is created as a separate SimPy node in the topology graph,
|
||||||
|
and adjacent routers are connected via XY mesh edges.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Explicit Router Mesh (Common Basis for n:1 / 1:1)
|
||||||
|
|
||||||
|
#### Router Nodes Based on cube_mesh.yaml
|
||||||
|
|
||||||
|
Each non-null router from cube_mesh.yaml generated by `mesh_gen.py`
|
||||||
|
is created as a **separate SimPy node** in the topology graph.
|
||||||
|
|
||||||
|
- Node ID: `{cube}.r{row}c{col}` (e.g., `sip0.cube0.r0c0`)
|
||||||
|
- kind: `noc_router`, impl: `forwarding_v1`
|
||||||
|
- pos_mm: taken from cube_mesh.yaml
|
||||||
|
|
||||||
|
Based on the attach information in cube_mesh.yaml, components are connected to each router:
|
||||||
|
- `pe{p}.dma` → PE_DMA ↔ router edge
|
||||||
|
- `pe{p}.cpu` → PE_CPU ↔ router edge
|
||||||
|
- `pe{p}.hbm` → HBM_CTRL ↔ router edge (added in n:1)
|
||||||
|
- `m_cpu` → M_CPU ↔ router edge
|
||||||
|
- `sram` → SRAM ↔ router edge
|
||||||
|
- `ucie_{dir}.c{i}` → UCIe conn ↔ router edge
|
||||||
|
|
||||||
|
Router-to-router XY mesh edges: bidirectional edges between adjacent routers.
|
||||||
|
Null routers (HBM exclusion zones) are skipped.
|
||||||
|
|
||||||
|
#### 1:1 Mode Extension (To Be Implemented Later)
|
||||||
|
|
||||||
|
In 1:1 mode, each router differentiates (i.e., is split) into N channel mini-routers.
|
||||||
|
Per-channel routing and ChannelSplitter (LA → per-channel PA) introduction are required.
|
||||||
|
N GEMM engines per PE are also added at this point.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Cross-PE HBM Access (n:1 Mode)
|
||||||
|
|
||||||
|
In n:1 mode, when a PE accesses another PE's local HBM,
|
||||||
|
it hops through the XY mesh in cube_mesh.yaml to reach the target PE's router.
|
||||||
|
|
||||||
|
Example: PE0 (r0c0) accessing PE2's (r1c4) HBM:
|
||||||
|
|
||||||
|
```text
|
||||||
|
PE0.pe_dma → r0c0 → r0c1 → r0c2 → r0c3 → r0c4 → r1c4 → hbm_ctrl
|
||||||
|
```
|
||||||
|
|
||||||
|
The Dijkstra router finds the shortest path in the mesh.
|
||||||
|
|
||||||
|
Cross-PE channel access in 1:1 mode will be defined during the 1:1 extension in D3.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D5. n:1 Mode: Uses cube_mesh.yaml Router Mesh
|
||||||
|
|
||||||
|
In n:1 mode, no separate "aggregated router" is created.
|
||||||
|
The existing router grid from cube_mesh.yaml serves that role.
|
||||||
|
|
||||||
|
#### Connection Structure
|
||||||
|
|
||||||
|
PE_DMA, PE_CPU, and HBM are all connected to the router where each PE is attached:
|
||||||
|
|
||||||
|
```text
|
||||||
|
sip0.cube0.pe0.pe_dma ←→ sip0.cube0.r0c0 (bw: N × channel_bw_gbs)
|
||||||
|
sip0.cube0.hbm_ctrl ←→ sip0.cube0.r0c0 (bw: N × channel_bw_gbs)
|
||||||
|
```
|
||||||
|
|
||||||
|
Routers are connected via XY mesh edges. PE's local HBM access goes
|
||||||
|
directly from its own router (switching overhead only).
|
||||||
|
|
||||||
|
#### n:1 Mode Full Data Paths
|
||||||
|
|
||||||
|
**Local HBM (0 hops):**
|
||||||
|
```text
|
||||||
|
PE0.pe_dma → r0c0 → hbm_ctrl (switching overhead only)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Remote HBM (mesh hops):**
|
||||||
|
```text
|
||||||
|
PE0.pe_dma → r0c0 → r0c1 → ... → r1c4 → hbm_ctrl
|
||||||
|
```
|
||||||
|
|
||||||
|
**M_CPU DMA:**
|
||||||
|
```text
|
||||||
|
M_CPU → r2c0 → (mesh hops) → r{x}c{y} → hbm_ctrl
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D6. All Traffic Is Unified onto the Same Router Mesh
|
||||||
|
|
||||||
|
- All memory accesses (DMA data) and commands (PE_CPU) use the same router mesh
|
||||||
|
- Local access does not use a separate fast path (xbar)
|
||||||
|
- Cross-cube (remote) access path:
|
||||||
|
|
||||||
|
```text
|
||||||
|
PE_DMA → r{x}c{y} → (mesh hops) → ucie_conn → ucie-{PORT}
|
||||||
|
→ [UCIe link] → remote ucie → remote conn → remote r{x}c{y} → hbm_ctrl
|
||||||
|
```
|
||||||
|
|
||||||
|
UCIe connections maintain the existing structure,
|
||||||
|
but both endpoints become mesh routers instead of xbars.
|
||||||
|
|
||||||
|
The number of UCIe lines is determined by BW ratio: `ucie_lines_per_side = ceil(ucie_bw / noc_line_bw)`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D7. AddressResolver Changes
|
||||||
|
|
||||||
|
Current `AddressResolver.resolve()`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Current: HBM offset → pe_slice → "sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
|
||||||
|
pe_slice = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
|
||||||
|
return f"sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
|
||||||
|
```
|
||||||
|
|
||||||
|
After change:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Changed: HBM → single endpoint
|
||||||
|
return f"sip{s}.cube{c}.hbm_ctrl"
|
||||||
|
```
|
||||||
|
|
||||||
|
The pe_slice calculation is removed.
|
||||||
|
In n:1 mode, PE_DMA directly accesses the hbm_ctrl attached to its own router.
|
||||||
|
|
||||||
|
resolver.resolve() is retained for external access (M_CPU DMA, etc.) and backward compatibility.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D8. topology.yaml Configuration Changes
|
||||||
|
|
||||||
|
#### Added Settings
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
cube:
|
||||||
|
memory_map:
|
||||||
|
hbm_mapping_mode: n_to_one # one_to_one | n_to_one
|
||||||
|
hbm_pseudo_channels: 64 # total pseudo channel count
|
||||||
|
hbm_channels_per_pe: 8 # local channels per PE (= pseudo_channels / pes_per_cube)
|
||||||
|
hbm_channel_bw_gbs: 32.0 # per-channel bandwidth (GB/s)
|
||||||
|
hbm_total_gb_per_cube: 48 # retained
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Removed Settings
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# To be removed
|
||||||
|
links:
|
||||||
|
xbar_to_hbm_bw_gbs: 256.0 # → replaced by channel_bw_gbs × channels_per_pe
|
||||||
|
xbar_to_hbm_mm: 2.5 # → replaced by ch_router_to_hbm_mm
|
||||||
|
xbar_to_bridge_bw_gbs: 128.0 # → removed (no bridge)
|
||||||
|
xbar_to_bridge_mm: 3.0 # → removed
|
||||||
|
noc_to_xbar_bw_gbs: ... # → removed
|
||||||
|
noc_to_xbar_mm: ... # → removed
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Added Link Settings
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
links:
|
||||||
|
router_link_bw_gbs: 256.0 # XY mesh link BW between routers
|
||||||
|
router_overhead_ns: 2.0 # router switching overhead
|
||||||
|
pe_to_router_bw_gbs: 256.0 # PE_DMA ↔ router
|
||||||
|
hbm_to_router_bw_gbs: 256.0 # HBM ↔ router (= N × channel_bw)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D9. Bandwidth Numerical Consistency
|
||||||
|
|
||||||
|
| Configuration | Value |
|
||||||
|
| ---- | --- |
|
||||||
|
| pseudo channels per cube | 64 (parameter) |
|
||||||
|
| PEs per cube | 8 (parameter) |
|
||||||
|
| channels per PE (N) | `pseudo_channels / pes_per_cube` = 8 |
|
||||||
|
| per-channel BW | 32 GB/s (parameter) |
|
||||||
|
| per-PE local BW | N × 32 = 256 GB/s |
|
||||||
|
| cube total HBM BW | 64 × 32 = 2048 GB/s |
|
||||||
|
|
||||||
|
The effective BW per PE is identical in both modes:
|
||||||
|
|
||||||
|
- 1:1 mode: N channel links × channel_bw_gbs = N × 32 = 256 GB/s
|
||||||
|
- n:1 mode: 1 aggregated link = N × channel_bw_gbs = 256 GB/s
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- The router mesh based on cube_mesh.yaml accurately reflects physical placement
|
||||||
|
- In n:1 mode, the existing VA scheme is preserved, keeping transition costs low
|
||||||
|
- Local, remote, and command traffic are unified onto the same mesh, which simplifies the model
|
||||||
|
- Aligns well with graph compiler-based topology generation
|
||||||
|
- Channel count and PE count are both parameterized, enabling testing of various configurations
|
||||||
|
- 1:1 mode extension naturally follows through router differentiation
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- The number of SimPy nodes increases due to explicit router nodes (6x6 grid with null routers skipped: up to 32 routers/cube)
|
||||||
|
- Requires complete rewrite of existing xbar/bridge/single NOC-based tests
|
||||||
|
- The internal contention model of TwoDMeshNocComponent needs to be replaced with a per-router model
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Alternatives
|
||||||
|
|
||||||
|
### A1. Retain Existing xbar + HBM Slices
|
||||||
|
|
||||||
|
- Local/remote paths remain bifurcated
|
||||||
|
- Cannot model at pseudo-channel granularity
|
||||||
|
- Cannot switch between 1:1/n:1 modes
|
||||||
|
|
||||||
|
### A2. Always Generate Per-Channel Links and Aggregate Only in n:1
|
||||||
|
|
||||||
|
- The topology is always built at full 1:1 (per-channel) size, even when running in n:1 mode
|
||||||
|
- Expressing n:1 semantics via link aggregation is complex
|
||||||
|
- No reduction in router node count
|
||||||
|
|
||||||
|
### A3. Gradual Transition (Retain xbar + Add NOC Path)
|
||||||
|
|
||||||
|
- Higher compatibility, but dual-path coexistence increases complexity
|
||||||
|
- Since xbar removal is ultimately necessary, the intermediate step provides little value
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Notes
|
||||||
|
|
||||||
|
### topology/builder.py Change Details
|
||||||
|
|
||||||
|
#### Code to Remove (within current `_instantiate_cube()`)
|
||||||
|
|
||||||
|
- xbar_top, xbar_bot node creation (~line 495-508)
|
||||||
|
- bridge.left, bridge.right node creation
|
||||||
|
- noc ↔ xbar edge creation (~line 540-555)
|
||||||
|
- xbar ↔ hbm_ctrl.slice edge creation (~line 510-538)
|
||||||
|
- xbar ↔ bridge edge creation (~line 557-572)
|
||||||
|
|
||||||
|
#### Code to Add
|
||||||
|
|
||||||
|
1:1 mode:
|
||||||
|
|
||||||
|
```python
|
||||||
|
N = hbm_channels_per_pe # from topology config
|
||||||
|
total_ch = hbm_pseudo_channels
|
||||||
|
|
||||||
|
# Create channel router nodes
|
||||||
|
for ch_id in range(total_ch):
|
||||||
|
pe_id = ch_id // N
|
||||||
|
nodes[f"{cp}.ch_r{ch_id}"] = Node(
|
||||||
|
id=f"{cp}.ch_r{ch_id}", kind="noc_router", impl="noc_v1",
|
||||||
|
attrs={}, pos_mm=(...), # horizontal row = ch_id % N
|
||||||
|
)
|
||||||
|
|
||||||
|
# PE_DMA ↔ local channel router edges
|
||||||
|
for pe_id in range(pes_per_cube):
|
||||||
|
for local_ch in range(N):
|
||||||
|
ch_id = pe_id * N + local_ch
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.pe{pe_id}.pe_dma", dst=f"{cp}.ch_r{ch_id}",
|
||||||
|
bw_gbs=channel_bw, kind="pe_to_ch_router", ...))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.ch_r{ch_id}", dst=f"{cp}.pe{pe_id}.pe_dma",
|
||||||
|
bw_gbs=channel_bw, kind="ch_router_to_pe", ...))
|
||||||
|
|
||||||
|
# Channel router ↔ hbm_ctrl edges
|
||||||
|
for ch_id in range(total_ch):
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.ch_r{ch_id}", dst=f"{cp}.hbm_ctrl",
|
||||||
|
bw_gbs=channel_bw, kind="ch_router_to_hbm", ...))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.hbm_ctrl", dst=f"{cp}.ch_r{ch_id}",
|
||||||
|
bw_gbs=channel_bw, kind="hbm_to_ch_router", ...))
|
||||||
|
|
||||||
|
# Horizontal line edges (same logical index)
|
||||||
|
for row in range(N):
|
||||||
|
for p in range(pes_per_cube - 1):
|
||||||
|
ch_a = p * N + row
|
||||||
|
ch_b = (p + 1) * N + row
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.ch_r{ch_a}", dst=f"{cp}.ch_r{ch_b}",
|
||||||
|
bw_gbs=ch_horizontal_bw, kind="ch_horizontal", ...))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.ch_r{ch_b}", dst=f"{cp}.ch_r{ch_a}",
|
||||||
|
bw_gbs=ch_horizontal_bw, kind="ch_horizontal", ...))
|
||||||
|
```
|
||||||
|
|
||||||
|
n:1 mode:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Create aggregated router nodes
|
||||||
|
for pe_id in range(pes_per_cube):
|
||||||
|
nodes[f"{cp}.pe{pe_id}.agg_router"] = Node(
|
||||||
|
id=f"{cp}.pe{pe_id}.agg_router", kind="noc_router", impl="noc_v1",
|
||||||
|
attrs={}, pos_mm=(...),
|
||||||
|
)
|
||||||
|
|
||||||
|
agg_bw = N * channel_bw # aggregated BW
|
||||||
|
|
||||||
|
# PE_DMA ↔ aggregated router
|
||||||
|
for pe_id in range(pes_per_cube):
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.pe{pe_id}.pe_dma", dst=f"{cp}.pe{pe_id}.agg_router",
|
||||||
|
bw_gbs=agg_bw, kind="pe_to_agg_router", ...))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.pe{pe_id}.agg_router", dst=f"{cp}.pe{pe_id}.pe_dma",
|
||||||
|
bw_gbs=agg_bw, kind="agg_router_to_pe", ...))
|
||||||
|
|
||||||
|
# Aggregated router ↔ hbm_ctrl
|
||||||
|
for pe_id in range(pes_per_cube):
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.pe{pe_id}.agg_router", dst=f"{cp}.hbm_ctrl",
|
||||||
|
bw_gbs=agg_bw, kind="agg_to_hbm", ...))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.hbm_ctrl", dst=f"{cp}.pe{pe_id}.agg_router",
|
||||||
|
bw_gbs=agg_bw, kind="hbm_to_agg", ...))
|
||||||
|
|
||||||
|
# Horizontal links between aggregated routers
|
||||||
|
for p in range(pes_per_cube - 1):
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.pe{p}.agg_router", dst=f"{cp}.pe{p+1}.agg_router",
|
||||||
|
bw_gbs=agg_horizontal_bw, kind="agg_horizontal", ...))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{cp}.pe{p+1}.agg_router", dst=f"{cp}.pe{p}.agg_router",
|
||||||
|
bw_gbs=agg_horizontal_bw, kind="agg_horizontal", ...))
|
||||||
|
```
|
||||||
|
|
||||||
|
### Affected Existing Tests
|
||||||
|
|
||||||
|
| Test File | Impact |
|
||||||
|
| ---------- | ---- |
|
||||||
|
| `tests/test_topology_compile.py` | Remove xbar/bridge node references, add channel router verification |
|
||||||
|
| `tests/test_topology_load.py` | Reflect topology.yaml configuration changes |
|
||||||
|
| `tests/test_pe_components.py` | PE_DMA routing path changes |
|
||||||
|
| `tests/test_sip_parallel.py` | Cross-PE access path changes |
|
||||||
|
| Cases that directly test xbar/bridge | Remove |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Requirements
|
||||||
|
|
||||||
|
- Verify that requests are delivered via per-channel links in 1:1 mode
|
||||||
|
- Verify that requests are delivered via the aggregated link in n:1 mode
|
||||||
|
- Verify that topology is correctly generated in both modes:
|
||||||
|
- 1:1: `total_ch` channel routers + per-PE links + horizontal links
|
||||||
|
- n:1: `pes_per_cube` aggregated routers + per-PE links
|
||||||
|
- Verify that effective BW is consistent across both modes for the same workload
|
||||||
|
- Verify that horizontal line routing works for cross-PE access
|
||||||
|
- Verify that routing through UCIe works for cross-cube access
|
||||||
|
- Verify that topology generation is correct under parameter variations (channels_per_pe = 4, 8, 16, etc.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- ADR-0018 (LA + BAAW) → addressing-side integration
|
||||||
|
- ADR-0017 (Cube NOC 2D Mesh) → this ADR replaces the xbar/bridge portion
|
||||||
|
- ADR-0004 (Memory Semantics) → BW model redefinition
|
||||||
|
- ADR-0014 (PE Internal Execution Model) → impact from PE_DMA path changes
|
||||||
@@ -0,0 +1,553 @@
|
|||||||
|
# ADR-0020: 2-Pass Data Execution Model (Timing / Data Separation)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
The current simulation models **timing only**.
|
||||||
|
`tl.load()`, `tl.composite(op="gemm")`, etc. generate SimPy latencies,
|
||||||
|
but do not actually read tensor data or perform computations.
|
||||||
|
|
||||||
|
### Required Capabilities
|
||||||
|
|
||||||
|
1. Must be able to store and read actual data in HBM/TCM/SRAM
|
||||||
|
2. PE_GEMM, PE_MATH must be able to perform actual matrix operations and verify results
|
||||||
|
3. Must minimize simulation performance degradation
|
||||||
|
|
||||||
|
### Limitations of the Existing Kernel Execution Structure
|
||||||
|
|
||||||
|
The current kernel execution is separated into the following stages:
|
||||||
|
|
||||||
|
```
|
||||||
|
Phase 0: Kernel function execution in TLContext → PeCommand list generation (outside SimPy, no data)
|
||||||
|
Phase 1: PE_CPU replays PeCommand list via SimPy (timing only)
|
||||||
|
```
|
||||||
|
|
||||||
|
Phase 0 requires the kernel to **complete execution entirely** before SimPy begins.
|
||||||
|
`tl.load()` returns a TensorHandle (placeholder), so actual data cannot be accessed.
|
||||||
|
Therefore, branching based on data values (dynamic control flow) is impossible.
|
||||||
|
|
||||||
|
This ADR resolves this limitation **for memory operations only** (see D1, D3).
|
||||||
|
|
||||||
|
### Constraints
|
||||||
|
|
||||||
|
- SimPy is a single-threaded event loop — running numpy matmul inside it blocks everything
|
||||||
|
- Components must be replaceable (ADR-0015) — framework requirements must not leak into implementations
|
||||||
|
- Benchmark kernels are imperative code (tl.load → tl.composite → tl.wait) — the same code must be reused
|
||||||
|
- Kernel functions must remain plain Python functions (no generator/async transformation)
|
||||||
|
|
||||||
|
### Design Exploration Results
|
||||||
|
|
||||||
|
| Option | Approach | Verdict |
|
||||||
|
|--------|----------|---------|
|
||||||
|
| Direct execution in SimPy | Call numpy GEMM inside SimPy | Rejected: blocks the single-threaded event loop |
|
||||||
|
| SimPy + ThreadPool | future.submit → timeout → result() | Rejected: blocks on result() for back-to-back requests |
|
||||||
|
| Symbolic + lazy | Track metadata only, execute later | Rejected: difficult to handle control-flow-dependent reads |
|
||||||
|
| **2-pass (adopted)** | Phase 1: timing, Phase 2: data | Full separation, no performance impact |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. 2-Pass Execution Model — Phase 0 Elimination
|
||||||
|
|
||||||
|
The existing 3 stages (Phase 0 → Phase 1 → Phase 2) are **consolidated into 2 stages**.
|
||||||
|
|
||||||
|
Before:
|
||||||
|
```
|
||||||
|
Phase 0: Kernel → PeCommand list (no data, no branching)
|
||||||
|
Phase 1: Replay PeCommand list via SimPy (timing only)
|
||||||
|
```
|
||||||
|
|
||||||
|
After:
|
||||||
|
```
|
||||||
|
Phase 1 (timing): Kernel + SimPy integrated execution — greenlet-based
|
||||||
|
- Memory read/write: SimPy timing + MemoryStore actual data
|
||||||
|
- Compute (GEMM/Math): SimPy timing + op_log recording (actual computation in Phase 2)
|
||||||
|
- Dynamic control flow possible (tl.load returns actual data)
|
||||||
|
|
||||||
|
Phase 2 (data): Actual computation execution based on op_log — outside SimPy, parallelizable
|
||||||
|
```
|
||||||
|
|
||||||
|
This ADR **extends Phase 1 to be data-aware for memory operations only**.
|
||||||
|
Phase 1 handles latency/BW bottleneck analysis + memory data tracking,
|
||||||
|
Phase 2 handles GEMM/Math computation correctness verification.
|
||||||
|
Phase 2 is optional — if only timing is needed, run Phase 1 alone.
|
||||||
|
|
||||||
|
### D2. Op Log Recording — ComponentBase Hook
|
||||||
|
|
||||||
|
Op log recording is performed as a **hook in the component base class**.
|
||||||
|
Individual component implementations are not modified.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class ComponentBase:
|
||||||
|
def _on_process_start(self, env, msg):
|
||||||
|
if self._op_logger and getattr(msg, 'data_op', False):
|
||||||
|
self._op_logger.record_start(env.now, self.node.id, msg)
|
||||||
|
|
||||||
|
def _on_process_end(self, env, msg):
|
||||||
|
if self._op_logger and getattr(msg, 'data_op', False):
|
||||||
|
self._op_logger.record_end(env.now, self.node.id, msg)
|
||||||
|
```
|
||||||
|
|
||||||
|
Hooks are called before and after `run()` within `_forward_txn()`.
|
||||||
|
`_op_logger` is optional — zero overhead when absent.
|
||||||
|
|
||||||
|
**Hook timing definitions**:
|
||||||
|
|
||||||
|
| Timing | Meaning |
|
||||||
|
|--------|---------|
|
||||||
|
| `t_start` | The point at which the component **begins servicing** the msg (immediately before `run()` entry) |
|
||||||
|
| `t_end` | The point at which the component's **internal service completes** (immediately after `run()` returns) |
|
||||||
|
|
||||||
|
Link traversal latency is not included in t_start/t_end.
|
||||||
|
Link latency is observed as the difference between the sending component's t_end and the receiving component's t_start.
|
||||||
|
|
||||||
|
### D3. Greenlet-Based Kernel Execution — Phase 0 Elimination
|
||||||
|
|
||||||
|
The existing Phase 0 (kernel → PeCommand list) is eliminated,
|
||||||
|
and **greenlet** is used to cooperatively interleave kernel and SimPy execution.
|
||||||
|
|
||||||
|
#### Operating Principle
|
||||||
|
|
||||||
|
greenlet is a C extension that provides cooperative context switching.
|
||||||
|
When the kernel (child greenlet) calls `tl.load()` etc., it switches to the SimPy loop (parent greenlet)
|
||||||
|
to perform timing simulation, and after completion, returns to the kernel with actual data.
|
||||||
|
|
||||||
|
```
|
||||||
|
SimPy loop (parent greenlet) Kernel (child greenlet)
|
||||||
|
───────────────────────── ──────────────────────
|
||||||
|
g.switch() ─────────────────────────→ Kernel starts
|
||||||
|
a = tl.load(ptr, ...)
|
||||||
|
internal: parent.switch(DmaReadCmd)
|
||||||
|
cmd = DmaReadCmd ←────────────────── (kernel paused)
|
||||||
|
yield DmaReadMsg(...)
|
||||||
|
yield env.timeout(dma_latency)
|
||||||
|
data = memory_store.read(...)
|
||||||
|
g.switch(data) ─────────────────────→ (kernel resumed)
|
||||||
|
a = data ← actual numpy array
|
||||||
|
if a[0][0] > 0.5: ← branching possible
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
The kernel is maintained as a **plain Python function**.
|
||||||
|
greenlet switches exist **only within the internal implementation** of `tl.load()`, `tl.store()`, etc.
|
||||||
|
|
||||||
|
#### KernelRunner — Framework Layer
|
||||||
|
|
||||||
|
The greenlet loop resides not in the PE_CPU component but in the framework layer,
|
||||||
|
**KernelRunner**.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# KernelRunner (framework — greenlet ↔ SimPy bridge)
|
||||||
|
class KernelRunner:
|
||||||
|
def run(self, env, kernel_fn, args, store):
|
||||||
|
g = greenlet(self._run_kernel)
|
||||||
|
cmd = g.switch(kernel_fn, args)
|
||||||
|
|
||||||
|
while cmd is not None:
|
||||||
|
if isinstance(cmd, DmaReadCmd):
|
||||||
|
yield from self._dispatch_dma(env, cmd)
|
||||||
|
data = store.read(cmd.src_space, cmd.src_addr, cmd.shape, cmd.dtype)
|
||||||
|
cmd = g.switch(data) # resume with actual data
|
||||||
|
elif isinstance(cmd, GemmCmd):
|
||||||
|
yield from self._dispatch_gemm(env, cmd)
|
||||||
|
cmd = g.switch() # resume (no data)
|
||||||
|
elif isinstance(cmd, DmaWriteCmd):
|
||||||
|
store.write(cmd.dst_space, cmd.dst_addr, cmd.data) # visibility = issue time
|
||||||
|
yield from self._dispatch_dma(env, cmd) # timing only
|
||||||
|
cmd = g.switch()
|
||||||
|
|
||||||
|
# PE_CPU (component — kept simple, unaware of greenlet)
|
||||||
|
def _execute_kernel(self, env):
|
||||||
|
runner = KernelRunner(self.ctx)
|
||||||
|
yield from runner.run(env, kernel_fn, args, store)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Op logging single source of truth**: KernelRunner does not record directly to op_log.
|
||||||
|
All op logging is handled **solely by the ComponentBase hook (_on_process_start/end)**.
|
||||||
|
When KernelRunner delivers messages to components via `_dispatch_gemm()` etc.,
|
||||||
|
the component base class hooks automatically record them.
|
||||||
|
|
||||||
|
**Layer separation**:
|
||||||
|
- **Kernel code**: plain function, unaware of greenlet
|
||||||
|
- **TLContext**: calls `parent.switch(cmd)` inside `tl.load()`
|
||||||
|
- **KernelRunner**: greenlet ↔ SimPy bridge, handles MemoryStore read/write. **Does not log**.
|
||||||
|
- **ComponentBase hook**: the sole path for op_log recording
|
||||||
|
- **PE_CPU**: only calls KernelRunner, replaceable as a component
|
||||||
|
|
||||||
|
#### Handling Differences Between Memory Read/Write and Compute
|
||||||
|
|
||||||
|
| Operation | In Phase 1 | In Phase 2 |
|
||||||
|
|-----------|-----------|-----------|
|
||||||
|
| `tl.load()` | SimPy timing + MemoryStore read → **actual data returned** | — |
|
||||||
|
| `tl.store()` | SimPy timing + MemoryStore write → **actual write** | — |
|
||||||
|
| `tl.composite(gemm)` | SimPy timing + **op_log recording only** | numpy actual computation |
|
||||||
|
| `tl.dot()` / math ops | SimPy timing + **op_log recording only** | numpy actual computation |
|
||||||
|
|
||||||
|
Memory read/write is processed immediately in Phase 1 (numpy slice, fast).
|
||||||
|
GEMM/Math operations are batch-executed in Phase 2 (performance separation).
|
||||||
|
|
||||||
|
#### Store Visibility Rule
|
||||||
|
|
||||||
|
`tl.store()` is **immediately reflected in MemoryStore at issue time** (visibility = issue).
|
||||||
|
SimPy DMA timing is simulated separately afterward.
|
||||||
|
|
||||||
|
This is an intentional separation of timing and visibility:
|
||||||
|
- **visibility**: the point at which it is reflected in MemoryStore = when `store.write()` is called
|
||||||
|
- **timing**: the point at which DMA latency completes in SimPy
|
||||||
|
|
||||||
|
This separation allows a load immediately after a store to see the latest data in dynamic control flow.
|
||||||
|
|
||||||
|
#### Result Handle Semantics
|
||||||
|
|
||||||
|
`tl.composite()` (sync/async) returns a **handle** referencing the result tensor.
|
||||||
|
|
||||||
|
The key contract in Phase 1:
|
||||||
|
|
||||||
|
1. **All compute handles are always considered pending in Phase 1.**
|
||||||
|
2. `tl.wait(handle)` **expresses timing synchronization only**
|
||||||
|
and does not make the handle ready.
|
||||||
|
3. Accessing the handle's actual result data (`handle.data`, element access,
|
||||||
|
numpy conversion, etc.) is **only possible in Phase 2**.
|
||||||
|
4. Therefore, **compute-result-based control flow is not supported in Phase 1.**
|
||||||
|
5. In contrast, `tl.load()` returns actual data in Phase 1, so
|
||||||
|
**memory-read-based control flow is supported**.
|
||||||
|
|
||||||
|
| Handle state | Phase | Allowed operations |
|
||||||
|
|------------|-------|----------|
|
||||||
|
| pending | Phase 1 | `tl.wait(handle)` — timing synchronization only |
|
||||||
|
| pending | Phase 1 | Pass handle as target of `tl.store()` (logical destination binding only, payload in Phase 2) |
|
||||||
|
| pending | Phase 1 | **Data access not allowed** — value-based branching not possible |
|
||||||
|
| ready | Phase 2 | Actual numpy data access, verification |
|
||||||
|
|
||||||
|
This restriction is intentional. If computations were executed in Phase 1,
|
||||||
|
the SimPy single-thread would block, defeating the purpose of 2-pass separation.
|
||||||
|
|
||||||
|
#### Phase 1 Materialization — Future Extension
|
||||||
|
|
||||||
|
If Phase 1 eager execution becomes necessary for small operations
|
||||||
|
(scalar, small reduction) in the future, selective materialization can be supported
|
||||||
|
by adding a `materialized_in_phase1: bool` flag to the op record.
|
||||||
|
This is not implemented in the current scope.
|
||||||
|
|
||||||
|
### D4. data_op Flag — Message Self-Declaration
|
||||||
|
|
||||||
|
The logging target is determined by the `data_op` attribute on the message instance,
|
||||||
|
not by message type. The framework does not hardcode message types.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MsgBase:
|
||||||
|
data_op: bool = False # default: no logging
|
||||||
|
|
||||||
|
class DmaReadCmd(MsgBase):
|
||||||
|
data_op = True # memory transfer → logging
|
||||||
|
|
||||||
|
class GemmCmd(MsgBase):
|
||||||
|
data_op = True # compute → logging
|
||||||
|
|
||||||
|
class MathCmd(MsgBase):
|
||||||
|
data_op = True # compute → logging
|
||||||
|
```
|
||||||
|
|
||||||
|
When adding a new message type (e.g., IpcqMsg), simply setting `data_op = True`
|
||||||
|
enables automatic logging without modifying framework code.
|
||||||
|
|
||||||
|
### D5. Op Log Structure
|
||||||
|
|
||||||
|
#### Op Classification Scheme
|
||||||
|
|
||||||
|
A two-level classification is used:
|
||||||
|
|
||||||
|
| Level | Field | Role |
|
||||||
|
|-------|-------|------|
|
||||||
|
| `op_kind` | `memory` \| `gemm` \| `math` | executor dispatch criterion |
|
||||||
|
| `op_name` | `dma_read` \| `dma_write` \| `gemm_f16` \| `exp` \| `add` \| `sum` etc. | specific operation identification |
|
||||||
|
|
||||||
|
#### OpRecord Definition
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class OpRecord:
|
||||||
|
t_start: float # SimPy time (ns) — service start
|
||||||
|
t_end: float # SimPy time (ns) — service completion
|
||||||
|
component_id: str # e.g. "sip0.cube0.pe0.pe_gemm"
|
||||||
|
op_kind: str # "memory" | "gemm" | "math"
|
||||||
|
op_name: str # specific operation name
|
||||||
|
params: dict # per-operation parameters (see below)
|
||||||
|
dependency_ids: list[int] # currently based on in-memory record index, may be replaced with stable op_id in the future
|
||||||
|
```
|
||||||
|
|
||||||
|
#### dependency_ids Generation Rules
|
||||||
|
|
||||||
|
`dependency_ids` is **optional**, and by default the executor performs
|
||||||
|
address-based dependency inference (see D6).
|
||||||
|
|
||||||
|
Explicit setting is only needed when precise execution ordering is required:
|
||||||
|
- **Default (address-based inference)**: the executor analyzes read/write sets to
|
||||||
|
automatically infer RAW/WAW/WAR dependencies. This is sufficient for most cases.
|
||||||
|
- **Explicit setting**: set when logical dependencies cannot be expressed via addresses
|
||||||
|
at the TLContext or command generation stage.
|
||||||
|
Example: completion handle-based synchronization — handle dependencies depend on
|
||||||
|
logical completion order rather than memory addresses, so they cannot be captured
|
||||||
|
by address inference.
|
||||||
|
|
||||||
|
#### op_log Ordering
|
||||||
|
|
||||||
|
The op_log maintains **stable ordering** based on `t_start`.
|
||||||
|
Records with the same `t_start` preserve insertion order.
|
||||||
|
|
||||||
|
#### params Details
|
||||||
|
|
||||||
|
**memory (dma_read / dma_write)**:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"src_addr": int, # source address (byte)
|
||||||
|
"dst_addr": int, # destination address (byte)
|
||||||
|
"nbytes": int, # transfer size
|
||||||
|
"src_space": str, # "hbm" | "tcm" | "sram"
|
||||||
|
"dst_space": str, # "hbm" | "tcm" | "sram"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**gemm**:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"src_a_addr": int, # operand A address
|
||||||
|
"src_b_addr": int, # operand B address
|
||||||
|
"dst_addr": int, # output address
|
||||||
|
"shape_a": tuple, # e.g. (128, 256)
|
||||||
|
"shape_b": tuple, # e.g. (256, 128)
|
||||||
|
"shape_out": tuple, # e.g. (128, 128)
|
||||||
|
"dtype_in": str, # e.g. "f16"
|
||||||
|
"dtype_acc": str, # accumulation dtype, e.g. "f32"
|
||||||
|
"dtype_out": str, # output dtype, e.g. "f16"
|
||||||
|
"transpose_a": bool,
|
||||||
|
"transpose_b": bool,
|
||||||
|
"layout_a": str, # "row_major" | "col_major"
|
||||||
|
"layout_b": str,
|
||||||
|
"layout_out": str,
|
||||||
|
"addr_space": str, # "tcm" (GEMM operands are always in TCM)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**math**:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"op": str, # "exp" | "add" | "sum" | "where" | ...
|
||||||
|
"input_addrs": list[int], # list of operand addresses
|
||||||
|
"input_shapes": list[tuple],
|
||||||
|
"dst_addr": int,
|
||||||
|
"shape_out": tuple,
|
||||||
|
"dtype": str,
|
||||||
|
"axis": int | None, # reduction axis
|
||||||
|
"addr_space": str, # "tcm"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### D6. Phase 2 Executor
|
||||||
|
|
||||||
|
Phase 2 executes the op_log outside of SimPy.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class DataExecutor:
|
||||||
|
def __init__(self, op_log: list[OpRecord], initial_store: MemoryStore):
|
||||||
|
self.store = initial_store # Takes the Phase 1 MemoryStore snapshot as input
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
for t, ops in groupby(op_log, key=lambda o: o.t_start):
|
||||||
|
batch = list(ops)
|
||||||
|
independent, sequential = self._classify(batch)
|
||||||
|
self._execute_parallel(independent)
|
||||||
|
self._execute_sequential(sequential)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parallel execution determination**:
|
||||||
|
|
||||||
|
Ops with the same `t_start` are considered **parallel candidates**.
|
||||||
|
The executor determines actual parallel execution based on the following criteria:
|
||||||
|
- Whether read/write address ranges overlap (WAW, RAW, WAR conflict checks)
|
||||||
|
- Whether predecessor ops specified in `dependency_ids` have completed
|
||||||
|
|
||||||
|
Only ops with no overlapping address ranges and no explicit dependencies are executed in parallel.
|
||||||
|
|
||||||
|
**Batch optimization**: Only independent ops with the same op_name **and identical
|
||||||
|
shape, dtype, layout, and transpose flags** are eligible for batching.
|
||||||
|
Example: identical shape GEMMs from multiple PEs → bundled into a single `np.matmul(a_batch, b_batch)` call.
|
||||||
|
Improves BLAS efficiency on CPU, reduces launch overhead on GPU.
|
||||||
|
|
||||||
|
**Phase 2 execution order guarantee**:
|
||||||
|
|
||||||
|
Phase 2 does not consider data arrival timing,
|
||||||
|
and guarantees execution order solely through
|
||||||
|
dependencies (address-based inference + explicit dependency_ids).
|
||||||
|
|
||||||
|
### D7. Memory Store
|
||||||
|
|
||||||
|
`MemoryStore` logically follows byte-addressable semantics,
|
||||||
|
and the current implementation uses **tensor-granular storage** (addr → numpy ndarray mapping).
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MemoryStore:
|
||||||
|
def write(self, space: str, addr: int, data: np.ndarray) -> None: ...
|
||||||
|
def read(self, space: str, addr: int, shape: tuple, dtype: str) -> np.ndarray: ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Internal storage format: numpy ndarray**
|
||||||
|
|
||||||
|
MemoryStore stores tensors as **numpy ndarrays**.
|
||||||
|
|
||||||
|
| Candidate | store/load speed | Phase 2 compute | Verdict |
|
||||||
|
|-----------|-----------------|-----------------|---------|
|
||||||
|
| **numpy ndarray** | Immediate (reference passing, no copy) | `np.matmul` directly usable | **Adopted** |
|
||||||
|
| bytearray | Requires memcpy | Requires `np.frombuffer` conversion | Rejected |
|
||||||
|
| torch tensor | Immediate | torch operations available | Use only for GPU optimization |
|
||||||
|
|
||||||
|
- write: **stores numpy array by reference** (no copy) → Phase 1 overhead = 1 dict lookup
|
||||||
|
- read: **returns numpy array by reference** (no copy)
|
||||||
|
- Re-writing to the same addr **overwrites at tensor granularity** (partial overwrite not supported)
|
||||||
|
- dtype uses numpy dtypes (`np.float16`, `np.float32`, etc.). NumPy has no native `bfloat16`; bf16 tensors require an extension dtype such as `ml_dtypes.bfloat16`
|
||||||
|
- For byte-level access, convert via `.view(np.uint8)`
|
||||||
|
- For GPU batch optimization in Phase 2, numpy → torch tensor conversion is the executor's responsibility
|
||||||
|
|
||||||
|
**read/write contract**:
|
||||||
|
|
||||||
|
- read/write operates on a **contiguous tensor** basis.
|
||||||
|
If non-contiguous stride views are needed, express them as separate copy ops.
|
||||||
|
- In the normal benchmark path, producer/consumer dtype match is expected.
|
||||||
|
Reinterpret cast is a permissive behavior for low-level memory validation
|
||||||
|
or special test cases.
|
||||||
|
- addr is byte-aligned, with minimum alignment = dtype size.
|
||||||
|
- dtype mismatch (reading with a different dtype than written) is handled as a reinterpret cast.
|
||||||
|
Shape mismatch is verified based on nbytes, and raises an error on mismatch.
|
||||||
|
- Correctness criteria follow address-range-based read/write semantics.
|
||||||
|
- A tensor object cache may be used as an implementation optimization,
|
||||||
|
but the canonical state is byte-addressable storage.
|
||||||
|
- At deploy time, the host injects initial tensor data.
|
||||||
|
|
||||||
|
### D8. Benchmark Kernel Code
|
||||||
|
|
||||||
|
The benchmark's **user code API is not changed**.
|
||||||
|
The call interfaces for `tl.load()`, `tl.composite()`, `tl.store()`, etc. are maintained.
|
||||||
|
|
||||||
|
However, internal command/message schemas may be extended to include metadata
|
||||||
|
required for Phase 2 execution (e.g., additional fields such as dtype_acc, transpose).
|
||||||
|
|
||||||
|
### D9. No Component Changes
|
||||||
|
|
||||||
|
Individual component implementations (PE_GEMM, PE_DMA, HBM_CTRL, etc.) are not modified.
|
||||||
|
Op log recording is the responsibility of the ComponentBase hook.
|
||||||
|
When custom components are replaced, only the timing model changes,
|
||||||
|
and Phase 2 data execution is unaffected.
|
||||||
|
|
||||||
|
### D10. Phase 2 is Optional
|
||||||
|
|
||||||
|
```python
|
||||||
|
engine = GraphEngine(graph)
|
||||||
|
engine.run(benchmark) # Phase 1: timing only
|
||||||
|
result = engine.get_timing_result()
|
||||||
|
|
||||||
|
if verify_data:
|
||||||
|
executor = DataExecutor(engine.op_log, initial_store) # Phase 2: data (initial_store = Phase 1 MemoryStore snapshot)
|
||||||
|
executor.run()
|
||||||
|
executor.verify(expected_output)
|
||||||
|
```
|
||||||
|
|
||||||
|
If only timing analysis is needed, Phase 2 is skipped.
|
||||||
|
If the op_logger is deactivated, Phase 1 performance is identical to the original.
|
||||||
|
|
||||||
|
### D11. Verification Contract
|
||||||
|
|
||||||
|
Basic verification **compares the final output tensor** against a reference backend (numpy).
|
||||||
|
|
||||||
|
Per-dtype tolerance policy:
|
||||||
|
|
||||||
|
| dtype | Comparison method | Tolerance |
|
||||||
|
|-------|----------|-----------|
|
||||||
|
| f32 | `np.allclose` | rtol=1e-5, atol=1e-5 |
|
||||||
|
| f16 | `np.allclose` | rtol=1e-3, atol=1e-3 |
|
||||||
|
| bf16 | `np.allclose` | rtol=1e-2, atol=1e-2 |
|
||||||
|
| int types | `np.array_equal` | exact |
|
||||||
|
|
||||||
|
- Default mode: compare final output only (end-to-end correctness)
|
||||||
|
- Debug mode: can compare intermediate tensors on a per-op basis
|
||||||
|
(MemoryStore snapshot at each op boundary)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **Compute-result-based control flow**: not supported.
|
||||||
|
All compute handles are in pending state during Phase 1,
|
||||||
|
`wait()` expresses timing synchronization only and does not imply data readiness.
|
||||||
|
Accessing `handle.data`, element access, or truth-value evaluation in Phase 1
|
||||||
|
is **treated as an error**.
|
||||||
|
Memory-data-based branching (results of `tl.load()`) is supported via greenlet.
|
||||||
|
Phase 1 materialization is a future extension (see D3).
|
||||||
|
- **Cycle-accurate overlap reconstruction**: Phase 2 does not precisely reproduce
|
||||||
|
the execution time overlap from Phase 1. Phase 2 only verifies data correctness.
|
||||||
|
- **GPU kernel compilation**: GEMM/Math in Phase 2 are numpy/torch calls
|
||||||
|
and do not reproduce the actual hardware PE microarchitecture.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- **Aliasing / slice view**: How to represent slice/views referencing the same
|
||||||
|
backing storage in MemoryStore (stride-based view vs copy semantics)
|
||||||
|
- **IPCQ/descriptor read generalization**: Whether to fully generalize PE-to-PE
|
||||||
|
communication as memory ops or introduce a separate op_kind
|
||||||
|
- **Op log streaming**: Managing op_log memory usage in large-scale simulations
|
||||||
|
(in-memory list vs disk-backed streaming)
|
||||||
|
- **Fused operation**: Whether to record tl.composite's tiled pipeline
|
||||||
|
(READ→COMPUTE→WRITE) as a single fused op record or separate individual ops
|
||||||
|
- **Math op schema generalization**: The current math params have a simple structure,
|
||||||
|
but generalization may be needed for broadcasting rules, per-input dtype, keepdims,
|
||||||
|
scalar/immediate operands, where/mask expressions, etc.
|
||||||
|
- **Op record identifier**: Currently dependency_ids are based on in-memory list indices;
|
||||||
|
replacement with stable op_id is needed when introducing streaming/disk-backed mode
|
||||||
|
- **Phase 1 materialization policy**: See Future Extension in D3.
|
||||||
|
If allowed, the Phase 2 handling approach (skip / verify / recompute) for those ops
|
||||||
|
needs to be defined
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- Minimal impact on SimPy simulation performance (only op_log append added)
|
||||||
|
- Free to use multi-threading/GPU in Phase 2
|
||||||
|
- Component replaceability preserved (ADR-0015 design philosophy maintained)
|
||||||
|
- No changes needed to benchmark user code API
|
||||||
|
- When adding new message types, only set the data_op flag
|
||||||
|
- Phase 0 eliminated via greenlet — memory-data-based dynamic control flow supported
|
||||||
|
- `tl.load()` returns actual data, making kernel debugging easier
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- op_log memory usage (for large-scale simulations)
|
||||||
|
- Phase 2 execution time is proportional to tensor size (large GEMM)
|
||||||
|
- Dynamic branching based on pending handles (incomplete computations) not possible
|
||||||
|
(computations execute in Phase 2, result values are undetermined in Phase 1).
|
||||||
|
Memory-data-based branching is supported via greenlet.
|
||||||
|
- greenlet C extension dependency added (pip install greenlet)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected Files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/components/base.py` | Add `_on_process_start/end` hooks |
|
||||||
|
| `src/kernbench/common/pe_commands.py` | Add `data_op = True`, extend metadata fields |
|
||||||
|
| `src/kernbench/sim_engine/op_log.py` | New: OpRecord, OpLogger |
|
||||||
|
| `src/kernbench/sim_engine/data_executor.py` | New: DataExecutor, MemoryStore |
|
||||||
|
| `src/kernbench/sim_engine/engine.py` | op_logger injection (optional) |
|
||||||
|
| `src/kernbench/triton_emu/tl_context.py` | greenlet switch calls inside `tl.load()` etc. |
|
||||||
|
| `src/kernbench/triton_emu/kernel_runner.py` | New: KernelRunner (greenlet ↔ SimPy bridge) |
|
||||||
|
| `src/kernbench/components/builtin/pe_cpu.py` | Remove Phase 0, change to KernelRunner invocation |
|
||||||
|
| `pyproject.toml` | Add greenlet dependency |
|
||||||
|
|
||||||
|
Component implementation files (pe_gemm.py, pe_dma.py, hbm_ctrl.py, etc.): **no changes**
|
||||||
|
Benchmark kernels (benches/*.py): **no user API changes**
|
||||||
@@ -0,0 +1,550 @@
|
|||||||
|
# ADR-0020: 2-Pass 데이터 실행 모델 (타이밍 / 데이터 분리)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
현재 시뮬레이션은 **타이밍만** 모델링한다.
|
||||||
|
`tl.load()`, `tl.composite(op="gemm")` 등은 SimPy latency를 생성하지만,
|
||||||
|
실제 텐서 데이터를 읽거나 연산하지 않는다.
|
||||||
|
|
||||||
|
### 필요한 기능
|
||||||
|
|
||||||
|
1. HBM/TCM/SRAM에 실제 데이터를 저장하고 읽을 수 있어야 한다
|
||||||
|
2. PE_GEMM, PE_MATH가 실제 행렬 연산을 수행하고 결과를 검증할 수 있어야 한다
|
||||||
|
3. 시뮬레이션 성능 저하를 최소화해야 한다
|
||||||
|
|
||||||
|
### 기존 커널 실행 구조의 한계
|
||||||
|
|
||||||
|
현재 커널 실행은 3단계로 분리되어 있다:
|
||||||
|
|
||||||
|
```
|
||||||
|
Phase 0: TLContext에서 커널 함수 실행 → PeCommand 리스트 생성 (SimPy 밖, 데이터 없음)
|
||||||
|
Phase 1: PE_CPU가 PeCommand 리스트를 SimPy로 replay (타이밍만)
|
||||||
|
```
|
||||||
|
|
||||||
|
Phase 0에서 커널이 **전부 실행 완료**된 후에야 SimPy가 시작된다.
|
||||||
|
`tl.load()`는 TensorHandle(placeholder)을 반환하므로 실제 데이터에 접근할 수 없다.
|
||||||
|
따라서 데이터 값에 따른 분기(dynamic control flow)가 불가능하다.
|
||||||
|
|
||||||
|
본 ADR은 이 한계를 **메모리 연산에 한해** 해소한다 (D1, D3 참조).
|
||||||
|
|
||||||
|
### 제약 조건
|
||||||
|
|
||||||
|
- SimPy는 single-thread 이벤트 루프 — numpy matmul을 안에서 하면 전체가 block
|
||||||
|
- 컴포넌트는 교체 가능해야 한다 (ADR-0015) — 프레임워크 요구사항이 구현에 침투하면 안 됨
|
||||||
|
- 벤치마크 커널은 명령형 코드(tl.load → tl.composite → tl.wait) — 같은 코드를 재사용해야 함
|
||||||
|
- 커널 함수는 plain Python function으로 유지해야 한다 (generator/async 변환 불가)
|
||||||
|
|
||||||
|
### 설계 탐색 결과
|
||||||
|
|
||||||
|
| Option | 방식 | 판정 |
|
||||||
|
|--------|------|------|
|
||||||
|
| SimPy 내 직접 실행 | GEMM을 SimPy 안에서 numpy 호출 | 탈락: single-thread block |
|
||||||
|
| SimPy + ThreadPool | future.submit → timeout → result() | 탈락: back-to-back 요청 시 result()에서 block |
|
||||||
|
| Symbolic + lazy | 메타데이터만 추적, 나중에 실행 | 탈락: control-flow dependent 읽기 처리 곤란 |
|
||||||
|
| **2-pass (채택)** | Phase 1: 타이밍, Phase 2: 데이터 | 완전 분리, 성능 영향 없음 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. 2-Pass 실행 모델 — Phase 0 제거
|
||||||
|
|
||||||
|
기존의 3단계(Phase 0 → Phase 1 → Phase 2)를 Phase 0을 제거하여 **2단계(Phase 1 → Phase 2)로 통합**한다.
|
||||||
|
|
||||||
|
기존:
|
||||||
|
```
|
||||||
|
Phase 0: 커널 → PeCommand 리스트 (데이터 없음, 분기 불가)
|
||||||
|
Phase 1: PeCommand 리스트를 SimPy replay (타이밍만)
|
||||||
|
```
|
||||||
|
|
||||||
|
변경:
|
||||||
|
```
|
||||||
|
Phase 1 (타이밍): 커널 + SimPy 통합 실행 — greenlet 기반
|
||||||
|
- 메모리 읽기/쓰기: SimPy 타이밍 + MemoryStore 실제 데이터
|
||||||
|
- 연산 (GEMM/Math): SimPy 타이밍 + op_log 기록 (실제 연산은 Phase 2)
|
||||||
|
- dynamic control flow 가능 (tl.load가 실제 데이터 반환)
|
||||||
|
|
||||||
|
Phase 2 (데이터): op_log 기반 실제 연산 실행 — SimPy 외부, 병렬 가능
|
||||||
|
```
|
||||||
|
|
||||||
|
본 ADR은 **메모리 연산에 한해 Phase 1을 data-aware로 확장**한다.
|
||||||
|
Phase 1은 latency/BW 병목 분석 + 메모리 데이터 추적,
|
||||||
|
Phase 2는 GEMM/Math 연산 정합성 검증.
|
||||||
|
Phase 2는 optional — 타이밍만 필요하면 Phase 1만 실행.
|
||||||
|
|
||||||
|
### D2. Op Log 기록 — ComponentBase hook
|
||||||
|
|
||||||
|
op_log 기록은 **컴포넌트 베이스 클래스의 hook**으로 수행한다.
|
||||||
|
개별 컴포넌트 구현을 수정하지 않는다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class ComponentBase:
|
||||||
|
def _on_process_start(self, env, msg):
|
||||||
|
if self._op_logger and getattr(msg, 'data_op', False):
|
||||||
|
self._op_logger.record_start(env.now, self.node.id, msg)
|
||||||
|
|
||||||
|
def _on_process_end(self, env, msg):
|
||||||
|
if self._op_logger and getattr(msg, 'data_op', False):
|
||||||
|
self._op_logger.record_end(env.now, self.node.id, msg)
|
||||||
|
```
|
||||||
|
|
||||||
|
`_forward_txn()` 에서 `run()` 전후로 hook을 호출한다.
|
||||||
|
`_op_logger`는 optional — 없으면 오버헤드 제로.
|
||||||
|
|
||||||
|
**hook 시점 정의**:
|
||||||
|
|
||||||
|
| 시점 | 의미 |
|
||||||
|
|------|------|
|
||||||
|
| `t_start` | 컴포넌트가 해당 msg의 **service를 시작**한 시점 (`run()` 진입 직전) |
|
||||||
|
| `t_end` | 컴포넌트의 **내부 service가 완료**된 시점 (`run()` 반환 직후) |
|
||||||
|
|
||||||
|
link traversal latency는 t_start/t_end에 포함되지 않는다.
|
||||||
|
link latency는 발신 컴포넌트의 t_end와 수신 컴포넌트의 t_start 차이로 관측된다.
|
||||||
|
|
||||||
|
### D3. Greenlet 기반 커널 실행 — Phase 0 제거
|
||||||
|
|
||||||
|
기존 Phase 0 (커널 → PeCommand 리스트)를 제거하고,
|
||||||
|
**greenlet**을 사용하여 커널과 SimPy를 협력적으로 interleave 실행한다.
|
||||||
|
|
||||||
|
#### 동작 원리
|
||||||
|
|
||||||
|
greenlet은 협력적 context switch를 제공하는 C 확장이다.
|
||||||
|
커널(child greenlet)이 `tl.load()` 등을 호출하면 SimPy 루프(parent greenlet)로
|
||||||
|
switch하여 타이밍 시뮬레이션을 수행하고, 완료 후 실제 데이터와 함께 커널로 돌아온다.
|
||||||
|
|
||||||
|
```
|
||||||
|
SimPy 루프 (parent greenlet) 커널 (child greenlet)
|
||||||
|
───────────────────────── ──────────────────────
|
||||||
|
g.switch() ─────────────────────────→ 커널 시작
|
||||||
|
a = tl.load(ptr, ...)
|
||||||
|
내부: parent.switch(DmaReadCmd)
|
||||||
|
cmd = DmaReadCmd ←────────────────── (커널 일시정지)
|
||||||
|
yield DmaReadMsg(...)
|
||||||
|
yield env.timeout(dma_latency)
|
||||||
|
data = memory_store.read(...)
|
||||||
|
g.switch(data) ─────────────────────→ (커널 재개)
|
||||||
|
a = data ← 실제 numpy array
|
||||||
|
if a[0][0] > 0.5: ← 분기 가능
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
커널은 **plain Python function**으로 유지된다.
|
||||||
|
greenlet switch는 `tl.load()`, `tl.store()` 등의 **내부 구현에만** 존재한다.
|
||||||
|
|
||||||
|
#### KernelRunner — 프레임워크 레이어
|
||||||
|
|
||||||
|
greenlet 루프는 PE_CPU 컴포넌트가 아니라 프레임워크 레이어인
|
||||||
|
**KernelRunner**에 위치한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# KernelRunner (프레임워크 — greenlet ↔ SimPy 연결)
|
||||||
|
class KernelRunner:
|
||||||
|
def run(self, env, kernel_fn, args, store):
|
||||||
|
g = greenlet(self._run_kernel)
|
||||||
|
cmd = g.switch(kernel_fn, args)
|
||||||
|
|
||||||
|
while cmd is not None:
|
||||||
|
if isinstance(cmd, DmaReadCmd):
|
||||||
|
yield from self._dispatch_dma(env, cmd)
|
||||||
|
data = store.read(cmd.src_space, cmd.src_addr, cmd.shape, cmd.dtype)
|
||||||
|
cmd = g.switch(data) # 실제 데이터와 함께 재개
|
||||||
|
elif isinstance(cmd, GemmCmd):
|
||||||
|
yield from self._dispatch_gemm(env, cmd)
|
||||||
|
cmd = g.switch() # 재개 (데이터 없음)
|
||||||
|
elif isinstance(cmd, DmaWriteCmd):
|
||||||
|
store.write(cmd.dst_space, cmd.dst_addr, cmd.data) # visibility = issue 시점
|
||||||
|
yield from self._dispatch_dma(env, cmd) # timing만 반영
|
||||||
|
cmd = g.switch()
|
||||||
|
|
||||||
|
# PE_CPU (컴포넌트 — 간단하게 유지, greenlet을 모름)
|
||||||
|
def _execute_kernel(self, env):
|
||||||
|
runner = KernelRunner(self.ctx)
|
||||||
|
yield from runner.run(env, kernel_fn, args, store)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Op logging single source of truth**: KernelRunner는 op_log에 직접 기록하지 않는다.
|
||||||
|
모든 op logging은 **ComponentBase hook (_on_process_start/end)만** 담당한다.
|
||||||
|
KernelRunner가 `_dispatch_gemm()` 등으로 컴포넌트에 메시지를 전달하면,
|
||||||
|
컴포넌트 베이스 클래스의 hook이 자동으로 기록한다.
|
||||||
|
|
||||||
|
**레이어 분리**:
|
||||||
|
- **커널 코드**: plain function, greenlet 존재를 모름
|
||||||
|
- **TLContext**: `tl.load()` 내부에서 `parent.switch(cmd)` 호출
|
||||||
|
- **KernelRunner**: greenlet ↔ SimPy 연결, MemoryStore 읽기/쓰기 처리. **logging 안 함**.
|
||||||
|
- **ComponentBase hook**: op_log 기록의 유일한 경로
|
||||||
|
- **PE_CPU**: KernelRunner를 호출만 함, 컴포넌트로서 교체 가능
|
||||||
|
|
||||||
|
#### 메모리 읽기/쓰기 vs 연산의 처리 차이
|
||||||
|
|
||||||
|
| 연산 | Phase 1에서 | Phase 2에서 |
|
||||||
|
|------|------------|------------|
|
||||||
|
| `tl.load()` | SimPy 타이밍 + MemoryStore read → **실제 데이터 반환** | — |
|
||||||
|
| `tl.store()` | SimPy 타이밍 + MemoryStore write → **실제 기록** | — |
|
||||||
|
| `tl.composite(gemm)` | SimPy 타이밍 + **op_log 기록만** | numpy 실제 연산 |
|
||||||
|
| `tl.dot()` / math ops | SimPy 타이밍 + **op_log 기록만** | numpy 실제 연산 |
|
||||||
|
|
||||||
|
메모리 읽기/쓰기는 Phase 1에서 즉시 처리 (numpy slice, 빠름).
|
||||||
|
GEMM/Math 연산은 Phase 2에서 batch 실행 (성능 분리).
|
||||||
|
|
||||||
|
#### Store Visibility Rule
|
||||||
|
|
||||||
|
`tl.store()`는 **issue 시점에 MemoryStore에 즉시 반영**된다 (visibility = issue).
|
||||||
|
SimPy DMA 타이밍은 이후 별도로 시뮬레이션된다.
|
||||||
|
|
||||||
|
이는 timing과 visibility를 의도적으로 분리한 것이다:
|
||||||
|
- **visibility**: MemoryStore에 반영되는 시점 = `store.write()` 호출 시
|
||||||
|
- **timing**: SimPy에서 DMA latency가 완료되는 시점
|
||||||
|
|
||||||
|
이 분리로 dynamic control flow에서 store 직후 load가 최신 데이터를 볼 수 있다.
|
||||||
|
|
||||||
|
#### Result Handle Semantics
|
||||||
|
|
||||||
|
`tl.composite()`(sync/async)는 결과 tensor를 참조하는 **handle**을 반환한다.
|
||||||
|
|
||||||
|
Phase 1에서의 핵심 계약:
|
||||||
|
|
||||||
|
1. **모든 compute handle은 Phase 1에서 항상 pending 상태로 간주한다.**
|
||||||
|
2. `tl.wait(handle)`은 **timing synchronization만 표현**하며,
|
||||||
|
handle을 ready로 만들지 않는다.
|
||||||
|
3. handle의 실제 결과 데이터 접근(`handle.data`, element access,
|
||||||
|
numpy conversion 등)은 **Phase 2에서만 가능**하다.
|
||||||
|
4. 따라서 Phase 1에서 **compute-result 기반 control flow는 지원하지 않는다.**
|
||||||
|
5. 반면 `tl.load()`는 Phase 1에서 실제 데이터를 반환하므로,
|
||||||
|
**memory-read 기반 control flow는 지원 가능**하다.
|
||||||
|
|
||||||
|
| handle 상태 | Phase | 허용 동작 |
|
||||||
|
|------------|-------|----------|
|
||||||
|
| pending | Phase 1 | `tl.wait(handle)` — timing 동기화만 |
|
||||||
|
| pending | Phase 1 | handle을 `tl.store()`의 대상으로 전달 (logical destination 연결만, payload는 Phase 2) |
|
||||||
|
| pending | Phase 1 | **데이터 접근 불가** — 값 기반 분기 불가 |
|
||||||
|
| ready | Phase 2 | 실제 numpy 데이터 접근, 검증 |
|
||||||
|
|
||||||
|
이 제약은 의도적이다. Phase 1에서 연산을 실행하면 SimPy single-thread가
|
||||||
|
block되어 2-pass 분리의 존재 이유가 사라진다.
|
||||||
|
|
||||||
|
#### Phase 1 Materialization — Future Extension
|
||||||
|
|
||||||
|
향후 소형 연산(scalar, 작은 reduction)에 대해 Phase 1 eager execution이
|
||||||
|
필요한 경우, `materialized_in_phase1: bool` 플래그를 op record에 추가하여
|
||||||
|
선택적 materialization을 지원할 수 있다. 현재 범위에서는 구현하지 않는다.
|
||||||
|
|
||||||
|
### D4. data_op 플래그 — 메시지 자기 선언
|
||||||
|
|
||||||
|
로깅 대상은 메시지 타입이 아니라 메시지 인스턴스의 `data_op` 속성으로 결정한다.
|
||||||
|
프레임워크가 메시지 타입을 하드코딩하지 않는다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MsgBase:
|
||||||
|
data_op: bool = False # 기본: 로깅 안 함
|
||||||
|
|
||||||
|
class DmaReadCmd(MsgBase):
|
||||||
|
data_op = True # 메모리 이동 → 로깅
|
||||||
|
|
||||||
|
class GemmCmd(MsgBase):
|
||||||
|
data_op = True # 연산 → 로깅
|
||||||
|
|
||||||
|
class MathCmd(MsgBase):
|
||||||
|
data_op = True # 연산 → 로깅
|
||||||
|
```
|
||||||
|
|
||||||
|
새 메시지 타입(예: IpcqMsg) 추가 시 `data_op = True`만 설정하면
|
||||||
|
프레임워크 코드 수정 없이 자동 로깅된다.
|
||||||
|
|
||||||
|
### D5. Op Log 구조
|
||||||
|
|
||||||
|
#### op 분류 체계
|
||||||
|
|
||||||
|
2단계로 분류한다:
|
||||||
|
|
||||||
|
| 레벨 | 필드 | 역할 |
|
||||||
|
|------|------|------|
|
||||||
|
| `op_kind` | `memory` \| `gemm` \| `math` | executor dispatch 기준 |
|
||||||
|
| `op_name` | `dma_read` \| `dma_write` \| `gemm_f16` \| `exp` \| `add` \| `sum` 등 | 구체 연산 식별 |
|
||||||
|
|
||||||
|
#### OpRecord 정의
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class OpRecord:
|
||||||
|
t_start: float # SimPy 시각 (ns) — service 시작
|
||||||
|
t_end: float # SimPy 시각 (ns) — service 완료
|
||||||
|
component_id: str # e.g. "sip0.cube0.pe0.pe_gemm"
|
||||||
|
op_kind: str # "memory" | "gemm" | "math"
|
||||||
|
op_name: str # 구체 연산명
|
||||||
|
params: dict # 연산별 파라미터 (아래 참조)
|
||||||
|
dependency_ids: list[int] # 현재는 in-memory record index 기반, 향후 stable op_id로 대체 가능
|
||||||
|
```
|
||||||
|
|
||||||
|
#### dependency_ids 생성 규칙
|
||||||
|
|
||||||
|
`dependency_ids`는 **optional**이며, 기본적으로 executor는
|
||||||
|
주소 기반 dependency 추론을 수행한다 (D6 참조).
|
||||||
|
|
||||||
|
정확한 실행 순서가 필요한 경우에만 명시적으로 설정한다:
|
||||||
|
- **기본 (address-based inference)**: executor가 read/write set을 분석하여
|
||||||
|
RAW/WAW/WAR 의존성을 자동 추론. 대부분의 경우 이것으로 충분.
|
||||||
|
- **명시적 설정**: TLContext 또는 command 생성 단계에서 logical dependency가
|
||||||
|
주소로 표현되지 않는 경우에 설정.
|
||||||
|
예: completion handle 기반 동기화 — handle dependency는 메모리 주소가 아니라
|
||||||
|
논리적 완료 순서에 의존하므로 address inference로 잡히지 않는다.
|
||||||
|
|
||||||
|
#### op_log ordering
|
||||||
|
|
||||||
|
op_log는 `t_start` 기준으로 **stable ordering**을 유지한다.
|
||||||
|
동일 `t_start`의 record들은 insertion order를 보존한다.
|
||||||
|
|
||||||
|
#### params 상세
|
||||||
|
|
||||||
|
**memory (dma_read / dma_write)**:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"src_addr": int, # source 주소 (byte)
|
||||||
|
"dst_addr": int, # destination 주소 (byte)
|
||||||
|
"nbytes": int, # 전송 크기
|
||||||
|
"src_space": str, # "hbm" | "tcm" | "sram"
|
||||||
|
"dst_space": str, # "hbm" | "tcm" | "sram"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**gemm**:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"src_a_addr": int, # operand A 주소
|
||||||
|
"src_b_addr": int, # operand B 주소
|
||||||
|
"dst_addr": int, # output 주소
|
||||||
|
"shape_a": tuple, # e.g. (128, 256)
|
||||||
|
"shape_b": tuple, # e.g. (256, 128)
|
||||||
|
"shape_out": tuple, # e.g. (128, 128)
|
||||||
|
"dtype_in": str, # e.g. "f16"
|
||||||
|
"dtype_acc": str, # accumulation dtype, e.g. "f32"
|
||||||
|
"dtype_out": str, # output dtype, e.g. "f16"
|
||||||
|
"transpose_a": bool,
|
||||||
|
"transpose_b": bool,
|
||||||
|
"layout_a": str, # "row_major" | "col_major"
|
||||||
|
"layout_b": str,
|
||||||
|
"layout_out": str,
|
||||||
|
"addr_space": str, # "tcm" (GEMM operand는 항상 TCM)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**math**:
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"op": str, # "exp" | "add" | "sum" | "where" | ...
|
||||||
|
"input_addrs": list[int], # operand 주소 목록
|
||||||
|
"input_shapes": list[tuple],
|
||||||
|
"dst_addr": int,
|
||||||
|
"shape_out": tuple,
|
||||||
|
"dtype": str,
|
||||||
|
"axis": int | None, # reduction axis
|
||||||
|
"addr_space": str, # "tcm"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### D6. Phase 2 Executor
|
||||||
|
|
||||||
|
Phase 2는 SimPy 밖에서 op_log를 실행한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class DataExecutor:
|
||||||
|
def __init__(self, op_log: list[OpRecord], initial_store: MemoryStore):
|
||||||
|
self.store = initial_store # Phase 1의 MemoryStore snapshot을 입력으로 받는다
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
for t, ops in groupby(op_log, key=lambda o: o.t_start):
|
||||||
|
batch = list(ops)
|
||||||
|
independent, sequential = self._classify(batch)
|
||||||
|
self._execute_parallel(independent)
|
||||||
|
self._execute_sequential(sequential)
|
||||||
|
```
|
||||||
|
|
||||||
|
**병렬 실행 판정**:
|
||||||
|
|
||||||
|
같은 `t_start`의 op들은 **병렬 후보**로 간주한다.
|
||||||
|
실제 병렬 실행 여부는 executor가 다음 기준으로 판정한다:
|
||||||
|
- read/write 주소 범위 겹침 여부 (WAW, RAW, WAR 충돌 검사)
|
||||||
|
- `dependency_ids`에 명시된 선행 op 완료 여부
|
||||||
|
|
||||||
|
주소 범위가 겹치지 않고 명시적 의존성이 없는 op들만 병렬 실행한다.
|
||||||
|
|
||||||
|
**배치 최적화**: 동일 op_name이며 **shape, dtype, layout, transpose flag가
|
||||||
|
모두 동일한** 독립 op들만 batching 대상이 된다.
|
||||||
|
예: 여러 PE의 동일 shape GEMM → `np.matmul(a_batch, b_batch)` 한 번으로 묶음.
|
||||||
|
CPU에서도 BLAS 효율 향상, GPU에서는 launch overhead 절감.
|
||||||
|
|
||||||
|
**Phase 2 실행 순서 보장**:
|
||||||
|
|
||||||
|
Phase 2는 데이터 도착 시점을 고려하지 않으며,
|
||||||
|
dependency (주소 기반 추론 + 명시적 dependency_ids)를 통해서만
|
||||||
|
실행 순서를 보장한다.
|
||||||
|
|
||||||
|
### D7. Memory Store
|
||||||
|
|
||||||
|
`MemoryStore`는 논리적으로 byte-addressable semantics를 따르며,
|
||||||
|
현재 구현은 **tensor-granular storage** (addr → numpy ndarray 매핑)를 사용한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MemoryStore:
|
||||||
|
def write(self, space: str, addr: int, data: np.ndarray) -> None: ...
|
||||||
|
def read(self, space: str, addr: int, shape: tuple, dtype: str) -> np.ndarray: ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**내부 저장 포맷: numpy ndarray**
|
||||||
|
|
||||||
|
MemoryStore는 텐서를 **numpy ndarray**로 저장한다.
|
||||||
|
|
||||||
|
| 후보 | store/load 속도 | Phase 2 연산 | 판정 |
|
||||||
|
|------|----------------|-------------|------|
|
||||||
|
| **numpy ndarray** | 즉시 (참조 전달, 복사 없음) | `np.matmul` 바로 사용 | **채택** |
|
||||||
|
| bytearray | memcpy 필요 | `np.frombuffer` 변환 필요 | 탈락 |
|
||||||
|
| torch tensor | 즉시 | torch 연산 가능 | GPU 최적화 시만 사용 |
|
||||||
|
|
||||||
|
- write: numpy array를 **참조 저장** (복사 없음) → Phase 1 오버헤드 = dict lookup 1회
|
||||||
|
- read: numpy array를 **참조 반환** (복사 없음)
|
||||||
|
- 동일 addr에 재 write 시 기존 array를 **tensor 단위로 덮어쓴다** (partial overwrite 미지원)
|
||||||
|
- dtype은 numpy dtype 사용 (`np.float16`, `np.float32` 등). bf16은 numpy에 native dtype이 없으므로 `ml_dtypes.bfloat16` 등 확장 dtype이 필요
|
||||||
|
- byte-level access가 필요한 경우 `.view(np.uint8)` 로 변환
|
||||||
|
- Phase 2에서 GPU batch 최적화 시 numpy → torch tensor 변환은 executor가 담당
|
||||||
|
|
||||||
|
**read/write contract**:
|
||||||
|
|
||||||
|
- read/write는 **contiguous tensor** 기준이다.
|
||||||
|
non-contiguous stride view가 필요한 경우 별도 copy op으로 표현한다.
|
||||||
|
- 일반 benchmark path에서는 producer/consumer dtype 일치를 기대한다.
|
||||||
|
reinterpret cast는 low-level memory validation 또는 특수 테스트 케이스를 위한
|
||||||
|
permissive behavior이다.
|
||||||
|
- addr은 byte-aligned이며, 최소 alignment = dtype 크기.
|
||||||
|
- dtype mismatch (write와 다른 dtype으로 read)는 reinterpret cast로 처리한다.
|
||||||
|
shape 불일치 시 nbytes 기준으로 검증하고, 불일치하면 error.
|
||||||
|
- 정합성 기준은 주소 범위 기반 read/write semantics를 따른다.
|
||||||
|
- 구현 최적화로 tensor object cache를 둘 수 있지만,
|
||||||
|
canonical state는 byte-addressable storage이다.
|
||||||
|
- deploy 시점에 호스트가 초기 텐서 데이터를 주입한다.
|
||||||
|
|
||||||
|
### D8. 벤치마크 커널 코드
|
||||||
|
|
||||||
|
벤치마크의 **사용자 코드 API는 변경하지 않는다**.
|
||||||
|
`tl.load()`, `tl.composite()`, `tl.store()` 등의 호출 인터페이스는 유지.
|
||||||
|
|
||||||
|
단, 내부 command/message schema는 Phase 2 실행에 필요한 metadata를
|
||||||
|
포함하도록 확장될 수 있다 (예: dtype_acc, transpose 등 추가 필드).
|
||||||
|
|
||||||
|
### D9. 컴포넌트 변경 없음
|
||||||
|
|
||||||
|
개별 컴포넌트 구현(PE_GEMM, PE_DMA, HBM_CTRL 등)은 수정하지 않는다.
|
||||||
|
op_log 기록은 ComponentBase hook의 책임이다.
|
||||||
|
커스텀 컴포넌트 교체 시 타이밍 모델만 교체되며,
|
||||||
|
Phase 2 데이터 실행은 영향받지 않는다.
|
||||||
|
|
||||||
|
### D10. Phase 2는 Optional
|
||||||
|
|
||||||
|
```python
|
||||||
|
engine = GraphEngine(graph)
|
||||||
|
engine.run(benchmark) # Phase 1: 타이밍만
|
||||||
|
result = engine.get_timing_result()
|
||||||
|
|
||||||
|
if verify_data:
|
||||||
|
executor = DataExecutor(engine.op_log) # Phase 2: 데이터
|
||||||
|
executor.run()
|
||||||
|
executor.verify(expected_output)
|
||||||
|
```
|
||||||
|
|
||||||
|
타이밍 분석만 필요하면 Phase 2를 건너뛴다.
|
||||||
|
op_logger를 비활성화하면 Phase 1 성능도 기존과 동일.
|
||||||
|
|
||||||
|
### D11. Verification Contract
|
||||||
|
|
||||||
|
기본 검증은 **최종 output tensor**를 reference backend(numpy)와 비교한다.
|
||||||
|
|
||||||
|
dtype별 tolerance 정책:
|
||||||
|
|
||||||
|
| dtype | 비교 방식 | tolerance |
|
||||||
|
|-------|----------|-----------|
|
||||||
|
| f32 | `np.allclose` | rtol=1e-5, atol=1e-5 |
|
||||||
|
| f16 | `np.allclose` | rtol=1e-3, atol=1e-3 |
|
||||||
|
| bf16 | `np.allclose` | rtol=1e-2, atol=1e-2 |
|
||||||
|
| int 계열 | `np.array_equal` | exact |
|
||||||
|
|
||||||
|
- 기본 모드: 최종 output만 비교 (end-to-end correctness)
|
||||||
|
- 디버그 모드: intermediate tensor도 op 단위로 비교 가능
|
||||||
|
(MemoryStore snapshot at each op boundary)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **Compute-result-based control flow**: 지원하지 않는다.
|
||||||
|
모든 compute handle은 Phase 1에서 pending 상태이며,
|
||||||
|
`wait()`는 timing synchronization만 표현하고 data readiness를 의미하지 않는다.
|
||||||
|
Phase 1에서 `handle.data` 접근, element access, truth-value evaluation은
|
||||||
|
**error로 처리**한다.
|
||||||
|
메모리 데이터 기반 분기(`tl.load()` 결과)는 greenlet으로 지원된다.
|
||||||
|
Phase 1 materialization은 future extension (D3 참조).
|
||||||
|
- **Cycle-accurate overlap reconstruction**: Phase 2에서 Phase 1의 실행 시간
|
||||||
|
overlap을 정확히 재현하지 않는다. Phase 2는 데이터 정합성만 검증한다.
|
||||||
|
- **GPU kernel compilation**: Phase 2의 GEMM/Math는 numpy/torch 호출이며,
|
||||||
|
실제 하드웨어 PE의 마이크로아키텍처를 재현하지 않는다.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- **Aliasing / slice view**: 동일 backing storage를 참조하는 slice/view를
|
||||||
|
MemoryStore에서 어떻게 표현할지 (stride-based view vs copy semantics)
|
||||||
|
- **IPCQ/descriptor read 일반화**: PE-to-PE 통신을 memory op으로 완전히
|
||||||
|
일반화할지, 별도 op_kind를 둘지
|
||||||
|
- **Op log streaming**: 대규모 시뮬레이션에서 op_log 메모리 사용량 관리
|
||||||
|
(in-memory list vs disk-backed streaming)
|
||||||
|
- **Fused operation**: tl.composite의 tiled pipeline (READ→COMPUTE→WRITE)을
|
||||||
|
하나의 fused op record로 기록할지, 개별 op으로 분리할지
|
||||||
|
- **Math op schema 일반화**: 현재 math params는 단순 구조이나,
|
||||||
|
broadcasting rule, input별 dtype, keepdims, scalar/immediate operand,
|
||||||
|
where/mask 표현 등 일반화가 필요할 수 있음
|
||||||
|
- **Op record 식별자**: 현재 dependency_ids는 in-memory list index 기반이며,
|
||||||
|
streaming/disk-backed mode 도입 시 stable op_id로 대체 필요
|
||||||
|
- **Phase 1 materialization policy**: D3의 Future Extension 참조.
|
||||||
|
허용 시 해당 op의 Phase 2 처리 방식 (skip / verify / recompute) 정의 필요
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### 긍정적
|
||||||
|
|
||||||
|
- SimPy 시뮬레이션 성능 영향 최소 (op_log append만 추가)
|
||||||
|
- Phase 2에서 멀티스레드/GPU 자유롭게 사용 가능
|
||||||
|
- 컴포넌트 교체 자유도 유지 (ADR-0015 설계 철학 보존)
|
||||||
|
- 벤치마크 사용자 코드 API 변경 불필요
|
||||||
|
- 새 메시지 타입 추가 시 data_op 플래그만 설정
|
||||||
|
- greenlet으로 Phase 0 제거 — 메모리 데이터 기반 dynamic control flow 지원
|
||||||
|
- `tl.load()`가 실제 데이터를 반환하므로 커널 디버깅 용이
|
||||||
|
|
||||||
|
### 부정적
|
||||||
|
|
||||||
|
- op_log 메모리 사용량 (대규모 시뮬레이션 시)
|
||||||
|
- Phase 2 실행 시간은 텐서 크기에 비례 (대형 GEMM)
|
||||||
|
- pending handle (연산 미완료) 기반 동적 분기 불가
|
||||||
|
(연산은 Phase 2에서 실행, Phase 1에서 결과 값 미확정).
|
||||||
|
메모리 데이터 기반 분기는 greenlet으로 지원된다.
|
||||||
|
- greenlet C 확장 의존성 추가 (pip install greenlet)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 영향받는 파일
|
||||||
|
|
||||||
|
| 파일 | 변경 |
|
||||||
|
|------|------|
|
||||||
|
| `src/kernbench/components/base.py` | `_on_process_start/end` hook 추가 |
|
||||||
|
| `src/kernbench/common/pe_commands.py` | `data_op = True` 추가, metadata 필드 확장 |
|
||||||
|
| `src/kernbench/sim_engine/op_log.py` | 신규: OpRecord, OpLogger |
|
||||||
|
| `src/kernbench/sim_engine/data_executor.py` | 신규: DataExecutor, MemoryStore |
|
||||||
|
| `src/kernbench/sim_engine/engine.py` | op_logger 주입 (optional) |
|
||||||
|
| `src/kernbench/triton_emu/tl_context.py` | `tl.load()` 등 내부에서 greenlet switch 호출 |
|
||||||
|
| `src/kernbench/triton_emu/kernel_runner.py` | 신규: KernelRunner (greenlet ↔ SimPy 연결) |
|
||||||
|
| `src/kernbench/components/builtin/pe_cpu.py` | Phase 0 제거, KernelRunner 호출로 변경 |
|
||||||
|
| `pyproject.toml` | greenlet 의존성 추가 |
|
||||||
|
|
||||||
|
컴포넌트 구현 파일 (pe_gemm.py, pe_dma.py, hbm_ctrl.py 등): **변경 없음**
|
||||||
|
벤치마크 커널 (benches/*.py): **사용자 API 변경 없음**
|
||||||
@@ -0,0 +1,537 @@
|
|||||||
|
# ADR-0021: PE Pipeline Refactoring — Component Separation + Scheduler-Based Routing
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### Problems with the Current Structure
|
||||||
|
|
||||||
|
pe_accel (SchedulerV2Component) hides 5 hardware blocks (DmaIn, DmaWb, Gemm, Math, Tcm)
|
||||||
|
**inside a single component**.
|
||||||
|
|
||||||
|
```
|
||||||
|
SchedulerV2Component (single topology node)
|
||||||
|
├── DmaInBlock ← directly connected via internal SimPy Store
|
||||||
|
├── DmaWbBlock ← not visible in topology
|
||||||
|
├── GemmBlock ← not replaceable
|
||||||
|
├── MathBlock ← not replaceable
|
||||||
|
└── TcmBlock ← not replaceable
|
||||||
|
```
|
||||||
|
|
||||||
|
Problems:
|
||||||
|
- Blocks directly reference the next block via `desc.next_block` — hardcoded routing
|
||||||
|
- Individual blocks cannot be replaced (violates ADR-0015 component replacement principle)
|
||||||
|
- PE internal structure is not visible in the topology
|
||||||
|
- GemmBlock and MathBlock each duplicate TCM load/store logic
|
||||||
|
|
||||||
|
### Actual Hardware Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
HBM ←(DMA)→ TCM ←(Fetch/Store Unit)→ Register File ←→ GEMM/MATH Engine
|
||||||
|
```
|
||||||
|
|
||||||
|
- DMA: HBM ↔ TCM transfer (via fabric, tens to hundreds of ns)
|
||||||
|
- Fetch/Store Unit: TCM ↔ Register File transfer (BW-based, a few ns)
|
||||||
|
- GEMM/MATH Engine: computation between Register Files (cycle-accurate)
|
||||||
|
- Completion signal: PE-internal 1-cycle wire signal (done pin assert)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Separate Each Block into an Independent Component
|
||||||
|
|
||||||
|
The internal blocks of pe_accel are separated into **independent PeEngineBase components**.
|
||||||
|
Existing 5 blocks + 1 Fetch/Store Unit = 6 components.
|
||||||
|
|
||||||
|
| Component | Role | HW Correspondence |
|
||||||
|
|-----------|------|-------------------|
|
||||||
|
| PE_SCHEDULER | Plan generation, tile state management, stage routing | Scheduler/Sequencer |
|
||||||
|
| PE_DMA | HBM ↔ TCM (via fabric) | DMA Engine |
|
||||||
|
| PE_FETCH_STORE | TCM ↔ Register File | Load/Store Unit |
|
||||||
|
| PE_GEMM | MAC compute (register only) | MAC Array |
|
||||||
|
| PE_MATH | Element-wise/reduction (register only) | SIMD/Vector Unit |
|
||||||
|
| PE_TCM | BW-serialized scratchpad | SRAM Bank |
|
||||||
|
|
||||||
|
Each component exists as a topology node and is connected via ports/wires.
|
||||||
|
Replacing the `impl` allows changing the timing model of an individual block.
|
||||||
|
|
||||||
|
### D2. Token Self-Routing — Scheduler Handles Only Dispatch + Completion
|
||||||
|
|
||||||
|
**Components do not pass through the scheduler at every stage.**
|
||||||
|
The token carries a plan so that components chain directly to the next stage.
|
||||||
|
|
||||||
|
```
|
||||||
|
Scheduler → DMA → Fetch → GEMM → Math → Store → DMA_WB → (done) → Scheduler
|
||||||
|
↑ chaining (DMA → … → DMA_WB): does not go through the scheduler; only completion (done) returns to it
|
||||||
|
```
|
||||||
|
|
||||||
|
This matches the actual HW structure where each block's done signal is directly
|
||||||
|
connected to the next block via wire. The scheduler is responsible **only for
|
||||||
|
initial dispatch + completion aggregation**.
|
||||||
|
|
||||||
|
#### Stage Definition
|
||||||
|
|
||||||
|
```python
|
||||||
|
class StageType(Enum):
|
||||||
|
DMA_READ = 0
|
||||||
|
FETCH = 1
|
||||||
|
GEMM = 2
|
||||||
|
MATH = 3
|
||||||
|
STORE = 4
|
||||||
|
DMA_WRITE = 5
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Plan Structure
|
||||||
|
|
||||||
|
When the scheduler receives a CompositeCmd, it generates a **per-tile execution plan**.
|
||||||
|
The plan defines the **stage sequence** for each tile:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class Stage:
|
||||||
|
stage_type: StageType
|
||||||
|
component: str # topology node ID (e.g. "sip0.cube0.pe0.pe_dma")
|
||||||
|
params: dict # per-stage parameters (dynamic)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class TilePlan:
|
||||||
|
tile_id: int
|
||||||
|
stages: tuple[Stage, ...] # list of stages to execute in order (immutable)
|
||||||
|
```
|
||||||
|
|
||||||
|
The stage sequence varies depending on the plan:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Normal GEMM: HBM → TCM → Register → Compute → Register → TCM → HBM
|
||||||
|
stages = (DMA_READ, FETCH, GEMM, STORE, DMA_WRITE)
|
||||||
|
|
||||||
|
# GEMM directly from TCM data (skip DMA read):
|
||||||
|
stages = (FETCH, GEMM, STORE, DMA_WRITE)
|
||||||
|
|
||||||
|
# MATH element-wise:
|
||||||
|
stages = (DMA_READ, FETCH, MATH, STORE, DMA_WRITE)
|
||||||
|
|
||||||
|
# GEMM + accumulation (intermediate K-tile, skip writeback):
|
||||||
|
stages = (DMA_READ, FETCH, GEMM, STORE) # store to TCM only
|
||||||
|
```
|
||||||
|
|
||||||
|
**Components do not hardcode the next component.**
|
||||||
|
They read the next stage from the token's plan and forward it directly via out_port.
|
||||||
|
This is the same pattern as a network packet carrying a routing header.
|
||||||
|
|
||||||
|
#### Pipeline Context
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class PipelineContext:
|
||||||
|
id: str
|
||||||
|
total_tiles: int
|
||||||
|
completed_tiles: int = 0
|
||||||
|
done_event: simpy.Event = None # succeeds when all tiles are complete
|
||||||
|
|
||||||
|
def complete_tile(self) -> None:
|
||||||
|
self.completed_tiles += 1
|
||||||
|
if self.completed_tiles == self.total_tiles:
|
||||||
|
self.done_event.succeed()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Completion follows an exactly-once contract**: the last stage of each tile must call
|
||||||
|
`complete_tile()` exactly once. Duplicate calls are a bug, and `done_event` must
|
||||||
|
succeed only once (SimPy Event constraint).
|
||||||
|
|
||||||
|
#### Scheduler Role (Reduced)
|
||||||
|
|
||||||
|
When the scheduler receives a CompositeCmd, it creates a plan and PipelineContext,
|
||||||
|
enqueues them into the scheduler's internal `_pending_feeds` FIFO, and returns immediately.
|
||||||
|
|
||||||
|
Actual tile injection is handled by a **single feeder process** (`_feed_loop`).
|
||||||
|
This feeder consumes `_pending_feeds` in FIFO order and
|
||||||
|
**does not allow tile feed interleaving across composite commands.**
|
||||||
|
That is, the feed for the next command begins only after all tiles of the current
|
||||||
|
command have been injected into the first stage queue.
|
||||||
|
|
||||||
|
There is **exactly one `_feed_loop`** per scheduler, and
|
||||||
|
tile feed for composite commands is performed exclusively through this single process.
|
||||||
|
Command issue order refers to **the order in which PE_SCHEDULER receives PeInternalTxn**.
|
||||||
|
|
||||||
|
This structure maintains command issue order while ensuring that when the first stage
|
||||||
|
queue is full, only the feeder process blocks — the scheduler worker's inbox processing
|
||||||
|
itself does not stall.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PeSchedulerV2(PeEngineBase):
|
||||||
|
_pipelines: dict[str, PipelineContext]
|
||||||
|
_pending_feeds: simpy.Store # FIFO of (plan, ctx)
|
||||||
|
|
||||||
|
def start(self, env):
|
||||||
|
super().start(env)
|
||||||
|
self._pending_feeds = simpy.Store(env)
|
||||||
|
env.process(self._feed_loop(env))
|
||||||
|
|
||||||
|
def _dispatch_composite(self, env, pe_txn, cmd):
|
||||||
|
plan = generate_plan(cmd)
|
||||||
|
ctx = PipelineContext(
|
||||||
|
id=next_id(),
|
||||||
|
total_tiles=len(plan.tiles),
|
||||||
|
done_event=pe_txn.done,
|
||||||
|
)
|
||||||
|
self._pipelines[ctx.id] = ctx
|
||||||
|
|
||||||
|
# only enqueue to feeder queue and return immediately
|
||||||
|
yield self._pending_feeds.put((plan, ctx))
|
||||||
|
|
||||||
|
def _feed_loop(self, env):
|
||||||
|
"""Single feeder process: feeds composite commands in FIFO order.
|
||||||
|
|
||||||
|
Tile feed interleaving across composite commands is not allowed.
|
||||||
|
The feed for the next command begins only after all tiles of the
|
||||||
|
current command have been injected into the first stage queue.
|
||||||
|
|
||||||
|
When the first stage queue is full, only this feeder blocks;
|
||||||
|
the scheduler worker's inbox processing does not stall.
|
||||||
|
"""
|
||||||
|
while True:
|
||||||
|
plan, ctx = yield self._pending_feeds.get()
|
||||||
|
for tile in plan.tiles:
|
||||||
|
token = TileToken(
|
||||||
|
tile_id=tile.tile_id,
|
||||||
|
pipeline_ctx=ctx,
|
||||||
|
plan=tile,
|
||||||
|
stage_idx=0,
|
||||||
|
params=tile.stages[0].params,
|
||||||
|
)
|
||||||
|
yield self.out_ports[tile.stages[0].component].put(token)
|
||||||
|
# queue capacity = HW queue depth → feeder blocks only when full
|
||||||
|
```
|
||||||
|
|
||||||
|
In this ADR, the scheduler can accept multiple composite commands,
|
||||||
|
but tile submission order follows per-command FIFO.
|
||||||
|
Within a command, tile-level pipeline overlap is allowed,
|
||||||
|
but tile feed interleaving across commands is not.
|
||||||
|
|
||||||
|
### D3. Data Transfer vs. Completion Signal — HW Modeling Criteria
|
||||||
|
|
||||||
|
| Communication Type | Method | HW Correspondence |
|
||||||
|
|-------------------|--------|-------------------|
|
||||||
|
| Tile token (work directive) | message via out_port | enqueue to command queue |
|
||||||
|
| Stage completion → next stage | component directly calls out_port.put | done-triggered local enqueue |
|
||||||
|
| Pipeline completion → scheduler | PipelineContext.complete_tile() | completion interrupt |
|
||||||
|
|
||||||
|
**Tile token**: uses out_port.put(). SimPy Store capacity = HW queue depth.
|
||||||
|
|
||||||
|
**Intra-PE chaining latency**: within the scope of this ADR, no explicit latency model
|
||||||
|
is applied to intra-PE stage triggers. Chaining between components corresponds to
|
||||||
|
PE-internal wires, and since there is no scheduler round-trip, no artificial hop cost
|
||||||
|
is incurred.
|
||||||
|
|
||||||
|
**Pipeline completion**: the component at the last stage calls `pipeline_ctx.complete_tile()`.
|
||||||
|
When all tiles are complete, PipelineContext calls done_event.succeed().
|
||||||
|
|
||||||
|
### D4. Asynchronous Pipeline — Natural Overlap
|
||||||
|
|
||||||
|
The scheduler processes CompositeCmds **asynchronously**.
|
||||||
|
However, tile feed does not spawn an independent process per command; instead,
|
||||||
|
the scheduler's internal **single feeder process** performs the feed in FIFO order.
|
||||||
|
Therefore, the scheduler can continue to receive the next command,
|
||||||
|
but the first-stage tile injection order is guaranteed per command.
|
||||||
|
|
||||||
|
Since **SimPy Store capacity = HW queue depth**:
|
||||||
|
- When the queue is full, put() naturally blocks (backpressure)
|
||||||
|
- While DMA is processing one tile, downstream stages (FETCH → GEMM) can already start on a tile whose DMA has completed
|
||||||
|
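- When a second CompositeCmd arrives, it is accepted immediately and enqueued in `_pending_feeds`; its tiles enter the DMA queue only after all tiles of the first command have been fed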
|
||||||
|
|
||||||
|
```
|
||||||
|
First-stage feed order (feeder → DMA queue):
|
||||||
|
[cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN] | [cmd2:t0][cmd2:t1]...
|
||||||
|
↑ cmd2 starts after cmd1 feed completes
|
||||||
|
|
||||||
|
Runtime pipeline (downstream overlap):
|
||||||
|
PE_DMA: [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN][cmd2:t0][cmd2:t1]...
|
||||||
|
PE_FETCH: [cmd1:t0][cmd1:t1]...
|
||||||
|
PE_GEMM: [cmd1:t0][cmd1:t1]...
|
||||||
|
↑ pipeline overlap within the same command
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, the overlap does not come from tile feed interleaving across different commands,
|
||||||
|
but occurs naturally as tiles from earlier commands progress to downstream stages
|
||||||
|
while the feeder continues injecting subsequent tiles.
|
||||||
|
|
||||||
|
For example, tile feed for cmd2 does not start until all tiles of cmd1 have been
|
||||||
|
injected into the first stage queue. However, while cmd1.tile0 has already progressed
|
||||||
|
to GEMM, cmd1.tile1 and cmd1.tile2 may still remain in DMA/FETCH, so
|
||||||
|
**pipeline overlap within the same command occurs naturally**.
|
||||||
|
|
||||||
|
#### Component Chaining Pattern
|
||||||
|
|
||||||
|
All components follow the same pattern:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _pipeline_worker(self, env):
|
||||||
|
while True:
|
||||||
|
token = yield self._inbox.get()
|
||||||
|
|
||||||
|
# process own stage
|
||||||
|
yield from self._process(env, token)
|
||||||
|
|
||||||
|
# chain to next stage (read from plan)
|
||||||
|
next_idx = token.stage_idx + 1
|
||||||
|
if next_idx < len(token.plan.stages):
|
||||||
|
next_stage = token.plan.stages[next_idx]
|
||||||
|
token.stage_idx = next_idx
|
||||||
|
token.params = next_stage.params
|
||||||
|
yield self.out_ports[next_stage.component].put(token)
|
||||||
|
else:
|
||||||
|
# last stage — pipeline completion
|
||||||
|
token.pipeline_ctx.complete_tile()
|
||||||
|
```
|
||||||
|
|
||||||
|
### D5. PE_FETCH_STORE — Dedicated TCM ↔ Register File Transfer
|
||||||
|
|
||||||
|
Previously, GemmBlock and MathBlock each implemented their own TCM read/write.
|
||||||
|
This is separated into a **PE_FETCH_STORE component**.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# PE_FETCH_STORE._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
|
||||||
|
yield tcm_done
|
||||||
|
# chaining is handled by the base class (D4 pattern)
|
||||||
|
```
|
||||||
|
|
||||||
|
Advantages:
|
||||||
|
- GEMM/MATH perform **pure compute only** — no TCM access logic
|
||||||
|
- Fetch/store BW contention is naturally modeled (serialization via PE_TCM resource)
|
||||||
|
- Prefetch strategies can be experimented with by replacing the fetch unit alone
|
||||||
|
|
||||||
|
### D6. Simplification of Each Compute Component
|
||||||
|
|
||||||
|
GEMM/MATH only compute on register data that has already been prepared by the fetch stage.
|
||||||
|
**Chaining follows the common pattern (D4), so only _process() needs to be implemented:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# PE_GEMM._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield env.timeout(self._mac_latency(token.params))
|
||||||
|
|
||||||
|
# PE_MATH._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield env.timeout(self._simd_latency(token.params))
|
||||||
|
|
||||||
|
# PE_FETCH_STORE._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
|
||||||
|
yield tcm_done
|
||||||
|
|
||||||
|
# PE_DMA._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield from self._do_fabric_dma(token.params)
|
||||||
|
```
|
||||||
|
|
||||||
|
By replacing only the timing model, one can freely switch between cycle-accurate
|
||||||
|
and analytical models. Since the chaining logic resides in the base class,
|
||||||
|
each component only implements its pure stage logic.
|
||||||
|
|
||||||
|
### D7. Topology Changes
|
||||||
|
|
||||||
|
Add PE_FETCH_STORE to the PE template:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
pe_template:
|
||||||
|
components:
|
||||||
|
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, ... }
|
||||||
|
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v2, ... }
|
||||||
|
pe_dma: { kind: pe_dma, impl: pe_dma_v1, ... }
|
||||||
|
pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, ... }
|
||||||
|
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, ... }
|
||||||
|
pe_math: { kind: pe_math, impl: pe_math_v1, ... }
|
||||||
|
pe_mmu: { kind: pe_mmu, impl: pe_mmu_v1, ... }
|
||||||
|
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, ... }
|
||||||
|
links:
|
||||||
|
# existing links...
|
||||||
|
fetch_store_to_tcm_bw_gbs: 512.0
|
||||||
|
fetch_store_to_tcm_mm: 0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
PE internal edge connections:
|
||||||
|
```
|
||||||
|
PE_SCHEDULER → PE_DMA (initial dispatch)
|
||||||
|
PE_SCHEDULER → PE_FETCH_STORE (initial dispatch)
|
||||||
|
PE_SCHEDULER → PE_GEMM (initial dispatch)
|
||||||
|
PE_SCHEDULER → PE_MATH (initial dispatch)
|
||||||
|
PE_DMA → PE_FETCH_STORE (chaining)
|
||||||
|
PE_FETCH_STORE → PE_GEMM (chaining)
|
||||||
|
PE_FETCH_STORE → PE_MATH (chaining)
|
||||||
|
PE_GEMM → PE_FETCH_STORE (store chaining)
|
||||||
|
PE_MATH → PE_FETCH_STORE (store chaining)
|
||||||
|
PE_FETCH_STORE → PE_DMA (writeback chaining)
|
||||||
|
PE_FETCH_STORE → PE_TCM (BW request)
|
||||||
|
```
|
||||||
|
|
||||||
|
Topology edges cover both **control/dispatch visibility and runtime chaining**.
|
||||||
|
Scheduler → sub-component edges are initial dispatch paths, while
|
||||||
|
inter-component edges are runtime chaining paths driven by token self-routing.
|
||||||
|
|
||||||
|
### D8. Existing Code Migration — Builtin Integration
|
||||||
|
|
||||||
|
The existing builtin v1 components and pe_accel are **replaced with new builtin components**.
|
||||||
|
|
||||||
|
#### Migration Strategy
|
||||||
|
|
||||||
|
1. Back up existing `components/builtin/` → `components/builtin_legacy/` (preserved without modification)
|
||||||
|
2. Back up existing `components/custom/pe_accel/` → likewise
|
||||||
|
3. Re-implement new `components/builtin/` with the ADR-0021 architecture
|
||||||
|
4. Maintain **only one** topology.yaml (including pe_fetch_store)
|
||||||
|
5. components.yaml points to the new builtin
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# components.yaml — new builtin
|
||||||
|
pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
|
||||||
|
pe_gemm_v1: kernbench.components.builtin.pe_gemm:PeGemmComponent
|
||||||
|
pe_math_v1: kernbench.components.builtin.pe_math:PeMathComponent
|
||||||
|
pe_dma_v1: kernbench.components.builtin.pe_dma:PeDmaComponent
|
||||||
|
pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
|
||||||
|
pe_tcm_v1: kernbench.components.builtin.pe_tcm:PeTcmComponent
|
||||||
|
```
|
||||||
|
|
||||||
|
The impl names (pe_gemm_v1, etc.) are preserved, but **the implementations are replaced
|
||||||
|
with the ADR-0021 architecture**. Existing benchmarks and tests referencing topology.yaml
|
||||||
|
continue to work without changes.
|
||||||
|
|
||||||
|
#### Latency Model Inheritance
|
||||||
|
|
||||||
|
The latency modeling of the new builtin components (MAC cycle calculation, SIMD latency,
|
||||||
|
TCM BW serialization, DMA fabric latency, etc.) is **based on the current pe_accel
|
||||||
|
implementation**. The tile schedule generation logic from tiling.py is also carried over.
|
||||||
|
Only the architecture (component separation, self-routing) changes; timing accuracy
|
||||||
|
is preserved.
|
||||||
|
|
||||||
|
#### Test Strategy
|
||||||
|
|
||||||
|
#### Test Plan
|
||||||
|
|
||||||
|
**1. Existing test pass** (regression):
|
||||||
|
After migration is complete, all existing tests (366) must pass.
|
||||||
|
|
||||||
|
**2. Latency regression**:
|
||||||
|
Verify that the new builtin produces identical latency for the same inputs as pe_accel.
|
||||||
|
|
||||||
|
**3. Phase 1 → Phase 2 end-to-end**:
|
||||||
|
Integration test from SimPy simulation (Phase 1) op_log generation → DataExecutor
|
||||||
|
(Phase 2) actual numpy computation → result correctness verification.
|
||||||
|
- GEMM: tl.composite(gemm) → op_log → Phase 2 matmul → allclose verification
|
||||||
|
- MATH: tl.exp / tl.add, etc. → op_log → Phase 2 numpy op → allclose verification
|
||||||
|
- Chaining: GEMM output → MATH input → final result end-to-end verification
|
||||||
|
|
||||||
|
**4. TileToken self-routing**:
|
||||||
|
- Verify that tiles chain according to the plan's stage sequence
|
||||||
|
- Verify PipelineContext.complete_tile() exactly-once at the last stage
|
||||||
|
- Queue backpressure: verify that only the feeder blocks when DMA queue capacity is exceeded
|
||||||
|
|
||||||
|
**5. Asynchronous pipeline overlap**:
|
||||||
|
- Verify that inter-tile stage overlap occurs within the same command (tile0 in GEMM while tile1 in DMA)
|
||||||
|
- Multiple commands: verify that cmd2 feed starts after cmd1 feed completes (FIFO order)
|
||||||
|
|
||||||
|
### D9. TileToken Message Definition
|
||||||
|
|
||||||
|
A message used for passing tile work between components.
|
||||||
|
The token carries the plan and stage index, enabling self-routing.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class TileToken:
|
||||||
|
tile_id: int
|
||||||
|
pipeline_ctx: PipelineContext # completion tracking
|
||||||
|
plan: TilePlan # full stage sequence for this tile (immutable)
|
||||||
|
stage_idx: int # current stage index in plan.stages
|
||||||
|
params: dict # current stage parameter cache (canonical: plan.stages[stage_idx].params)
|
||||||
|
data_op: bool = True # op_log recording target (ADR-0020)
|
||||||
|
```
|
||||||
|
|
||||||
|
A TileToken is **owned by exactly one component at a time** and
|
||||||
|
is never referenced by multiple components simultaneously (single-owner).
|
||||||
|
|
||||||
|
Token lifecycle:
|
||||||
|
1. The scheduler creates the token with stage_idx=0 and puts it on the first-stage component's input queue
|
||||||
|
2. Each component runs _process(), increments stage_idx, and puts the token on the next stage's component
|
||||||
|
3. The last stage component calls pipeline_ctx.complete_tile()
|
||||||
|
4. When all tiles are complete, PipelineContext calls done_event.succeed()
|
||||||
|
|
||||||
|
Relationship with existing PeInternalTxn:
|
||||||
|
- PeInternalTxn: command transfer between PE_CPU → PE_SCHEDULER (existing, unchanged)
|
||||||
|
- TileToken: per-tile work transfer from PE_SCHEDULER → sub-components (new, self-routing)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **PE_CPU changes**: the PE_CPU → PE_SCHEDULER interface is not modified
|
||||||
|
(PeInternalTxn-based, ADR-0014 maintained)
|
||||||
|
- **Resource contention model across multiple pipelines**: the current scope focuses on
|
||||||
|
accurate modeling of a single pipeline. TCM bank conflicts across multiple pipelines
|
||||||
|
are future work.
|
||||||
|
- **builtin_legacy maintenance**: kept for backup purposes only; not a target for
|
||||||
|
bug fixes or feature additions.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- **Register File capacity model**: whether to model capacity limits when the fetch unit
|
||||||
|
loads into registers. Capacity is expressed in bytes (register_file_bytes), and
|
||||||
|
the number of tiles that can be held simultaneously is determined by tile size.
|
||||||
|
When capacity is exceeded, fetch stalls, creating natural backpressure.
|
||||||
|
- **Prefetch strategy**: this ADR does not allow tile feed interleaving across composite
|
||||||
|
commands. Therefore, overlap arises not from pre-injection across commands, but
|
||||||
|
naturally from pipeline progression of tiles within the same command.
|
||||||
|
If additional prefetch is needed, it should be considered at the level of tile ordering
|
||||||
|
within the same command or fetch/store unit policy, not cross-command injection.
|
||||||
|
- **PE_DMA coalescing**: per-tile DMA may cause fragmentation.
|
||||||
|
The intended direction is to merge/coalesce inside PE_DMA without involving the scheduler.
|
||||||
|
- **Synchronous execution mode**: this ADR adopts asynchronous pipeline as the
|
||||||
|
default/sole execution model. If a sync mode is needed for debug or validation
|
||||||
|
purposes, it will be considered in a future ADR.
|
||||||
|
- **TCM bank conflict across multiple pipelines**: currently based on a single pipeline.
|
||||||
|
Bank conflict modeling when multiple pipelines simultaneously access TCM is future work.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- Each block is an independent component — individually replaceable (ADR-0015 compliant)
|
||||||
|
- PE internal structure is visible in the topology
|
||||||
|
- Components do not know the next component — plan-based routing provides flexibility
|
||||||
|
- Natural pipeline overlap between DMA and compute (SimPy Store backpressure)
|
||||||
|
- Improved HW modeling accuracy (done signal = Event, data transfer = message)
|
||||||
|
- Fetch/store separation enables accurate TCM BW contention modeling
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- Increased number of PE internal components (5 → 6) — more topology nodes/edges
|
||||||
|
- Component separation makes intra-PE token forwarding more explicit than before
|
||||||
|
- Breaking change from existing builtin/pe_accel — migration required
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected Files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `topology.yaml` | Add pe_fetch_store component, add chaining edges |
|
||||||
|
| `components.yaml` | Register new builtin components |
|
||||||
|
| `src/kernbench/topology/builder.py` | Add fetch_store + chaining edges to PE internal edges |
|
||||||
|
| `src/kernbench/common/pe_commands.py` | Add TileToken definition |
|
||||||
|
| `src/kernbench/components/builtin/pe_scheduler.py` | Re-implement (feeder + plan-based dispatch) |
|
||||||
|
| `src/kernbench/components/builtin/pe_gemm.py` | Re-implement (TileToken, _process pattern) |
|
||||||
|
| `src/kernbench/components/builtin/pe_math.py` | Re-implement (TileToken, _process pattern) |
|
||||||
|
| `src/kernbench/components/builtin/pe_dma.py` | Re-implement (TileToken, _process pattern) |
|
||||||
|
| `src/kernbench/components/builtin/pe_fetch_store.py` | New |
|
||||||
|
| `src/kernbench/components/builtin/pe_tcm.py` | Re-implement (TcmRequest service) |
|
||||||
|
| `src/kernbench/components/builtin/types.py` | New: TilePlan, Stage, StageType, PipelineContext, TileToken |
|
||||||
|
| `src/kernbench/components/builtin/tiling.py` | Ported from pe_accel: plan generation logic |
|
||||||
|
|
||||||
|
Backup:

| Path | Change |
|------|--------|
| `src/kernbench/components/builtin_legacy/` | Full backup of existing builtin (preserved without modification) |
| `src/kernbench/components/custom/pe_accel/` | Backup of existing pe_accel (preserved without modification) |
|
||||||
@@ -0,0 +1,528 @@
|
|||||||
|
# ADR-0021: PE 파이프라인 리팩토링 — 컴포넌트 분리 + Scheduler 기반 라우팅
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 현재 구조의 문제
|
||||||
|
|
||||||
|
pe_accel (SchedulerV2Component)은 5개 하드웨어 블록(DmaIn, DmaWb, Gemm, Math, Tcm)을
|
||||||
|
**단일 컴포넌트 내부**에 숨기고 있다.
|
||||||
|
|
||||||
|
```
|
||||||
|
SchedulerV2Component (단일 topology 노드)
|
||||||
|
├── DmaInBlock ← 내부 SimPy Store로 직접 연결
|
||||||
|
├── DmaWbBlock ← topology에 안 보임
|
||||||
|
├── GemmBlock ← 교체 불가
|
||||||
|
├── MathBlock ← 교체 불가
|
||||||
|
└── TcmBlock ← 교체 불가
|
||||||
|
```
|
||||||
|
|
||||||
|
문제점:
|
||||||
|
- 블록이 다음 블록을 `desc.next_block`으로 직접 참조 — 하드코딩된 라우팅
|
||||||
|
- 개별 블록 교체 불가 (ADR-0015 컴포넌트 교체 원칙 위배)
|
||||||
|
- topology에서 PE 내부 구조가 보이지 않음
|
||||||
|
- GemmBlock과 MathBlock이 TCM load/store 로직을 각각 중복 구현
|
||||||
|
|
||||||
|
### 실제 하드웨어 구조
|
||||||
|
|
||||||
|
```
|
||||||
|
HBM ←(DMA)→ TCM ←(Fetch/Store Unit)→ Register File ←→ GEMM/MATH Engine
|
||||||
|
```
|
||||||
|
|
||||||
|
- DMA: HBM ↔ TCM 전송 (fabric 경유, 수십~수백 ns)
|
||||||
|
- Fetch/Store Unit: TCM ↔ Register File 전송 (BW 기반, 수 ns)
|
||||||
|
- GEMM/MATH Engine: Register File 간 연산 (cycle-accurate)
|
||||||
|
- 완료 신호: PE 내부 1-cycle wire signal (done pin assert)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. 각 블록을 독립 컴포넌트로 분리
|
||||||
|
|
||||||
|
pe_accel의 내부 블록을 **독립 PeEngineBase 컴포넌트**로 분리한다.
|
||||||
|
기존 5개 + Fetch/Store Unit 1개 = 6개 컴포넌트.
|
||||||
|
|
||||||
|
| 컴포넌트 | 역할 | HW 대응 |
|
||||||
|
|----------|------|---------|
|
||||||
|
| PE_SCHEDULER | plan 생성, tile 상태 관리, stage 라우팅 | Scheduler/Sequencer |
|
||||||
|
| PE_DMA | HBM ↔ TCM (fabric 경유) | DMA Engine |
|
||||||
|
| PE_FETCH_STORE | TCM ↔ Register File | Load/Store Unit |
|
||||||
|
| PE_GEMM | MAC compute (register only) | MAC Array |
|
||||||
|
| PE_MATH | element-wise/reduction (register only) | SIMD/Vector Unit |
|
||||||
|
| PE_TCM | BW-serialized scratchpad | SRAM Bank |
|
||||||
|
|
||||||
|
각 컴포넌트는 topology 노드로 존재하며, port/wire로 연결된다.
|
||||||
|
`impl`을 교체하면 개별 블록의 타이밍 모델을 변경할 수 있다.
|
||||||
|
|
||||||
|
### D2. Token Self-Routing — Scheduler는 dispatch + completion만
|
||||||
|
|
||||||
|
**컴포넌트가 매 stage마다 scheduler를 경유하지 않는다.**
|
||||||
|
Token이 plan을 가지고 있어 컴포넌트가 직접 다음 stage로 체이닝한다.
|
||||||
|
|
||||||
|
```
|
||||||
|
Scheduler → DMA → Fetch → GEMM → Math → Store → DMA_WB → (done) → Scheduler
|
||||||
|
↑ DMA ~ DMA_WB 구간: 체이닝 (scheduler를 거치지 않음) / 마지막 (done) → Scheduler: completion 통지만
|
||||||
|
```
|
||||||
|
|
||||||
|
이는 실제 HW에서 각 블록의 done signal이 다음 블록에 직접 wire로 연결되어
|
||||||
|
있는 구조와 일치한다. Scheduler는 **초기 dispatch + completion aggregation만** 담당.
|
||||||
|
|
||||||
|
#### Stage 정의
|
||||||
|
|
||||||
|
```python
|
||||||
|
class StageType(Enum):
|
||||||
|
DMA_READ = 0
|
||||||
|
FETCH = 1
|
||||||
|
GEMM = 2
|
||||||
|
MATH = 3
|
||||||
|
STORE = 4
|
||||||
|
DMA_WRITE = 5
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Plan 구조
|
||||||
|
|
||||||
|
Scheduler가 CompositeCmd를 받으면 **tile 단위 실행 plan**을 생성한다.
|
||||||
|
Plan은 각 tile의 **stage sequence**를 정의한다:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class Stage:
|
||||||
|
stage_type: StageType
|
||||||
|
component: str # topology 노드 ID (e.g. "sip0.cube0.pe0.pe_dma")
|
||||||
|
params: dict # stage별 파라미터 (dynamic)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class TilePlan:
|
||||||
|
tile_id: int
|
||||||
|
stages: tuple[Stage, ...] # 순서대로 실행할 stage 목록 (immutable)
|
||||||
|
```
|
||||||
|
|
||||||
|
Plan에 따라 stage sequence가 달라진다:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 일반 GEMM: HBM → TCM → Register → Compute → Register → TCM → HBM
|
||||||
|
stages = (DMA_READ, FETCH, GEMM, STORE, DMA_WRITE)
|
||||||
|
|
||||||
|
# TCM 데이터로 바로 GEMM (DMA read 생략):
|
||||||
|
stages = (FETCH, GEMM, STORE, DMA_WRITE)
|
||||||
|
|
||||||
|
# MATH element-wise:
|
||||||
|
stages = (DMA_READ, FETCH, MATH, STORE, DMA_WRITE)
|
||||||
|
|
||||||
|
# GEMM + accumulation (중간 K-tile, writeback 생략):
|
||||||
|
stages = (DMA_READ, FETCH, GEMM, STORE) # store to TCM only
|
||||||
|
```
|
||||||
|
|
||||||
|
**컴포넌트는 다음 컴포넌트를 하드코딩하지 않는다.**
|
||||||
|
Token의 plan에서 다음 stage를 읽고, out_port로 직접 전달한다.
|
||||||
|
네트워크 패킷이 라우팅 헤더를 가지고 있는 것과 같은 패턴이다.
|
||||||
|
|
||||||
|
#### Pipeline Context
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class PipelineContext:
|
||||||
|
id: str
|
||||||
|
total_tiles: int
|
||||||
|
completed_tiles: int = 0
|
||||||
|
done_event: simpy.Event = None # 모든 tile 완료 시 succeed
|
||||||
|
|
||||||
|
def complete_tile(self) -> None:
|
||||||
|
self.completed_tiles += 1
|
||||||
|
if self.completed_tiles == self.total_tiles:
|
||||||
|
self.done_event.succeed()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Completion은 exactly-once contract**: 각 tile의 마지막 stage는 정확히 한 번만
|
||||||
|
`complete_tile()`을 호출해야 한다. 중복 호출은 버그이며, `done_event`는
|
||||||
|
단 한 번만 succeed되어야 한다 (SimPy Event 제약).
|
||||||
|
|
||||||
|
#### Scheduler 역할 (축소됨)
|
||||||
|
|
||||||
|
Scheduler는 CompositeCmd를 받으면 plan과 PipelineContext를 생성한 뒤,
|
||||||
|
이를 scheduler 내부의 `_pending_feeds` FIFO에 enqueue하고 즉시 리턴한다.
|
||||||
|
|
||||||
|
실제 tile 투입은 **단일 feeder process** (`_feed_loop`)가 담당한다.
|
||||||
|
이 feeder는 `_pending_feeds`를 FIFO 순서로 소비하며,
|
||||||
|
**composite command 간 tile feed interleaving은 허용하지 않는다.**
|
||||||
|
즉, 한 command의 모든 tile이 첫 stage queue에 투입된 후에만
|
||||||
|
다음 command의 feed가 시작된다.
|
||||||
|
|
||||||
|
Scheduler당 `_feed_loop`는 **정확히 하나만** 존재하며,
|
||||||
|
composite command의 tile feed는 이 단일 process를 통해서만 수행된다.
|
||||||
|
Command issue order는 **PE_SCHEDULER가 PeInternalTxn을 수신한 순서**를 의미한다.
|
||||||
|
|
||||||
|
이 구조는 command issue order를 유지하면서도, 첫 stage queue full 시
|
||||||
|
feeder process만 block되고 scheduler worker의 inbox 처리 자체는 멈추지 않도록 한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PeSchedulerV2(PeEngineBase):
|
||||||
|
_pipelines: dict[str, PipelineContext]
|
||||||
|
_pending_feeds: simpy.Store # FIFO of (plan, ctx)
|
||||||
|
|
||||||
|
def start(self, env):
|
||||||
|
super().start(env)
|
||||||
|
self._pending_feeds = simpy.Store(env)
|
||||||
|
env.process(self._feed_loop(env))
|
||||||
|
|
||||||
|
def _dispatch_composite(self, env, pe_txn, cmd):
|
||||||
|
plan = generate_plan(cmd)
|
||||||
|
ctx = PipelineContext(
|
||||||
|
id=next_id(),
|
||||||
|
total_tiles=len(plan.tiles),
|
||||||
|
done_event=pe_txn.done,
|
||||||
|
)
|
||||||
|
self._pipelines[ctx.id] = ctx
|
||||||
|
|
||||||
|
# feeder queue에 등록만 하고 즉시 리턴
|
||||||
|
yield self._pending_feeds.put((plan, ctx))
|
||||||
|
|
||||||
|
def _feed_loop(self, env):
|
||||||
|
"""단일 feeder process: composite command를 FIFO 순서로 feed.
|
||||||
|
|
||||||
|
Composite command 간 tile feed interleaving은 허용하지 않는다.
|
||||||
|
한 command의 모든 tile이 첫 stage queue에 투입된 후에만
|
||||||
|
다음 command의 feed가 시작된다.
|
||||||
|
|
||||||
|
첫 stage queue full 시 이 feeder만 block되며,
|
||||||
|
scheduler worker의 inbox 처리는 멈추지 않는다.
|
||||||
|
"""
|
||||||
|
while True:
|
||||||
|
plan, ctx = yield self._pending_feeds.get()
|
||||||
|
for tile in plan.tiles:
|
||||||
|
token = TileToken(
|
||||||
|
tile_id=tile.tile_id,
|
||||||
|
pipeline_ctx=ctx,
|
||||||
|
plan=tile,
|
||||||
|
stage_idx=0,
|
||||||
|
params=tile.stages[0].params,
|
||||||
|
)
|
||||||
|
yield self.out_ports[tile.stages[0].component].put(token)
|
||||||
|
# queue capacity = HW queue depth → full이면 feeder만 block
|
||||||
|
```
|
||||||
|
|
||||||
|
본 ADR에서 scheduler는 여러 composite command를 수용할 수 있으나,
|
||||||
|
tile submission order는 command 단위 FIFO를 따른다.
|
||||||
|
Command 내부에서는 tile-level pipeline overlap을 허용하지만,
|
||||||
|
command 간 tile feed interleaving은 허용하지 않는다.
|
||||||
|
|
||||||
|
### D3. 데이터 전달 vs 완료 신호 — HW 모델링 기준
|
||||||
|
|
||||||
|
| 통신 유형 | 방식 | HW 대응 |
|
||||||
|
|----------|------|---------|
|
||||||
|
| tile token (작업 지시) | message via out_port | command queue에 enqueue |
|
||||||
|
| stage 완료 → 다음 stage | 컴포넌트가 직접 out_port.put | done-triggered local enqueue |
|
||||||
|
| pipeline 완료 → scheduler | PipelineContext.complete_tile() | completion interrupt |
|
||||||
|
|
||||||
|
**Tile token**: out_port.put() 사용. SimPy Store capacity = HW queue depth.
|
||||||
|
|
||||||
|
**Intra-PE chaining latency**: 본 ADR 범위에서는 intra-PE stage trigger에
|
||||||
|
explicit latency model을 두지 않는다. 컴포넌트 간 체이닝은 PE 내부 wire에 해당하며,
|
||||||
|
scheduler 왕복이 없으므로 artificial hop cost가 발생하지 않는다.
|
||||||
|
|
||||||
|
**Pipeline 완료**: 마지막 stage의 컴포넌트가 `pipeline_ctx.complete_tile()` 호출.
|
||||||
|
모든 tile 완료 시 PipelineContext가 done_event.succeed().
|
||||||
|
|
||||||
|
### D4. 비동기 파이프라인 — 자연스러운 overlap
|
||||||
|
|
||||||
|
Scheduler는 CompositeCmd를 **비동기로** 처리한다.
|
||||||
|
다만 tile feed는 command마다 독립 process를 만들지 않고,
|
||||||
|
scheduler 내부의 **단일 feeder process**가 FIFO 순서로 수행한다.
|
||||||
|
따라서 scheduler는 다음 command를 계속 받을 수 있지만,
|
||||||
|
첫-stage tile 투입 순서는 command 단위로 보장된다.
|
||||||
|
|
||||||
|
**SimPy Store capacity = HW queue depth**이므로:
|
||||||
|
- queue가 차면 put()이 자연스럽게 block (backpressure)
|
||||||
|
- DMA가 다음 tile을 처리하는 동안 PE_FETCH_STORE/GEMM은 DMA가 이미 끝난 이전 tile의 fetch와 연산을 진행 (예: tile1 DMA 중 tile0 GEMM)
|
||||||
|
- 두 번째 CompositeCmd의 tile은 첫 번째 command의 feed가 모두 끝난 뒤 첫 stage queue에 바로 이어서 투입 (command 간 interleaving 없음)
|
||||||
|
|
||||||
|
```
|
||||||
|
First-stage feed order (feeder → DMA queue):
|
||||||
|
[cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN] | [cmd2:t0][cmd2:t1]...
|
||||||
|
↑ cmd1 feed 완료 후 cmd2 시작
|
||||||
|
|
||||||
|
Runtime pipeline (downstream overlap):
|
||||||
|
PE_DMA: [cmd1:t0][cmd1:t1][cmd1:t2]...[cmd1:tN][cmd2:t0][cmd2:t1]...
|
||||||
|
PE_FETCH: [cmd1:t0][cmd1:t1]...
|
||||||
|
PE_GEMM: [cmd1:t0][cmd1:t1]...
|
||||||
|
↑ 같은 cmd 내부에서 pipeline overlap
|
||||||
|
```
|
||||||
|
|
||||||
|
이때 overlap은 서로 다른 command의 tile feed interleaving에서 오는 것이 아니라,
|
||||||
|
먼저 투입된 command의 tile들이 downstream stage로 진행되는 동안 feeder가
|
||||||
|
다음 tile들을 계속 투입하면서 자연스럽게 발생한다.
|
||||||
|
|
||||||
|
예를 들어 cmd1의 모든 tile이 첫 stage queue에 투입되기 전에는
|
||||||
|
cmd2의 tile feed는 시작되지 않는다. 그러나 cmd1.tile0이 이미 GEMM으로
|
||||||
|
진행한 상태에서 cmd1.tile1, cmd1.tile2가 DMA/FETCH에 남아 있을 수 있으므로,
|
||||||
|
**같은 command 내부에서는 pipeline overlap이 자연스럽게 발생**한다.
|
||||||
|
|
||||||
|
#### 컴포넌트 체이닝 패턴
|
||||||
|
|
||||||
|
모든 컴포넌트가 동일한 패턴을 따른다:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _pipeline_worker(self, env):
|
||||||
|
while True:
|
||||||
|
token = yield self._inbox.get()
|
||||||
|
|
||||||
|
# 자기 stage 처리
|
||||||
|
yield from self._process(env, token)
|
||||||
|
|
||||||
|
# 다음 stage로 체이닝 (plan에서 읽음)
|
||||||
|
next_idx = token.stage_idx + 1
|
||||||
|
if next_idx < len(token.plan.stages):
|
||||||
|
next_stage = token.plan.stages[next_idx]
|
||||||
|
token.stage_idx = next_idx
|
||||||
|
token.params = next_stage.params
|
||||||
|
yield self.out_ports[next_stage.component].put(token)
|
||||||
|
else:
|
||||||
|
# 마지막 stage — pipeline completion
|
||||||
|
token.pipeline_ctx.complete_tile()
|
||||||
|
```
|
||||||
|
|
||||||
|
### D5. PE_FETCH_STORE — TCM ↔ Register File 전담
|
||||||
|
|
||||||
|
기존에 GemmBlock과 MathBlock이 각각 TCM read/write를 구현했으나,
|
||||||
|
이를 **PE_FETCH_STORE 컴포넌트**로 분리한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# PE_FETCH_STORE._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
|
||||||
|
yield tcm_done
|
||||||
|
# 체이닝은 base class가 처리 (D4 패턴)
|
||||||
|
```
|
||||||
|
|
||||||
|
장점:
|
||||||
|
- GEMM/MATH는 **순수 compute만** — TCM 접근 로직 없음
|
||||||
|
- fetch/store BW 경합이 자연스럽게 모델링됨 (PE_TCM의 resource로 serialization)
|
||||||
|
- prefetch 전략 등 fetch unit 단독 교체로 실험 가능
|
||||||
|
|
||||||
|
### D6. 각 Compute 컴포넌트의 단순화
|
||||||
|
|
||||||
|
GEMM/MATH는 register 데이터가 이미 준비된 상태에서 compute만 수행.
|
||||||
|
**체이닝은 공통 패턴(D4)을 따르므로, _process()만 구현하면 된다:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# PE_GEMM._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield env.timeout(self._mac_latency(token.params))
|
||||||
|
|
||||||
|
# PE_MATH._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield env.timeout(self._simd_latency(token.params))
|
||||||
|
|
||||||
|
# PE_FETCH_STORE._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield self.out_ports[tcm_id].put(TcmRequest(token.params["direction"], ...))
|
||||||
|
yield tcm_done
|
||||||
|
|
||||||
|
# PE_DMA._process()
|
||||||
|
def _process(self, env, token):
|
||||||
|
yield from self._do_fabric_dma(token.params)
|
||||||
|
```
|
||||||
|
|
||||||
|
타이밍 모델만 교체하면 cycle-accurate든 analytical든 자유롭게 변경 가능.
|
||||||
|
체이닝 로직은 base class에 있으므로 각 컴포넌트는 순수 stage 로직만 구현.
|
||||||
|
|
||||||
|
### D7. Topology 변경
|
||||||
|
|
||||||
|
PE template에 PE_FETCH_STORE 추가:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
pe_template:
|
||||||
|
components:
|
||||||
|
pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, ... }
|
||||||
|
pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, ... }
|
||||||
|
pe_dma: { kind: pe_dma, impl: pe_dma_v1, ... }
|
||||||
|
pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, ... }
|
||||||
|
pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, ... }
|
||||||
|
pe_math: { kind: pe_math, impl: pe_math_v1, ... }
|
||||||
|
pe_mmu: { kind: pe_mmu, impl: pe_mmu_v1, ... }
|
||||||
|
pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, ... }
|
||||||
|
links:
|
||||||
|
# 기존 links...
|
||||||
|
fetch_store_to_tcm_bw_gbs: 512.0
|
||||||
|
fetch_store_to_tcm_mm: 0.0
|
||||||
|
```
|
||||||
|
|
||||||
|
PE 내부 edge 연결:
|
||||||
|
```
|
||||||
|
PE_SCHEDULER → PE_DMA (초기 dispatch)
|
||||||
|
PE_SCHEDULER → PE_FETCH_STORE (초기 dispatch)
|
||||||
|
PE_SCHEDULER → PE_GEMM (초기 dispatch)
|
||||||
|
PE_SCHEDULER → PE_MATH (초기 dispatch)
|
||||||
|
PE_DMA → PE_FETCH_STORE (체이닝)
|
||||||
|
PE_FETCH_STORE → PE_GEMM (체이닝)
|
||||||
|
PE_FETCH_STORE → PE_MATH (체이닝)
|
||||||
|
PE_GEMM → PE_FETCH_STORE (store 체이닝)
|
||||||
|
PE_MATH → PE_FETCH_STORE (store 체이닝)
|
||||||
|
PE_FETCH_STORE → PE_DMA (writeback 체이닝)
|
||||||
|
PE_FETCH_STORE → PE_TCM (BW 요청)
|
||||||
|
```
|
||||||
|
|
||||||
|
Topology edge는 **control/dispatch visibility + runtime chaining** 양쪽을 포함한다.
|
||||||
|
Scheduler → 하위 컴포넌트 edge는 초기 dispatch 경로이며,
|
||||||
|
컴포넌트 간 edge는 token self-routing에 의한 runtime chaining 경로이다.
|
||||||
|
|
||||||
|
### D8. 기존 코드 마이그레이션 — builtin 통합
|
||||||
|
|
||||||
|
기존 builtin v1 컴포넌트와 pe_accel을 **새 builtin으로 교체**한다.
|
||||||
|
|
||||||
|
#### 마이그레이션 전략
|
||||||
|
|
||||||
|
1. 기존 `components/builtin/` → `components/builtin_legacy/`로 백업 (수정 없이 보관)
|
||||||
|
2. 기존 `components/custom/pe_accel/` → 동일하게 백업
|
||||||
|
3. 새 `components/builtin/`에 ADR-0021 아키텍처로 재구현
|
||||||
|
4. topology.yaml은 **하나만 유지** (pe_fetch_store 포함)
|
||||||
|
5. components.yaml은 새 builtin을 가리킴
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# components.yaml — 새 builtin
|
||||||
|
pe_scheduler_v1: kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
|
||||||
|
pe_gemm_v1: kernbench.components.builtin.pe_gemm:PeGemmComponent
|
||||||
|
pe_math_v1: kernbench.components.builtin.pe_math:PeMathComponent
|
||||||
|
pe_dma_v1: kernbench.components.builtin.pe_dma:PeDmaComponent
|
||||||
|
pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
|
||||||
|
pe_tcm_v1: kernbench.components.builtin.pe_tcm:PeTcmComponent
|
||||||
|
```
|
||||||
|
|
||||||
|
impl 이름(pe_gemm_v1 등)은 유지하되, **구현이 ADR-0021 아키텍처로 교체**된다.
|
||||||
|
기존 벤치마크와 테스트의 topology.yaml 참조는 변경 없이 동작한다.
|
||||||
|
|
||||||
|
#### 레이턴시 모델 계승
|
||||||
|
|
||||||
|
새 builtin 컴포넌트의 레이턴시 모델링(MAC cycle 계산, SIMD latency,
|
||||||
|
TCM BW serialization, DMA fabric latency 등)은 **pe_accel 현재 버전의 구현을 바탕으로** 한다.
|
||||||
|
tiling.py의 tile schedule 생성 로직도 그대로 가져온다.
|
||||||
|
아키텍처(컴포넌트 분리, self-routing)만 변경하고, 타이밍 정확도는 유지한다.
|
||||||
|
|
||||||
|
#### 테스트 전략
|
||||||
|
|
||||||
|
#### 테스트 계획
|
||||||
|
|
||||||
|
**1. 기존 테스트 통과** (regression):
|
||||||
|
마이그레이션 완료 후 기존 테스트(366개)가 전부 통과해야 한다.
|
||||||
|
|
||||||
|
**2. 레이턴시 regression**:
|
||||||
|
pe_accel과 동일한 입력에 대해 새 builtin이 동일 레이턴시를 산출하는지 검증.
|
||||||
|
|
||||||
|
**3. Phase 1 → Phase 2 end-to-end**:
|
||||||
|
SimPy 시뮬레이션(Phase 1)에서 op_log 생성 → DataExecutor(Phase 2)로
|
||||||
|
실제 numpy 연산 → 결과 정합성 검증까지 통합 테스트.
|
||||||
|
- GEMM: tl.composite(gemm) → op_log → Phase 2 matmul → allclose 검증
|
||||||
|
- MATH: tl.exp / tl.add 등 → op_log → Phase 2 numpy op → allclose 검증
|
||||||
|
- 체이닝: GEMM 출력 → MATH 입력 → 최종 결과 end-to-end 검증
|
||||||
|
|
||||||
|
**4. TileToken self-routing**:
|
||||||
|
- tile이 plan의 stage sequence를 따라 체이닝되는지 검증
|
||||||
|
- 마지막 stage에서 PipelineContext.complete_tile() exactly-once 검증
|
||||||
|
- queue backpressure: DMA queue capacity 초과 시 feeder만 block 검증
|
||||||
|
|
||||||
|
**5. 비동기 pipeline overlap**:
|
||||||
|
- 동일 command 내 tile 간 stage overlap 발생 검증 (tile0 GEMM 중 tile1 DMA)
|
||||||
|
- 다중 command: cmd1 feed 완료 후 cmd2 feed 시작 (FIFO 순서) 검증
|
||||||
|
|
||||||
|
### D9. TileToken 메시지 정의
|
||||||
|
|
||||||
|
컴포넌트 간 tile 작업 전달에 사용하는 메시지.
|
||||||
|
Token이 plan과 stage index를 가지고 있어 self-routing이 가능하다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class TileToken:
|
||||||
|
tile_id: int
|
||||||
|
pipeline_ctx: PipelineContext # completion 추적
|
||||||
|
plan: TilePlan # 이 tile의 전체 stage sequence (immutable)
|
||||||
|
stage_idx: int # 현재 stage index in plan.stages
|
||||||
|
params: dict # current stage 파라미터 캐시 (canonical: plan.stages[stage_idx].params)
|
||||||
|
data_op: bool = True # op_log 기록 대상 (ADR-0020)
|
||||||
|
```
|
||||||
|
|
||||||
|
TileToken은 한 시점에 **하나의 컴포넌트에 의해서만 소유**되며,
|
||||||
|
동시에 여러 컴포넌트에 의해 참조되지 않는다 (single-owner).
|
||||||
|
|
||||||
|
Token lifecycle:
|
||||||
|
1. Scheduler가 stage_idx=0으로 생성, 첫 stage 컴포넌트에 put
|
||||||
|
2. 컴포넌트가 _process() 실행 후 stage_idx 증가, 다음 컴포넌트에 put
|
||||||
|
3. 마지막 stage 컴포넌트가 pipeline_ctx.complete_tile() 호출
|
||||||
|
4. 모든 tile 완료 시 PipelineContext가 done_event.succeed()
|
||||||
|
|
||||||
|
기존 PeInternalTxn과의 관계:
|
||||||
|
- PeInternalTxn: PE_CPU → PE_SCHEDULER 간 command 전달 (기존 유지)
|
||||||
|
- TileToken: PE_SCHEDULER → 하위 컴포넌트 간 tile 단위 작업 전달 (신규, self-routing)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **PE_CPU 변경**: PE_CPU → PE_SCHEDULER 인터페이스는 변경하지 않음
|
||||||
|
(PeInternalTxn 기반, ADR-0014 유지)
|
||||||
|
- **다중 pipeline 간 자원 경합 모델**: 현재 범위에서는 단일 pipeline의
|
||||||
|
정확한 모델링에 집중. 다중 pipeline 간 TCM bank conflict 등은 future work.
|
||||||
|
- **builtin_legacy 유지보수**: 백업 목적이며, 버그 수정이나 기능 추가 대상이 아님.
|
||||||
|
|
||||||
|
## Open Questions
|
||||||
|
|
||||||
|
- **Register File 용량 모델**: fetch unit이 register에 로드할 때 용량 제한을
|
||||||
|
모델링할지. 용량은 바이트 단위(register_file_bytes)로 표현하며,
|
||||||
|
동시에 보유 가능한 tile 수는 tile 크기에 따라 결정된다.
|
||||||
|
용량 초과 시 fetch가 stall되어 자연스러운 backpressure가 발생한다.
|
||||||
|
- **Prefetch 전략**: 본 ADR에서는 composite command 간 tile feed interleaving을
|
||||||
|
허용하지 않는다. 따라서 overlap은 command 간 선행 투입이 아니라,
|
||||||
|
같은 command 내부 tile들의 pipeline progression에서 자연스럽게 발생한다.
|
||||||
|
추가적인 prefetch가 필요하면 command 간 투입이 아니라, 같은 command 내부에서의
|
||||||
|
tile ordering 또는 fetch/store unit policy 차원에서 검토한다.
|
||||||
|
- **PE_DMA coalescing**: tile 단위 DMA는 fragmentation 발생 가능.
|
||||||
|
DMA 내부에서 merge/coalesce하되 scheduler는 관여하지 않는 방향.
|
||||||
|
- **동기 실행 모드**: 본 ADR에서는 비동기 pipeline을 기본/유일 execution model로
|
||||||
|
채택한다. 디버그 또는 validation 목적의 sync mode가 필요하면 future ADR에서 검토.
|
||||||
|
- **다중 pipeline 간 TCM bank conflict**: 현재 단일 pipeline 기준.
|
||||||
|
다중 pipeline이 동시에 TCM에 접근할 때의 bank conflict 모델은 future work.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### 긍정적
|
||||||
|
|
||||||
|
- 각 블록이 독립 컴포넌트 — 개별 교체 가능 (ADR-0015 준수)
|
||||||
|
- topology에서 PE 내부 구조 가시화
|
||||||
|
- 컴포넌트가 다음 컴포넌트를 모름 — plan 기반 라우팅으로 유연성 확보
|
||||||
|
- DMA와 compute의 자연스러운 파이프라인 overlap (SimPy Store backpressure)
|
||||||
|
- HW 모델링 정확도 향상 (done signal = Event, data transfer = message)
|
||||||
|
- fetch/store 분리로 TCM BW 경합 정확히 모델링
|
||||||
|
|
||||||
|
### 부정적
|
||||||
|
|
||||||
|
- PE 내부 컴포넌트 수 증가 (5 → 6) — topology 노드/edge 증가
|
||||||
|
- 컴포넌트 분리로 인해 intra-PE token forwarding이 이전 대비 더 명시적으로 드러남
|
||||||
|
- 기존 builtin/pe_accel과의 breaking change — 마이그레이션 필요
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 영향받는 파일
|
||||||
|
|
||||||
|
| 파일 | 변경 |
|
||||||
|
|------|------|
|
||||||
|
| `topology.yaml` | pe_fetch_store 컴포넌트 추가, 체이닝 edge 추가 |
|
||||||
|
| `components.yaml` | 새 builtin 컴포넌트 등록 |
|
||||||
|
| `src/kernbench/topology/builder.py` | PE 내부 edge에 fetch_store + 체이닝 edge 추가 |
|
||||||
|
| `src/kernbench/common/pe_commands.py` | TileToken 정의 추가 |
|
||||||
|
| `src/kernbench/components/builtin/pe_scheduler.py` | 재구현 (feeder + plan 기반 dispatch) |
|
||||||
|
| `src/kernbench/components/builtin/pe_gemm.py` | 재구현 (TileToken, _process 패턴) |
|
||||||
|
| `src/kernbench/components/builtin/pe_math.py` | 재구현 (TileToken, _process 패턴) |
|
||||||
|
| `src/kernbench/components/builtin/pe_dma.py` | 재구현 (TileToken, _process 패턴) |
|
||||||
|
| `src/kernbench/components/builtin/pe_fetch_store.py` | 신규 |
|
||||||
|
| `src/kernbench/components/builtin/pe_tcm.py` | 재구현 (TcmRequest 서비스) |
|
||||||
|
| `src/kernbench/components/builtin/types.py` | 신규: TilePlan, Stage, StageType, PipelineContext, TileToken |
|
||||||
|
| `src/kernbench/components/builtin/tiling.py` | pe_accel에서 이식: plan 생성 로직 |
|
||||||
|
|
||||||
|
백업:

| 경로 | 변경 |
|------|------|
| `src/kernbench/components/builtin_legacy/` | 기존 builtin 전체 백업 (수정 없이 보관) |
| `src/kernbench/components/custom/pe_accel/` | 기존 pe_accel 백업 (수정 없이 보관) |
|
||||||
@@ -0,0 +1,90 @@
|
|||||||
|
# ADR-0022: 2D Grid program_id Semantics
|
||||||
|
|
||||||
|
- **Status**: Accepted
|
||||||
|
- **Date**: 2026-04-09
|
||||||
|
- **Context**: Triton-style kernel addressing for multi-cube PE topology
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
Triton kernels use `tl.program_id(axis)` to identify their position in a launch grid.
|
||||||
|
Our hardware has a 2-level hierarchy: **cubes** contain **PEs**.
|
||||||
|
The previous implementation ignored the `axis` parameter and always returned a flat PE index,
|
||||||
|
making it impossible for kernels to distinguish their cube-local position from their cube identity.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
Map `tl.program_id` and `tl.num_programs` to the 2D hardware grid:
|
||||||
|
|
||||||
|
| Call | Returns | Description |
|
||||||
|
|------|---------|-------------|
|
||||||
|
| `tl.program_id(axis=0)` | `local_pe_id` | PE index within cube |
|
||||||
|
| `tl.program_id(axis=1)` | `cube_id` | Cube index |
|
||||||
|
| `tl.num_programs(axis=0)` | `num_pes_per_cube` | PEs per cube |
|
||||||
|
| `tl.num_programs(axis=1)` | `num_cubes` | Total cubes |
|
||||||
|
|
||||||
|
Global PID is derived as:
|
||||||
|
|
||||||
|
```python
|
||||||
|
global_pid = tl.program_id(axis=1) * tl.num_programs(axis=0) + tl.program_id(axis=0)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Axis mapping rationale
|
||||||
|
|
||||||
|
- **axis=0 = PE (innermost)**: PEs within a cube share HBM and communicate via local NOC mesh. This is the fast, tightly-coupled dimension — analogous to threads within a block.
|
||||||
|
- **axis=1 = Cube (outer)**: Cross-cube communication goes through UCIe with higher latency. This is the coarser scheduling dimension — analogous to blocks in a grid.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### TLContext (`triton_emu/tl_context.py`)
|
||||||
|
|
||||||
|
Added `cube_id` and `num_cubes` constructor parameters. `program_id()` and `num_programs()` dispatch on `axis`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def program_id(self, axis: int = 0) -> int:
|
||||||
|
if axis == 1:
|
||||||
|
return self._cube_id
|
||||||
|
return self._pe_id
|
||||||
|
|
||||||
|
def num_programs(self, axis: int = 0) -> int:
|
||||||
|
if axis == 1:
|
||||||
|
return self._num_cubes
|
||||||
|
return self._num_programs
|
||||||
|
```
|
||||||
|
|
||||||
|
### PE_CPU (`components/builtin/pe_cpu.py`)
|
||||||
|
|
||||||
|
- Extracts `num_cubes` from `ctx.spec["system"]["sips"]["cubes_per_sip"]`
|
||||||
|
- Passes `cube_id` (already available as `self._cube_idx`) and `num_cubes` to TLContext
|
||||||
|
|
||||||
|
### KernelRunner (`triton_emu/kernel_runner.py`)
|
||||||
|
|
||||||
|
- Receives `num_cubes` from PE_CPU
|
||||||
|
- Passes `cube_id` and `num_cubes` to TLContext in greenlet mode
|
||||||
|
|
||||||
|
## Backward Compatibility
|
||||||
|
|
||||||
|
- Existing code using `tl.program_id(0)` or `tl.program_id()` is unchanged — returns the same PE index as before.
|
||||||
|
- `cube_id` and `num_cubes` default to `0` and `1`, so callers that don't provide them (e.g. unit tests) continue to work.
|
||||||
|
|
||||||
|
## Usage Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
def sharded_gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl):
|
||||||
|
local_pid = tl.program_id(axis=0) # PE within cube
|
||||||
|
cube_id = tl.program_id(axis=1) # which cube
|
||||||
|
global_pid = cube_id * tl.num_programs(axis=0) + local_pid
|
||||||
|
|
||||||
|
# Column-wise sharding across global PID
|
||||||
|
n_per_pid = N // (tl.num_programs(axis=1) * tl.num_programs(axis=0))
|
||||||
|
col_start = global_pid * n_per_pid
|
||||||
|
|
||||||
|
a = tl.load(a_ptr, shape=(M, K), dtype="f16")
|
||||||
|
b = tl.ref(b_ptr + col_start * K * 2, shape=(K, n_per_pid), dtype="f16")
|
||||||
|
h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr + col_start * M * 2)
|
||||||
|
tl.wait(h)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Benchmarks can now express cube-aware sharding and addressing without hardcoding topology dimensions.
|
||||||
|
- Future axis=2 (SIP-level) can be added following the same pattern if needed.
|
||||||
@@ -0,0 +1,866 @@
|
|||||||
|
# ADR-0023: PE-level IPCQ — Inter-PE Collective Communication
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### Goal
|
||||||
|
|
||||||
|
Add the infrastructure that lets CCL (Collective Communication Library)
|
||||||
|
kernels run **inside** a PE. The host just launches a kernel on each
|
||||||
|
SIP; the actual synchronization and data movement happen **inside the
|
||||||
|
PE kernel via an IPCQ (Inter-Process Communication Queue)**.
|
||||||
|
|
||||||
|
This mirrors how NCCL performs NVLink communication inside a GPU
|
||||||
|
kernel, or how Cerebras / Tenstorrent expose core-local communication
|
||||||
|
queues. Host-level collectives (`dist.all_reduce`) are deferred to
|
||||||
|
**future work**; this ADR focuses solely on the kernel-side collective
|
||||||
|
infrastructure.
|
||||||
|
|
||||||
|
### Current state
|
||||||
|
|
||||||
|
- ADR-0021 PE pipeline refactor: each PE is decomposed into components
|
||||||
|
(PE_CPU, PE_SCHEDULER, PE_DMA, PE_FETCH_STORE, PE_GEMM, PE_MATH,
|
||||||
|
PE_TCM, PE_MMU).
|
||||||
|
- No direct PE-to-PE channel exists today. All data movement goes
|
||||||
|
through PE_DMA → cube_noc / UCIe / PCIE → HBM.
|
||||||
|
- A pre-ADR host CCL skeleton exists (`dist.init_process_group(backend="ahbm")`,
|
||||||
|
`_run_ccl_bench` running per-rank greenlets concurrently). The
|
||||||
|
collective itself is a stub.
|
||||||
|
|
||||||
|
### Problems to solve
|
||||||
|
|
||||||
|
1. PE-to-PE direct data movement (writing into a peer's memory).
|
||||||
|
2. Synchronization — the sender must check that the receiver has space
|
||||||
|
in its buffer (backpressure).
|
||||||
|
3. Resource contention between compute traffic and communication
|
||||||
|
traffic (Head-of-Line blocking).
|
||||||
|
4. The host must be able to construct logical neighbor topologies
|
||||||
|
(ring / mesh / tree) per algorithm.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Add a new `PE_IPCQ` component
|
||||||
|
|
||||||
|
A new component `PE_IPCQ` is added inside each PE. It follows the same
|
||||||
|
pattern as PE_GEMM / PE_MATH — modeling a sub-block of the PE as a
|
||||||
|
distinct component.
|
||||||
|
|
||||||
|
```
|
||||||
|
PE
|
||||||
|
├── PE_CPU
|
||||||
|
├── PE_SCHEDULER
|
||||||
|
├── PE_DMA
|
||||||
|
├── PE_IPCQ ← new
|
||||||
|
├── PE_FETCH_STORE
|
||||||
|
├── PE_GEMM
|
||||||
|
├── PE_MATH
|
||||||
|
├── PE_TCM
|
||||||
|
└── PE_MMU
|
||||||
|
```
|
||||||
|
|
||||||
|
**Role separation** (control plane vs. data plane):
|
||||||
|
|
||||||
|
- **PE_IPCQ (control plane)**: ring-buffer address arithmetic, head /
|
||||||
|
tail pointer management, peer pointer caches, backpressure, 4-direction
|
||||||
|
neighbor mapping.
|
||||||
|
- **PE_DMA (data plane)**: actually moves data through cube_noc / UCIe
|
||||||
|
/ PCIE into the peer's memory.
|
||||||
|
|
||||||
|
PE_IPCQ does **not** move data itself — it delegates to PE_DMA.
|
||||||
|
|
||||||
|
### D2. Ring buffer model
|
||||||
|
|
||||||
|
Each PE owns 4 directions (N/S/E/W) × {tx, rx} = 8 ring buffers.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class IpcqQueuePair:
|
||||||
|
direction: Direction # N/S/E/W
|
||||||
|
peer: IpcqEndpoint # set by host at init time (D2.5)
|
||||||
|
tx_buffer_base: int # outgoing data base addr (in our memory)
|
||||||
|
rx_buffer_base: int # incoming data base addr (in our memory)
|
||||||
|
slot_size: int # 1 tile per slot
|
||||||
|
n_slots: int # ring depth
|
||||||
|
my_head: int # next slot we will write/send into
|
||||||
|
my_tail: int # next slot we will read/recv from
|
||||||
|
peer_head_cache: int # peer's last-seen head (updated via D9 piggyback)
|
||||||
|
peer_tail_cache: int # peer's last-seen tail (updated via D9 fast-path credit)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Canonical field names**: throughout this ADR the four names above
|
||||||
|
(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`) are used
|
||||||
|
consistently. Synonyms (`peer_head_local`, `peer_head`, `peer_tail`,
|
||||||
|
etc.) are not used.
|
||||||
|
|
||||||
|
| Field | Owner | Updated when |
|
||||||
|
|-------|-------|--------------|
|
||||||
|
| `my_head` | local PE_IPCQ | immediately after `tl.send` (send tracking) |
|
||||||
|
| `my_tail` | local PE_IPCQ | immediately after `tl.recv` (recv tracking) |
|
||||||
|
| `peer_head_cache` | local PE_IPCQ | on `IpcqMetaArrival` (D9 piggyback) |
|
||||||
|
| `peer_tail_cache` | local PE_IPCQ | on `IpcqCreditMetadata` (D9 fast path) |
|
||||||
|
|
||||||
|
**Slot unit**: fixed-size, one slot holds one full tile (no descriptor
|
||||||
|
indirection). Full data embedded in the slot. See D5.
|
||||||
|
|
||||||
|
### D2.5. `IpcqEndpoint` schema
|
||||||
|
|
||||||
|
`IpcqQueuePair.peer` carries everything the sender needs to compute the
|
||||||
|
peer's rx slot address:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class IpcqEndpoint:
|
||||||
|
sip: int
|
||||||
|
cube: int
|
||||||
|
pe: int
|
||||||
|
buffer_kind: str # "tcm" | "hbm" | "sram"
|
||||||
|
rx_base_pa: int # peer rx_buffer base PA (PhysAddr.encode())
|
||||||
|
rx_base_va: int # peer rx_buffer base VA (optional, MMU mode)
|
||||||
|
n_slots: int # peer ring depth (for wrap-around)
|
||||||
|
slot_size: int # peer slot size (for offset)
|
||||||
|
```
|
||||||
|
|
||||||
|
Address computation:
|
||||||
|
|
||||||
|
```python
|
||||||
|
slot_idx = self.my_head % peer.n_slots
|
||||||
|
dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size
|
||||||
|
```
|
||||||
|
|
||||||
|
PE_IPCQ passes `dst_pa` to PE_DMA inside an `IpcqDmaToken`. PE_DMA
|
||||||
|
(vc_comm) routes the data to `dst_pa` through the fabric.
|
||||||
|
|
||||||
|
**Endpoint construction order**: at backend init (D10), the IPCQ
|
||||||
|
buffers for **every PE** are allocated first (so each rank knows the
|
||||||
|
others' PA), then the per-rank neighbor tables are built and pushed to
|
||||||
|
PE_IPCQ via `IpcqInitMsg`.
|
||||||
|
|
||||||
|
### D3. Four-direction mapping ≡ logical ProcessGroup
|
||||||
|
|
||||||
|
The PE views four directions (N/S/E/W) as logical ports. Real peer
|
||||||
|
addresses are configured by the host CCL init, per the chosen
|
||||||
|
algorithm. The PE kernel never knows the topology, only directions.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 1D ring
|
||||||
|
for rank in range(world_size):
|
||||||
|
ipcq_set_neighbor(rank, "E", peer=ranks[(rank + 1) % world_size])
|
||||||
|
ipcq_set_neighbor(rank, "W", peer=ranks[(rank - 1) % world_size])
|
||||||
|
|
||||||
|
# 2D mesh
|
||||||
|
for r in range(R):
|
||||||
|
for c in range(C):
|
||||||
|
ipcq_set_neighbor((r, c), "N", peer=((r - 1) % R, c))
|
||||||
|
ipcq_set_neighbor((r, c), "S", peer=((r + 1) % R, c))
|
||||||
|
ipcq_set_neighbor((r, c), "E", peer=(r, (c + 1) % C))
|
||||||
|
ipcq_set_neighbor((r, c), "W", peer=(r, (c - 1) % C))
|
||||||
|
```
|
||||||
|
|
||||||
|
The PE code does not need to know where `tl.send(dir="E", ...)` actually
|
||||||
|
ends up.
|
||||||
|
|
||||||
|
### D4. PE kernel API
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Send (blocking; may stall on backpressure)
|
||||||
|
tl.send(dir: str, src=TensorHandle)
|
||||||
|
tl.send(dir: str, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)
|
||||||
|
|
||||||
|
# Recv (blocking)
|
||||||
|
recv = tl.recv(dir: str, shape=..., dtype=...)
|
||||||
|
recv = tl.recv(shape=..., dtype=...) # round-robin across 4 directions
|
||||||
|
|
||||||
|
# Recv (non-blocking)
|
||||||
|
fut = tl.recv_async(dir: str, shape=..., dtype=...)
|
||||||
|
recv = tl.wait(fut)
|
||||||
|
```
|
||||||
|
|
||||||
|
`tl.recv()` (no direction) keeps a `last_polled_dir` cursor and on each
|
||||||
|
call rotates through directions, returning the first available slot.
|
||||||
|
If all 4 directions are empty, the call waits until data arrives.
|
||||||
|
|
||||||
|
**Fairness is weak**: the rotating start mitigates simple bias, but if
|
||||||
|
one direction always wins the race the others can starve. Algorithms
|
||||||
|
that need strict fairness must call `tl.recv(dir=...)` explicitly.
|
||||||
|
|
||||||
|
### D5. Single-hop DMA write + full-data slot model
|
||||||
|
|
||||||
|
Data moves from sender memory into the receiver's ring slot in **one
|
||||||
|
DMA transfer**. Key properties:
|
||||||
|
|
||||||
|
- **Single-hop**: the sender already knows the peer rx slot address and
|
||||||
|
fires one fabric DMA into it.
|
||||||
|
- **No CPU memcpy**: the CPU never copies data.
|
||||||
|
- **No intermediate staging**: neither side keeps a separate staging
|
||||||
|
buffer (sender uses the source addr directly; receiver gets the data
|
||||||
|
in its ring slot directly).
|
||||||
|
|
||||||
|
(Strictly speaking the fabric DMA write does happen, so this is not
|
||||||
|
literally "no data movement" — it's the same property NCCL labels
|
||||||
|
"zero-copy", meaning no CPU memcpy and no staging copy.)
|
||||||
|
|
||||||
|
```
|
||||||
|
PE A: tl.send(E, src_addr, nbytes)
|
||||||
|
1. IPCQ computes the peer rx slot address:
|
||||||
|
dst_addr = peer.rx_base_pa + (my_head % peer.n_slots) * peer.slot_size
|
||||||
|
2. Backpressure: my_head - peer_tail_cache < peer.n_slots ?
|
||||||
|
(full → sleep / poll)
|
||||||
|
3. Submit DMA on PE_DMA(vc_comm): src_addr → peer dst_addr, nbytes
|
||||||
|
4. my_head += 1
|
||||||
|
|
||||||
|
PE B: data = tl.recv(W)
|
||||||
|
1. Look at rx_buffer[my_tail % n_slots]
|
||||||
|
2. Wait for the data to arrive (D7 backpressure mode)
|
||||||
|
3. Return the slot address to the kernel (or fetch into register file)
|
||||||
|
4. my_tail += 1
|
||||||
|
5. Issue a credit return on the fast path (D9): after the bottleneck-BW
|
||||||
|
latency, peer A's peer_tail_cache is updated.
|
||||||
|
```
|
||||||
|
|
||||||
|
The slot holds the full tile. The receiver only reads its own
|
||||||
|
rx_buffer; it never reads back into A's memory. The sender knows the
|
||||||
|
peer rx slot address and DMAs directly into it (single-hop).
|
||||||
|
|
||||||
|
The PE's own PE_TCM read/write does not go through DMA (PE_TCM is local
|
||||||
|
to the PE).
|
||||||
|
|
||||||
|
### D6. Buffer placement — three-way benchmark
|
||||||
|
|
||||||
|
The host CCL init picks the IPCQ ring-buffer location:
|
||||||
|
|
||||||
|
```python
|
||||||
|
ipcq_init(
|
||||||
|
backend="ahbm",
|
||||||
|
buffer_kind="tcm" | "hbm" | "sram",
|
||||||
|
n_slots=8,
|
||||||
|
slot_size=4096,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
| Location | Trait | Trade-off |
|
||||||
|
|----------|-------|-----------|
|
||||||
|
| **PE_TCM** | Attached to the PE; fast | Small; competes with PE-internal resources |
|
||||||
|
| **PE-local HBM** | Large; via DMA | Higher latency |
|
||||||
|
| **Cube SRAM** | Mid-size; cube-shared | Cube-internal contention |
|
||||||
|
|
||||||
|
All three locations run the same kernel code; only the init differs.
|
||||||
|
|
||||||
|
### D7. Backpressure — two-mode benchmark
|
||||||
|
|
||||||
|
How the sender or receiver waits when peer slots are full / data has not
|
||||||
|
yet arrived:
|
||||||
|
|
||||||
|
| Mode | Behavior | Model |
|
||||||
|
|------|----------|-------|
|
||||||
|
| **poll** | Periodically re-check the cached peer pointer | Spin loop |
|
||||||
|
| **sleep** | Yield a SimPy event; wake on a peer-trigger | Interrupt-like |
|
||||||
|
|
||||||
|
```python
|
||||||
|
ipcq_init(backpressure="poll" | "sleep", ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
Both modes are implemented so latency / throughput trade-offs can be
|
||||||
|
benchmarked.
|
||||||
|
|
||||||
|
### D8. PE_DMA virtual channels
|
||||||
|
|
||||||
|
Extend PE_DMA from a single queue into a **two-channel virtual-channel**
|
||||||
|
model.
|
||||||
|
|
||||||
|
```
|
||||||
|
PE_DMA
|
||||||
|
├── vc_compute: tile load / store / writeback for GEMM and Math
|
||||||
|
└── vc_comm: IPCQ send data
|
||||||
|
```
|
||||||
|
|
||||||
|
Each VC has an independent state machine:
|
||||||
|
|
||||||
|
- One channel stalling does not block the other.
|
||||||
|
- The same physical link (cube_noc, UCIe, …) is shared, but link BW is
|
||||||
|
split between channels.
|
||||||
|
|
||||||
|
**Chunk-level interleave**:
|
||||||
|
|
||||||
|
- Large GEMM tile DMAs do not lock the link end-to-end.
|
||||||
|
- Progress happens in chunks (e.g. 256 B); each chunk shares link BW
|
||||||
|
with the other VC's pending chunks.
|
||||||
|
- Chunk size is an init parameter (smaller = fairer, larger = more
|
||||||
|
efficient).
|
||||||
|
|
||||||
|
Net effect:
|
||||||
|
|
||||||
|
- HoL blocking is eliminated (an IPCQ send can interleave with a long
|
||||||
|
compute DMA).
|
||||||
|
- Compute / comm overlap is natural (NVIDIA copy-engine + compute-SM
|
||||||
|
pattern).
|
||||||
|
- Matches the NoC-virtual-channel pattern used in real HW.
|
||||||
|
|
||||||
|
**First-implementation accuracy limit (intentional)**: this ADR's
|
||||||
|
first cut uses **deterministic chunk-level interleave + weighted
|
||||||
|
round-robin arbitration** (default 50 / 50, exposed in `ccl.yaml`).
|
||||||
|
This is a first-order approximation and is simpler than real HW
|
||||||
|
dynamic-contention / credit-based arbiters. Functional correctness is
|
||||||
|
unaffected, but heavy-contention scenarios may report slightly
|
||||||
|
optimistic latency vs. real HW. A separate ADR can add a NoC arbiter
|
||||||
|
component later if more precision is needed.
|
||||||
|
|
||||||
|
#### Token routing
|
||||||
|
|
||||||
|
- Compute tokens (`TileToken`) — go through the existing
|
||||||
|
PE_FETCH_STORE → PE_DMA chain.
|
||||||
|
- Communication tokens (`IpcqDmaToken`, new) — PE_IPCQ → PE_DMA
|
||||||
|
self-routing.
|
||||||
|
- PE_DMA picks the channel by token type.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PeDmaComponent:
|
||||||
|
def _process(self, env, token):
|
||||||
|
if isinstance(token, IpcqDmaToken):
|
||||||
|
yield from self._vc_comm_process(env, token)
|
||||||
|
else:
|
||||||
|
yield from self._vc_compute_process(env, token)
|
||||||
|
```
|
||||||
|
|
||||||
|
### D9. Pointer synchronization — DMA payload piggyback
|
||||||
|
|
||||||
|
Real HW (NVLink, UCIe, etc.) piggybacks metadata onto DMA payloads so
|
||||||
|
pointers update along with the data. This simulation adopts the same
|
||||||
|
model: **no separate control channel** — metadata travels with the
|
||||||
|
data.
|
||||||
|
|
||||||
|
The big benefits:
|
||||||
|
|
||||||
|
- **Automatic ordering**: data and metadata move on the same token, so
|
||||||
|
data is visible **before** the `peer_head_cache` update. No race.
|
||||||
|
- **HW fidelity**: matches NVLink / UCIe piggybacked headers.
|
||||||
|
- **Component simplification**: no separate `IpcqPtrUpdate` event type.
|
||||||
|
|
||||||
|
#### Send flow (head update via piggyback)
|
||||||
|
|
||||||
|
```
|
||||||
|
PE A: tl.send(E, src_addr, nbytes)
|
||||||
|
1. PE_IPCQ checks backpressure (using peer_tail_cache)
|
||||||
|
2. PE_IPCQ creates an IpcqDmaToken:
|
||||||
|
- data body (src_addr → peer dst_addr)
|
||||||
|
- piggyback metadata: (sender_seq, src_sip/cube/pe, src_direction)
|
||||||
|
3. Hand the token to PE_DMA(vc_comm)
|
||||||
|
4. PE A increments my_head (send tracking)
|
||||||
|
|
||||||
|
[fabric DMA: latency elapses]
|
||||||
|
|
||||||
|
PE B's PE_DMA receives the token
|
||||||
|
5. Writes data into dst_addr (B's rx slot) via MemoryStore.write
|
||||||
|
6. Forwards token metadata to PE B's PE_IPCQ (PE-internal wire, ~1 cycle)
|
||||||
|
|
||||||
|
PE B's PE_IPCQ receives the metadata
|
||||||
|
7. Updates peer_head_cache (= A's head)
|
||||||
|
8. Wakes any pending recv on that direction
|
||||||
|
```
|
||||||
|
|
||||||
|
**Steps 5 and 6 must execute in the same SimPy step** — DMA completion
|
||||||
|
makes data and metadata atomically visible.
|
||||||
|
|
||||||
|
#### Recv flow (credit return — fast path with bottleneck-BW latency)
|
||||||
|
|
||||||
|
When the receiver frees a slot, the sender must learn about it
|
||||||
|
(backpressure release). Unlike data, the credit return does **not**
|
||||||
|
travel through general vc_comm fabric — it uses a **separate fast
|
||||||
|
path**, an abstraction of the NVLink / UCIe credit-return wire.
|
||||||
|
|
||||||
|
**Latency** is computed from the **bottleneck BW on the path**, not a
|
||||||
|
magic constant:
|
||||||
|
|
||||||
|
```
|
||||||
|
credit_size_bytes = 16 (ccl.yaml: ipcq_credit_size_bytes)
|
||||||
|
path = router.find_path(self_pe, peer_pe)
|
||||||
|
latency = compute_drain_ns(path, credit_size_bytes)
|
||||||
|
= credit_size_bytes / bottleneck_bw_on_path
|
||||||
|
```
|
||||||
|
|
||||||
|
That gives us:
|
||||||
|
|
||||||
|
- **Topology-proportional approximation**: an in-cube credit return is
|
||||||
|
automatically faster than a cross-SIP credit return.
|
||||||
|
- **No magic constants**: no arbitrary `ipcq_ctrl_latency_ns`.
|
||||||
|
- **No deadlock risk**: unlike piggyback, B can issue credit even when
|
||||||
|
it has no data to send back.
|
||||||
|
- **Reuses existing utility**: `ComponentContext.compute_drain_ns`.
|
||||||
|
|
||||||
|
#### Component coupling — SimPy Store channel
|
||||||
|
|
||||||
|
PE B's PE_IPCQ does not call PE A's PE_IPCQ directly. Instead, at init
|
||||||
|
time, **a SimPy Store is wired between the two** (a per-direction
|
||||||
|
fast-path channel) and credit metadata is `put` into that store.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PeIpcqComponent:
|
||||||
|
def _delayed_credit_send(self, env, peer_credit_store, my_tail, latency_ns):
|
||||||
|
yield env.timeout(latency_ns)
|
||||||
|
yield peer_credit_store.put(IpcqCreditMetadata(seq=my_tail, ...))
|
||||||
|
```
|
||||||
|
|
||||||
|
Backend init wires both directions of the fast-path channel as part of
|
||||||
|
fan-out (see `IpcqInitMsg` in D12).
|
||||||
|
|
||||||
|
#### Credit-return fast path limitations
|
||||||
|
|
||||||
|
- `credit_size_bytes` is an estimate (typically 16–64 bytes).
|
||||||
|
- The fast path is **excluded from vc_comm BW contention** (separate
|
||||||
|
wire). Real HW credit-return wires are very lightweight, so this is a
|
||||||
|
reasonable first approximation.
|
||||||
|
- A follow-up ADR can either model the credit fast path as a separate link
|
||||||
|
(BW limit + contention), or switch to piggyback (`credit_return_mode:
|
||||||
|
piggyback`).
|
||||||
|
|
||||||
|
#### PE_DMA's added responsibility
|
||||||
|
|
||||||
|
When `vc_comm` receives a token, PE_DMA processes it as the following
|
||||||
|
**atomic** sequence. **No SimPy yield is allowed between the two steps**
|
||||||
|
(invariant I6):
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _on_vc_comm_recv(self, env, token):
|
||||||
|
# ── ATOMIC: no yield between these two operations ──
|
||||||
|
data = self._memory_store.read(token.src_space, token.src_addr,
|
||||||
|
shape=..., dtype=...)
|
||||||
|
self._memory_store.write(token.dst_endpoint.buffer_kind,
|
||||||
|
token.dst_addr, data)
|
||||||
|
# 2. Forward metadata to the local PE_IPCQ
|
||||||
|
yield self.out_ports[self._ipcq_id].put(IpcqMetaArrival(token=token))
|
||||||
|
# ───────────────────────────────────────────────────
|
||||||
|
```
|
||||||
|
|
||||||
|
The final `put` is yieldable but uses an unbounded internal store, so
|
||||||
|
it completes in a single step. That `put` is the closing call of the
|
||||||
|
atomic block; nothing that yields may be inserted between the write and this `put`.
|
||||||
|
|
||||||
|
### D9.5. ADR-0020 (2-pass) integration
|
||||||
|
|
||||||
|
`tl.send` / `tl.recv` integrate with ADR-0020's two-pass model. Phase
|
||||||
|
1 simulates timing **and** moves data via MemoryStore; Phase 2 enables
|
||||||
|
op-log-based correctness verification.
|
||||||
|
|
||||||
|
#### Phase 1 (timing + data)
|
||||||
|
|
||||||
|
D9 models head and tail updates with two different mechanisms:
|
||||||
|
|
||||||
|
- **Send-side (head update)** — DMA payload piggyback. Data write and
|
||||||
|
metadata forward happen in the same SimPy step → automatic atomic
|
||||||
|
visibility.
|
||||||
|
- **Recv-side (tail credit return)** — fast-path SimPy Store channel
|
||||||
|
with bottleneck-BW latency, then `peer_tail_cache` update.
|
||||||
|
|
||||||
|
Together they preserve ring-buffer pointer consistency.
|
||||||
|
|
||||||
|
The op-log records `op_kind="ipcq"` entries for sends (with
|
||||||
|
`src/dst/space/addr/nbytes/dir/dtype/shape/sender_seq`) and recvs (with
|
||||||
|
`recv_mode/src/dst/space/addr/nbytes/dir/dtype/shape/consumer_seq`).
|
||||||
|
Two recv modes:
|
||||||
|
|
||||||
|
- **`return_slot`** (default): the slot address is returned to the
|
||||||
|
kernel. Zero-copy.
|
||||||
|
- **`copy_to_dst`**: when the kernel passes `dst_addr` + `dst_space`,
|
||||||
|
PE_IPCQ copies the slot data into the user dst.
|
||||||
|
|
||||||
|
#### Phase 2 (op_log replay)
|
||||||
|
|
||||||
|
When `DataExecutor` encounters an `op_kind="ipcq"` record:
|
||||||
|
|
||||||
|
- **send**: idempotent `src → dst` ndarray write.
|
||||||
|
- **recv (`return_slot`)**: no-op (the slot already holds the data).
|
||||||
|
- **recv (`copy_to_dst`)**: idempotent `slot → dst_addr` copy.
|
||||||
|
|
||||||
|
IPCQ ops are pure data movement — Phase 2 has nothing extra to compute.
|
||||||
|
The downstream GEMM / Math ops in `DataExecutor` will consume the data
|
||||||
|
and naturally validate correctness.
|
||||||
|
|
||||||
|
### D10. Host CCL init keeps the PyTorch shape
|
||||||
|
|
||||||
|
The host code looks just like real PyTorch DDP. `init_process_group`
|
||||||
|
creates the backend object; it does **not** receive IPCQ knobs
|
||||||
|
(neighbor topology, buffer_kind, backpressure …).
|
||||||
|
|
||||||
|
```python
|
||||||
|
# benches/ccl_allreduce.py — same shape as real PyTorch
|
||||||
|
def worker(rank, world_size, torch):
|
||||||
|
dist = torch.distributed
|
||||||
|
dist.init_process_group(backend="ahbm") # reads ccl.yaml + topology
|
||||||
|
tensor = torch.zeros((1, world_size * N_ELEM), dtype="f16", dp=...)
|
||||||
|
tensor.copy_(torch.from_numpy(init))
|
||||||
|
dist.all_reduce(tensor, op="sum")
|
||||||
|
```
|
||||||
|
|
||||||
|
The IPCQ configuration is decided by the backend at
|
||||||
|
`init_process_group` time: it loads `ccl.yaml`, picks the algorithm,
|
||||||
|
and pushes IPCQ neighbor tables to every participating PE_IPCQ. The
|
||||||
|
host code never has to know about IPCQ.
|
||||||
|
|
||||||
|
A bench runs one algorithm, chosen via `ccl.yaml`'s `defaults.algorithm`.
|
||||||
|
Switching algorithms is purely a `ccl.yaml` change — no host edits
|
||||||
|
required.
|
||||||
|
|
||||||
|
#### Init flow (eager)
|
||||||
|
|
||||||
|
1. `init_process_group(backend="ahbm")` is called.
|
||||||
|
2. Backend loads `ccl.yaml` → resolves `defaults.algorithm`.
|
||||||
|
3. Pulls topology + buffer_kind + backpressure + slot config from
|
||||||
|
`algorithms[<algo>]`.
|
||||||
|
4. **Immediately** installs neighbor tables on every PE_IPCQ
|
||||||
|
(sideband or fabric `IpcqInitMsg`).
|
||||||
|
5. Subsequent `torch.launch(kernel_name, ...)` calls behave normally —
|
||||||
|
PE_IPCQ is already prepared whether the kernel is a CCL kernel or
|
||||||
|
not.
|
||||||
|
|
||||||
|
### D11. CCL config file (`ccl.yaml`)
|
||||||
|
|
||||||
|
IPCQ config and algorithm metadata live in a separate YAML file,
|
||||||
|
following the same pattern as `components.yaml` and `topology.yaml`.
|
||||||
|
|
||||||
|
A single benchmark execution runs one algorithm
|
||||||
|
(`defaults.algorithm`). Switching algorithms means editing
|
||||||
|
`defaults.algorithm` only.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
defaults:
|
||||||
|
algorithm: ring_allreduce_tcm
|
||||||
|
buffer_kind: tcm # tcm | hbm | sram
|
||||||
|
backpressure: sleep # poll | sleep
|
||||||
|
n_slots: 8
|
||||||
|
slot_size: 4096
|
||||||
|
vc_chunk_size: 256
|
||||||
|
ipcq_credit_size_bytes: 16
|
||||||
|
|
||||||
|
algorithms:
|
||||||
|
ring_allreduce_tcm:
|
||||||
|
module: kernbench.ccl.algorithms.ring_allreduce
|
||||||
|
topology: ring_1d # builtin name or "custom"
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 8 # optional, per-algorithm tile width
|
||||||
|
|
||||||
|
tree_allreduce_7:
|
||||||
|
module: kernbench.ccl.algorithms.tree_allreduce
|
||||||
|
topology: tree_binary
|
||||||
|
buffer_kind: tcm
|
||||||
|
world_size: 7 # algorithm-level override
|
||||||
|
n_elem: 16
|
||||||
|
|
||||||
|
custom_mesh:
|
||||||
|
module: kernbench.ccl.algorithms.custom_mesh
|
||||||
|
topology: custom # the module supplies its own neighbors()
|
||||||
|
```
|
||||||
|
|
||||||
|
`world_size` is **not set in `defaults`**. The backend resolves it via:
|
||||||
|
`algorithm-level override > defaults override > topology spec`. The
|
||||||
|
last fallback (`sips × cubes_per_sip × pes_per_cube`) mirrors real DDP
|
||||||
|
where `WORLD_SIZE` comes from env vars rather than config files.
|
||||||
|
|
||||||
|
#### Algorithm module structure
|
||||||
|
|
||||||
|
Each algorithm module exports two hooks — `kernel` (required) and
|
||||||
|
`neighbors` (optional) — plus a `kernel_args` helper that the
|
||||||
|
backend uses to populate positional kernel arguments at `all_reduce`
|
||||||
|
time:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/algorithms/ring_allreduce.py
|
||||||
|
|
||||||
|
def kernel_args(world_size: int, n_elem: int) -> tuple:
|
||||||
|
return (n_elem, world_size)
|
||||||
|
|
||||||
|
|
||||||
|
def kernel(t_ptr, n_elem, world_size, tl):
|
||||||
|
"""Required — the PE kernel.
|
||||||
|
|
||||||
|
IPCQ is already installed by the backend before this is called.
|
||||||
|
The kernel only uses the four-direction send / recv API.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
"""Optional — override the builtin topology's neighbor map.
|
||||||
|
|
||||||
|
Returns a new dict, the modified-in-place dict, or None to keep the
|
||||||
|
builtin map.
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `neighbors` override patterns
|
||||||
|
|
||||||
|
- **Pattern A — tweak a builtin**: drop a direction for some ranks, etc.
|
||||||
|
- **Pattern B — replace entirely**: ignore `neighbor_map` and return a
|
||||||
|
brand-new dict.
|
||||||
|
- **Pattern C — keep builtin**: omit `neighbors` or return None.
|
||||||
|
|
||||||
|
#### Builtin topologies
|
||||||
|
|
||||||
|
| topology | direction set |
|
||||||
|
|----------|---------------|
|
||||||
|
| `ring_1d` | E, W |
|
||||||
|
| `ring_1d_unidir` | E only |
|
||||||
|
| `mesh_2d` | N, S, E, W |
|
||||||
|
| `tree_binary` | parent, child_left, child_right |
|
||||||
|
| `none` | (empty) — algorithm must supply `neighbors()` |
|
||||||
|
|
||||||
|
#### Adding a new algorithm
|
||||||
|
|
||||||
|
1. Write `kernel` and `kernel_args` in
|
||||||
|
`src/kernbench/ccl/algorithms/<algo>.py`.
|
||||||
|
2. Add an entry in `ccl.yaml`'s `algorithms` section.
|
||||||
|
3. (Optional) provide `neighbors()` for custom topology.
|
||||||
|
4. Set `defaults.algorithm` to the new algorithm.
|
||||||
|
|
||||||
|
The host bench (`benches/ccl_allreduce.py`) does not change.
|
||||||
|
|
||||||
|
### D12. Message / token schema
|
||||||
|
|
||||||
|
The new message types added by this ADR. They live in
|
||||||
|
`src/kernbench/common/pe_commands.py` and
|
||||||
|
`src/kernbench/runtime_api/kernel.py`.
|
||||||
|
|
||||||
|
#### `IpcqInitMsg` (sideband, fan-out at init)
|
||||||
|
|
||||||
|
The backend pushes neighbor tables to every PE_IPCQ. Structure mirrors
|
||||||
|
`MmuMapMsg` (`target_sips`, `target_cubes`, `target_pe`, `entries`).
|
||||||
|
Each `IpcqInitEntry` has `direction`, `peer: IpcqEndpoint`,
|
||||||
|
`my_rx_base_pa/va`, `n_slots`, `slot_size`, plus a `peer_credit_store`
|
||||||
|
field — a `simpy.Store` instance pre-wired so the sender PE_IPCQ can
|
||||||
|
push `IpcqCreditMetadata` directly into the receiver's input queue.
|
||||||
|
|
||||||
|
#### `IpcqSendCmd` (PE_CPU → PE_IPCQ)
|
||||||
|
|
||||||
|
Carries `direction`, source addr/space, nbytes, shape, dtype, and a
|
||||||
|
handle id. `data_op=True` so it lands in the op_log.
|
||||||
|
|
||||||
|
#### `IpcqRecvCmd` (PE_CPU → PE_IPCQ)
|
||||||
|
|
||||||
|
Carries `direction` (or `None` for round-robin), `recv_mode`
|
||||||
|
(`return_slot` / `copy_to_dst`), optional `dst_addr/dst_space`, shape,
|
||||||
|
dtype, blocking flag.
|
||||||
|
|
||||||
|
#### `IpcqDmaToken` (PE_IPCQ → PE_DMA, vc_comm channel)
|
||||||
|
|
||||||
|
Per D9 piggyback: the token carries the data (`src/dst/space/nbytes`)
|
||||||
|
plus the head metadata (`sender_seq`, `src_sip/cube/pe`,
|
||||||
|
`src_direction`). PE_DMA picks the channel by token type
|
||||||
|
(`IpcqDmaToken → vc_comm`, `TileToken → vc_compute`).
|
||||||
|
|
||||||
|
The receiver's PE_DMA, on token arrival, performs the I6 atomic
|
||||||
|
sequence: write data into MemoryStore, then forward `IpcqMetaArrival`
|
||||||
|
to the local PE_IPCQ.
|
||||||
|
|
||||||
|
#### `IpcqCreditMetadata` (PE_IPCQ → peer PE_IPCQ, fast path)
|
||||||
|
|
||||||
|
Carries `consumer_seq` (= my_tail), source PE coords, and source
|
||||||
|
direction. Travels through the dedicated SimPy Store channel rather
|
||||||
|
than `vc_comm`. Latency = `ipcq_credit_size_bytes / bottleneck_bw_on_path`.
|
||||||
|
|
||||||
|
There is **no `IpcqPtrUpdate` event** — head updates flow via D9
|
||||||
|
piggyback, tail updates via the D9 fast-path channel.
|
||||||
|
|
||||||
|
### D13. Test strategy
|
||||||
|
|
||||||
|
Following the ADR-0021 D8 pattern.
|
||||||
|
|
||||||
|
#### T1. Unit tests (component-level)
|
||||||
|
|
||||||
|
- **PE_IPCQ** (`tests/test_pe_ipcq.py`): send without backpressure
|
||||||
|
immediately forwards a token; full peer slot triggers backpressure
|
||||||
|
(poll / sleep modes); recv waits, wakes on `IpcqMetaArrival`;
|
||||||
|
round-robin recv weak fairness; bad direction → `IpcqInvalidDirection`.
|
||||||
|
- **PE_DMA virtual channels** (`tests/test_pe_dma_vc.py`): `vc_compute`
|
||||||
|
/ `vc_comm` independent progress, chunk interleave, BW split.
|
||||||
|
- **Builtin topology** (`tests/test_ccl_topologies.py`): ring_1d /
|
||||||
|
mesh_2d / tree_binary correctness, mesh_2d non-square →
|
||||||
|
`ValueError`, custom resolver returns the module's `neighbors`.
|
||||||
|
|
||||||
|
#### T2. Integration tests (E2E send/recv)
|
||||||
|
|
||||||
|
- **`tests/test_ipcq_e2e.py`**: 2-rank ring, 4-rank ring (bidirectional
|
||||||
|
no-deadlock), 4×4 mesh.
|
||||||
|
- **CCL kernel + 2-pass** (`tests/test_ipcq_2pass.py`): greenlet mode
|
||||||
|
records `ipcq` ops in op_log; DataExecutor produces correct
|
||||||
|
`out.data`.
|
||||||
|
|
||||||
|
#### T3. Backend init (`tests/test_ccl_backend_ipcq.py`)
|
||||||
|
|
||||||
|
`ccl.yaml` load, builtin topology → `IpcqInitMsg` fan-out, endpoint PA
|
||||||
|
consistency, per-`buffer_kind` allocation.
|
||||||
|
|
||||||
|
#### T4. Regression
|
||||||
|
|
||||||
|
All existing tests pass; ADR-0020 op_log / DataExecutor unaffected for
|
||||||
|
non-CCL benches.
|
||||||
|
|
||||||
|
#### T5. Performance / overhead
|
||||||
|
|
||||||
|
Single send/recv pair latency = (DMA latency) + (IPCQ overhead).
|
||||||
|
Should be close to a regular PE_DMA write of the same nbytes (IPCQ
|
||||||
|
overhead < 100 ns).
|
||||||
|
|
||||||
|
### D14. Invariants and failure modes
|
||||||
|
|
||||||
|
#### Invariants
|
||||||
|
|
||||||
|
I1. **Slot lifecycle exactly-once**: one send → exactly one recv.
|
||||||
|
I2. **Pointer monotonicity**: `my_head` / `my_tail` monotonically
|
||||||
|
non-decreasing; `sender_seq` strictly increasing.
|
||||||
|
I3. **Endpoint consistency**: if rank A's `direction=E` peer is rank
|
||||||
|
B, then rank B's reverse-direction peer must be rank A. Verified at
|
||||||
|
init.
|
||||||
|
I4. **`buffer_kind` consistency**: all PEs in a process group share
|
||||||
|
the same `buffer_kind` (no mixed mode in the first cut).
|
||||||
|
I5. **op_log ordering**: send → DMA complete → recv possible. The
|
||||||
|
t_start order in op_log respects this causality.
|
||||||
|
I6. **Atomic data + metadata visibility (MUST)**: at the receiver
|
||||||
|
side, data write (`MemoryStore.write`) and metadata forward
|
||||||
|
(`peer_head_cache` update) **must execute in the same SimPy step**.
|
||||||
|
No yield is allowed between the two operations in PE_DMA's vc_comm
|
||||||
|
handler. Code review must reject any inserted `yield` (or `yield
|
||||||
|
from`) — it would create a race where head_cache becomes visible
|
||||||
|
before or after the data.
|
||||||
|
I7. **MemoryStore slot existence ↔ pointer**: as a consequence of I6,
|
||||||
|
the step in which `peer_head_cache > my_tail` becomes true is the
|
||||||
|
same step in which the slot data is observable.
|
||||||
|
|
||||||
|
#### Failure modes (runtime errors)
|
||||||
|
|
||||||
|
F1. **Bad direction**: `tl.send(dir="X")` for an uninstalled direction
|
||||||
|
→ `IpcqInvalidDirection`, simulation aborts.
|
||||||
|
F2. **Type mismatch**: dtype/shape/nbytes disagreement between matched
|
||||||
|
send and recv. Not validated by default; opt-in strict mode catches
|
||||||
|
it (`strict_validation: true` on a PE_IPCQ node attrs).
|
||||||
|
F3. **Deadlock detection (timeout-based)**: the simulator empties its
|
||||||
|
schedule while a send/recv is still pending → engine raises
|
||||||
|
`IpcqDeadlock` and embeds a pointer dump.
|
||||||
|
F4. **Backend init failure**: missing `defaults.algorithm`, missing
|
||||||
|
`algorithms[name]`, module import failure, topology validation
|
||||||
|
failure (I3, I4) — all raised at `init_process_group` time.
|
||||||
|
F5. **Slot full + infinite backpressure**: the peer never recvs.
|
||||||
|
Surfaces as F3 timeout.
|
||||||
|
|
||||||
|
#### Diagnostics
|
||||||
|
|
||||||
|
- **CCL trace**: `KERNBENCH_CCL_TRACE=1` logs each send/recv as
|
||||||
|
`(rank, t, dir, nbytes)`.
|
||||||
|
- **Pointer dump**: `kernbench.ccl.diagnostics.pointer_dump(engine)`
|
||||||
|
prints every PE_IPCQ ring buffer's `my_head`, `my_tail`,
|
||||||
|
`peer_head_cache`, `peer_tail_cache`.
|
||||||
|
- **Deadlock dump**: on hang the engine includes the pointer dump in
|
||||||
|
the `IpcqDeadlock` exception message.
|
||||||
|
|
||||||
|
### D15. Algorithm-author cheat sheet
|
||||||
|
|
||||||
|
Full step-by-step lives in
|
||||||
|
[`docs/ccl-author-guide.en.md`](../ccl-author-guide.en.md). The
|
||||||
|
shortest version:
|
||||||
|
|
||||||
|
| Things you touch | Things you don't |
|
||||||
|
|------------------|-------------------|
|
||||||
|
| `src/kernbench/ccl/algorithms/<your_algo>.py` (`kernel`, `kernel_args`, optional `neighbors`) | `benches/ccl_allreduce.py` host code |
|
||||||
|
| One entry in `ccl.yaml` + optionally `defaults.algorithm` | `src/kernbench/ccl/` framework |
|
||||||
|
| (Optional) `tests/test_<your_algo>.py` mock test | PE_IPCQ component, AhbmCCLBackend |
|
||||||
|
|
||||||
|
5-step flow: write the kernel → register in `ccl.yaml` → optional
|
||||||
|
`neighbors` override → optional mock unit test → SimPy validation via
|
||||||
|
`kernbench run --bench ccl_allreduce --verify-data`.
|
||||||
|
|
||||||
|
Common mistakes: using a direction that wasn't installed, sends
|
||||||
|
without matching recvs (deadlock), dtype/shape disagreement, assuming
|
||||||
|
fairness from `tl.recv()` round-robin, confusing
|
||||||
|
`tl.num_programs(axis)` with the CCL group size.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **Host collective**: a model where `dist.all_reduce` itself moves
|
||||||
|
data on the host side is out of scope. This ADR only covers
|
||||||
|
communication that happens inside the PE kernel.
|
||||||
|
- **All-reduce algorithms**: ring / tree / etc. live in algorithm
|
||||||
|
modules and can be added without amending this ADR.
|
||||||
|
- **Reliability / error handling**: link faults, send/recv failure
|
||||||
|
recovery, etc. are out of scope.
|
||||||
|
- **NoC arbiter precision**: dynamic VC contention is left for a future
|
||||||
|
ADR (see D8).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **VC arbitration accuracy** — the first cut uses deterministic
|
||||||
|
chunk interleave + weighted round-robin; heavy contention may report
|
||||||
|
optimistic latency. A NoC arbiter component can be added later.
|
||||||
|
- **Credit return BW model** — the fast path is currently outside the
|
||||||
|
fabric BW contention model. Can be modeled as a separate link or
|
||||||
|
switched to piggyback (`credit_return_mode: piggyback`).
|
||||||
|
- **Ring buffer slot allocation metadata** — whether the host pushes
|
||||||
|
IPCQ buffer metadata via sideband or via a fabric message similar to
|
||||||
|
`MmuMapMsg` is open.
|
||||||
|
- **VC BW split default** — 50/50 vs. weighted (e.g. 80/20). Exposed in
|
||||||
|
`ccl.yaml`; default value TBD.
|
||||||
|
- **Direction count** — 4 (N/S/E/W) is fixed in the first cut; 6
|
||||||
|
(with Up/Down for 3D) or N (variable) is future work.
|
||||||
|
- **Multi-tile aggregation primitives** — whether
|
||||||
|
`tl.recv_all` or similar is needed for fan-in.
|
||||||
|
- **Round-robin recv fairness** — current weak fairness can starve;
|
||||||
|
strict fairness counter is future work.
|
||||||
|
- **Deadlock detection precision** — currently timeout-based; a
|
||||||
|
realtime wait-for graph would enable deterministic detection.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- PE-to-PE direct communication enables CCL kernels to be written.
|
||||||
|
- Host stays minimal (just `launch`), synchronization happens inside
|
||||||
|
the PE → strong compute / comm overlap.
|
||||||
|
- VCs eliminate HoL blocking → collective latency is not blocked by
|
||||||
|
compute traffic.
|
||||||
|
- Buffer placement and backpressure mode are init-time parameters →
|
||||||
|
easy to benchmark.
|
||||||
|
- Four-direction logical neighbors → host is free to map
|
||||||
|
ring/mesh/tree algorithms.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- One new component (PE_IPCQ) and a redesigned PE_DMA (VCs).
|
||||||
|
- IPCQ memory cost = 8 rings × `slot_size` × `n_slots` per PE.
|
||||||
|
- VC arbitration is a first-order approximation; heavy contention
|
||||||
|
scenarios may report slightly optimistic latency vs real HW (D8).
|
||||||
|
- Chunk-level interleave makes PE_DMA implementation more complex.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `topology.yaml` | Add `pe_ipcq` to `pe_template`, plus the IPCQ ↔ DMA / CPU / TCM edges. |
|
||||||
|
| `components.yaml` | Register `pe_ipcq_v1`. |
|
||||||
|
| `src/kernbench/topology/builder.py` | Wire the IPCQ chain into PE-internal edges. |
|
||||||
|
| `src/kernbench/components/builtin/pe_ipcq.py` | New. |
|
||||||
|
| `src/kernbench/components/builtin/pe_dma.py` | Add VCs, handle `IpcqDmaToken`. |
|
||||||
|
| `src/kernbench/common/pe_commands.py` | `IpcqSendCmd`, `IpcqRecvCmd`, `IpcqDmaToken`. |
|
||||||
|
| `src/kernbench/triton_emu/tl_context.py` | `tl.send` / `tl.recv` API. |
|
||||||
|
| `src/kernbench/runtime_api/distributed.py` | Eager IPCQ install in `AhbmCCLBackend.__init__`. |
|
||||||
|
| `src/kernbench/runtime_api/kernel.py` | `IpcqInitMsg` definition. |
|
||||||
|
| `src/kernbench/ccl/__init__.py` | New CCL package. |
|
||||||
|
| `src/kernbench/ccl/topologies.py` | Builtin topology generators + `resolve_topology()`. |
|
||||||
|
| `src/kernbench/ccl/helpers.py` | Algorithm-author helpers (`chunked`, `ring_step`, `tree_step`). |
|
||||||
|
| `src/kernbench/ccl/testing.py` | Mock CCL runtime (`run_kernel_in_mock`). |
|
||||||
|
| `src/kernbench/ccl/algorithms/*.py` | Algorithm modules (kernel + `kernel_args` + optional `neighbors`). |
|
||||||
|
| `ccl.yaml` | Algorithm metadata + IPCQ defaults. |
|
||||||
|
| `tests/test_pe_ipcq.py` | PE_IPCQ unit tests. |
|
||||||
|
| `tests/test_pe_dma_vc.py` | PE_DMA VC tests. |
|
||||||
|
| `tests/test_ipcq_e2e.py` | end-to-end send/recv tests. |
|
||||||
|
| `tests/test_ccl_topologies.py` | Builtin topology generator tests. |
|
||||||
|
| `tests/test_ccl_allreduce_matrix.py` | Unified bench × algorithm matrix. |
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,990 @@
|
|||||||
|
# ADR-0024: SIP-level TP Launcher — rank = SIP (host-driven dispatch)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed (Revision 8 — Hierarchical content split out to ADR-0029)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
`torch.distributed` collective 호출의 참여 단위(rank)를 **SIP**(device)
|
||||||
|
경계에 맞춘다. 실제 PyTorch DDP/TP 스크립트와 **호스트 레벨에서 구분 없이**
|
||||||
|
읽히는 bench 코드를 목표로 한다.
|
||||||
|
|
||||||
|
real PyTorch와 비교:
|
||||||
|
|
||||||
|
| 차원 | real PyTorch | KernBench (이 ADR 이후) |
|
||||||
|
|---|---|---|
|
||||||
|
| 프로세스 모델 | N개 프로세스, 각 1 GPU | 1 프로세스, N greenlet, 각 1 SIP |
|
||||||
|
| `get_rank()` | `RANK` env var | greenlet-local 레지스트리 |
|
||||||
|
| `get_world_size()` | `WORLD_SIZE` env var | topology의 SIP 수 |
|
||||||
|
| `torch.cuda.set_device(r)` (real) / `torch.ahbm.set_device(r)` (KernBench) | rank → GPU | rank → SIP |
|
||||||
|
| `mp.spawn` | OS 프로세스 fork | greenlet fan-out |
|
||||||
|
|
||||||
|
### 설계 원칙 — 공개 API의 추상화, 내부는 기존 path 활용
|
||||||
|
|
||||||
|
**공개 API (bench worker) 수준의 추상화**:
|
||||||
|
```
|
||||||
|
rank = SIP
|
||||||
|
DPPolicy = intra-device (cube × PE) 분산만
|
||||||
|
dist.all_reduce, torch.ahbm.set_device, mp.spawn 등 PyTorch-style 표면
|
||||||
|
```
|
||||||
|
|
||||||
|
**Framework 내부 구현**:
|
||||||
|
```
|
||||||
|
build_install_plans (host): topology + mapper + algorithm → SipInstallPlan
|
||||||
|
↓
|
||||||
|
backend (host): plan의 per-PE spec을 engine.submit으로 IpcqInitMsg 디스패치
|
||||||
|
↓
|
||||||
|
engine: 기존 PE-scoped routing (MmuMapMsg 등과 동일 경로)
|
||||||
|
↓
|
||||||
|
PE_IPCQ: 자체 message loop에서 IpcqInitMsg 처리 (기존 capability)
|
||||||
|
```
|
||||||
|
|
||||||
|
**핵심**: 새 message 타입이나 IO_CPU 확장 없음. 기존 engine routing과 기존
|
||||||
|
`IpcqInitMsg` 타입을 그대로 사용. 기존의 "sideband direct call" 우회만
|
||||||
|
제거하여 convention 일원화.
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
- `DistributedContext` facade 존재
|
||||||
|
- `init_process_group("ahbm")` → `AhbmCCLBackend`가 `ctx.install_ipcq` 호출
|
||||||
|
→ `ccl/install.py`가 **sideband direct call** (`pe_ipcq._install_neighbors`)로
|
||||||
|
PE_IPCQ에 neighbor table 설치
|
||||||
|
- `get_rank()` 항상 `0` (single-driver)
|
||||||
|
- `get_world_size()` fallback: 총 PE 수 (rank = PE)
|
||||||
|
- `benches/ccl_allreduce.py`: `worker(rank=0, world_size=total_PEs)` 1회 호출
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **공개 API에서 rank = SIP** — bench worker가 PE 개념을 알지 않도록.
|
||||||
|
2. **Multi-worker 실행** — N개 rank가 독립 worker 코드 실행. 1 프로세스 제약
|
||||||
|
하에서 greenlet + barrier 동기화.
|
||||||
|
3. **Cross-rank collective submit 동기화** — 첫 rank가 혼자 wait하면 peer 부재로
|
||||||
|
SimPy deadlock. 모든 rank submit 후 drain 보장.
|
||||||
|
4. **기존 sideband install 제거** — IpcqInitMsg를 engine.submit으로 일원화.
|
||||||
|
MmuMapMsg 등 다른 control-plane 메시지와 동일 패턴.
|
||||||
|
5. **Algorithm / mapper / validator 분리** — 알고리즘 모듈은 kernel 코드만
|
||||||
|
담고, topology / mapping / validation은 registry + 선언.
|
||||||
|
|
||||||
|
### Non-problem (이 ADR 밖)
|
||||||
|
|
||||||
|
- IPCQ direction addressing fix → **ADR-0025**
|
||||||
|
- `DPPolicy.sip`/`num_sips` 제거 → **ADR-0026**
|
||||||
|
- Megatron-style TP → **ADR-0027**
|
||||||
|
- DTensor → **ADR-0028 (future)**
|
||||||
|
- **IO_CPU를 SIP-level control-plane 단일 endpoint로 승격**: 이 ADR에서는
|
||||||
|
invariant으로 채택하지 않음. 현재 KernBench에 해당 원칙이 없고, 단독으로
|
||||||
|
도입하기엔 정당화가 약함. 미래에 control-plane latency 모델링 정밀도 요구가
|
||||||
|
생기면 별도 ADR.
|
||||||
|
|
||||||
|
### TODO (이 ADR 구현 이후)
|
||||||
|
|
||||||
|
- Tensor Parallelism (ADR-0027)
|
||||||
|
- Hierarchical all-reduce 알고리즘 설계 (ADR-0029) — 본 ADR의 mapper /
|
||||||
|
validator registry 인프라를 활용하는 첫 사례
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. rank = SIP (world_size 해석)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _resolve_world_size(self) -> int:
|
||||||
|
if "world_size" in self._merged:
|
||||||
|
return int(self._merged["world_size"])
|
||||||
|
defaults = self._cfg_all.get("defaults", {})
|
||||||
|
if "world_size" in defaults:
|
||||||
|
return int(defaults["world_size"])
|
||||||
|
spec = self.ctx.spec or {}
|
||||||
|
return int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||||
|
```
|
||||||
|
|
||||||
|
우선순위: 알고리즘 override > defaults override > SIP count. `ccl.yaml`
|
||||||
|
override는 legacy "rank = PE" 테스트 경로로 유지.
|
||||||
|
|
||||||
|
### D2. Install 경로 — engine.submit 일원화
|
||||||
|
|
||||||
|
`ccl/install.py`의 sideband direct call을 제거하고, `IpcqInitMsg`를
|
||||||
|
`engine.submit`으로 보낸다. MmuMapMsg / MemoryWriteMsg 등이 이미 동일 패턴.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Backend (AhbmCCLBackend.__init__ 또는 init_process_group 시점)
|
||||||
|
from kernbench.ccl.install_plan import build_install_plans
|
||||||
|
|
||||||
|
plans = build_install_plans(
|
||||||
|
world_size=self._world_size,
|
||||||
|
algorithm=self._merged["algorithm"],
|
||||||
|
algorithm_config=self._merged,
|
||||||
|
spec=self.ctx.spec,
|
||||||
|
)
|
||||||
|
self._plans = plans
|
||||||
|
|
||||||
|
# Each PE_IPCQ가 자기 neighbor table을 받도록 engine 경유 submit
|
||||||
|
handles = []
|
||||||
|
for plan in plans:
|
||||||
|
for pe_install in plan.pe_installs:
|
||||||
|
h = self.ctx.submit(IpcqInitMsg(
|
||||||
|
correlation_id=self.ctx.correlation_id,
|
||||||
|
request_id=f"ipcq_init_s{plan.sip}c{pe_install.cube}p{pe_install.pe}",
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(pe_install.cube,),
|
||||||
|
target_pe=pe_install.pe,
|
||||||
|
entries=pe_install.neighbors,
|
||||||
|
buffer_kind=plan.buffer_kind,
|
||||||
|
n_slots=plan.n_slots,
|
||||||
|
slot_size=plan.slot_size,
|
||||||
|
# ... (기존 IpcqInitMsg 필드)
|
||||||
|
))
|
||||||
|
handles.append(h)
|
||||||
|
|
||||||
|
# Eager install — init_process_group이 반환하기 전에 완료 보장
|
||||||
|
for h in handles:
|
||||||
|
self.ctx.wait(h)
|
||||||
|
```
|
||||||
|
|
||||||
|
**PE_IPCQ 컴포넌트**는 이미 `IpcqInitMsg`를 main loop에서 처리 (`pe_ipcq.py`
|
||||||
|
라인 145-147). 변경 불필요. 유일한 차이는 "message가 sideband Python call이
|
||||||
|
아니라 engine queue를 거쳐 도착한다"는 점.
|
||||||
|
|
||||||
|
**Correctness invariant (equivalence)**: `init_process_group()`은 모든
|
||||||
|
install handle을 `wait()`한 후 반환하므로 launch-before-install 문제는
|
||||||
|
구조적으로 없다. 남는 correctness 질문은 단 하나:
|
||||||
|
|
||||||
|
> Engine-routed `IpcqInitMsg` 처리가 기존 sideband
|
||||||
|
> `pe_ipcq._install_neighbors(msg)` 호출과 **동일한 최종 PE_IPCQ 상태**를
|
||||||
|
> 생성하는가.
|
||||||
|
|
||||||
|
검증 포인트 (T3 참고):
|
||||||
|
|
||||||
|
1. **State equivalence**: `_install_neighbors()` 내부 상태 전이가 engine
|
||||||
|
dispatch path에서도 동일하게 일어나 최종 PE_IPCQ state
|
||||||
|
(`_queue_pairs`, `_installed`, `_credit_inbox` 등)가 일치.
|
||||||
|
|
||||||
|
2. **Sideband-only side effect 부재**: Sideband path에서만 있던 부수 효과가
|
||||||
|
없음 (예: engine.submit이 설정하는 request_id / correlation tracking 등이
|
||||||
|
install semantics를 왜곡하지 않음).
|
||||||
|
|
||||||
|
3. **Ordering independence**: 서로 다른 PE들의 install message가 engine
|
||||||
|
큐에서 임의 순서로 처리되어도 최종 상태가 동일. 즉 install은 **PE별
|
||||||
|
독립 연산**이어야 하고, cross-PE 순서 의존성이 있으면 안 됨.
|
||||||
|
|
||||||
|
4. **Idempotency**: 동일 PE에 대해 `IpcqInitMsg`가 두 번 도착하면? 현재
|
||||||
|
설계 전제는 "per-PE 단 한 번 install". 중복 install 시 동작은 정의되지
|
||||||
|
않음. 보수적 정책:
|
||||||
|
- 최초 install 시 `_installed = True`로 전이
|
||||||
|
- 이후 중복 install msg는 **에러** (raise) 또는 **silent idempotent**
|
||||||
|
(no-op) 둘 중 하나로 명시
|
||||||
|
- Recommend: **raise** (명시적 에러 → 버그 조기 검출). T3에 duplicate
|
||||||
|
install 케이스 추가.
|
||||||
|
|
||||||
|
5. **Partial install visibility**: 일부 PE만 install 완료된 중간 상태가
|
||||||
|
외부에 observable한가? 현재 구조에서는 `init_process_group()`의 eager
|
||||||
|
wait-all이 barrier 역할을 하므로 partial state는 bench 코드에 노출되지
|
||||||
|
않음. 단, debugging / introspection API는 중간 상태를 볼 수 있음 (문제
|
||||||
|
아님, 문서화만).
|
||||||
|
|
||||||
|
**Timing 영향**: Engine-routed install은 `init_process_group()`이 SimPy 시간을
|
||||||
|
소비하게 만든다. 기존 sideband install은 사실상 zero-cost. ADR 계약:
|
||||||
|
|
||||||
|
> Benchmarks must not rely on zero-cost initialization.
|
||||||
|
> `init_process_group()` consumes simulated time proportional to the number
|
||||||
|
> of participating PEs × per-PE install latency. First collective call
|
||||||
|
> starts at a well-defined but non-zero sim time.
|
||||||
|
|
||||||
|
### D3. Launch 경로 — non-CCL 커널과 동일 primitive
|
||||||
|
|
||||||
|
**CCL 커널은 non-CCL 커널과 동일한 `KernelLaunchMsg` submission path를 쓴다.**
|
||||||
|
Engine 내부의 IO_CPU/M_CPU transit 같은 것은 **기존 구현 세부이지 CCL-specific
|
||||||
|
장치가 아님**. Backend는 plan의 `participating_pes` 목록을 돌면서 `KernelLaunchMsg`를
|
||||||
|
submit할 뿐이다. 새 메시지 타입 없음, 새 라우팅 경로 없음.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# AhbmCCLBackend.all_reduce
|
||||||
|
def all_reduce(self, tensor, op="sum"):
|
||||||
|
if op != "sum":
|
||||||
|
raise NotImplementedError(...)
|
||||||
|
if tensor._handle is None or not tensor._handle.shards:
|
||||||
|
raise RuntimeError(...)
|
||||||
|
|
||||||
|
# Validator — global handle 기준 (D8)
|
||||||
|
validator_name = self._merged.get("validator")
|
||||||
|
if validator_name:
|
||||||
|
resolve_validator(validator_name)(tensor._handle, self._world_size, self.ctx.spec)
|
||||||
|
|
||||||
|
rank = self.ctx.distributed.get_rank()
|
||||||
|
plan = self._plans[rank]
|
||||||
|
tensor_view = _tensor_slice_for_sip(tensor._handle, plan.sip)
|
||||||
|
|
||||||
|
# Plan에서 kernel args 계산 (host-side)
|
||||||
|
import importlib
|
||||||
|
mod = importlib.import_module(plan.kernel_module)
|
||||||
|
n_elem = tensor_view.shards[0].nbytes // tensor.itemsize
|
||||||
|
kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
|
||||||
|
**plan.kernel_config)
|
||||||
|
|
||||||
|
def _submit():
|
||||||
|
out = []
|
||||||
|
for (cube, pe) in plan.participating_pes:
|
||||||
|
h = self.ctx.submit(KernelLaunchMsg(
|
||||||
|
correlation_id=self.ctx.correlation_id,
|
||||||
|
request_id=f"allreduce_r{rank}_c{cube}p{pe}",
|
||||||
|
kernel_ref=KernelRef(name=plan.algorithm_name, kind="builtin"),
|
||||||
|
args=(_tensor_arg_for_pe(tensor_view, cube, pe), *kargs),
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(cube,),
|
||||||
|
target_pe=pe,
|
||||||
|
))
|
||||||
|
out.append(h)
|
||||||
|
return out
|
||||||
|
|
||||||
|
self._barrier.submit_and_drain(self.ctx, rank, _submit)
|
||||||
|
```
|
||||||
|
|
||||||
|
### D4. Algorithm ABI — 얇게 + 명시적 arg 계약
|
||||||
|
|
||||||
|
각 알고리즘 모듈은 **kernel + kernel_args만 필수**.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/algorithms/ring_allreduce.py
|
||||||
|
def kernel(t_ptr, n_elem, world_size, tl):
|
||||||
|
"""PE-side kernel code.
|
||||||
|
|
||||||
|
Signature convention: first positional arg is the tensor pointer
|
||||||
|
(per-PE slice), subsequent positional args are whatever
|
||||||
|
kernel_args() returns. `tl` is injected by the TLContext runtime.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def kernel_args(*, n_elem: int, world_size: int, **kw) -> tuple:
|
||||||
|
"""Return the tuple of non-tensor positional args.
|
||||||
|
|
||||||
|
Signature contract:
|
||||||
|
- Called keyword-only with n_elem and world_size plus kernel_config.
|
||||||
|
- Returns a tuple (possibly empty) of scalar / metadata args.
|
||||||
|
- The backend constructs the final KernelLaunchMsg.args as:
|
||||||
|
(per_pe_tensor_arg, *kernel_args(...))
|
||||||
|
where per_pe_tensor_arg is a TensorArg containing only the shards
|
||||||
|
local to the receiving PE (derived from tensor_view).
|
||||||
|
"""
|
||||||
|
return (n_elem, world_size)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Arg assembly in backend (reference)**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# AhbmCCLBackend.all_reduce (D3에서 발췌)
|
||||||
|
kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
|
||||||
|
**plan.kernel_config)
|
||||||
|
for (cube, pe) in plan.participating_pes:
|
||||||
|
pe_tensor_arg = _tensor_arg_for_pe(tensor_view, cube, pe)
|
||||||
|
self.ctx.submit(KernelLaunchMsg(
|
||||||
|
args=(pe_tensor_arg, *kargs), # tensor first, then kernel_args return
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(cube,),
|
||||||
|
target_pe=pe,
|
||||||
|
...
|
||||||
|
))
|
||||||
|
```
|
||||||
|
|
||||||
|
**ccl.yaml**에서 선언적 metadata:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
algorithms:
|
||||||
|
ring_allreduce_tcm:
|
||||||
|
module: kernbench.ccl.algorithms.ring_allreduce
|
||||||
|
topology: ring_1d # kernbench/ccl/topologies.py
|
||||||
|
mapper: leader_only # kernbench/ccl/mappers.py (신규)
|
||||||
|
validator: single_shard_per_rank # kernbench/ccl/validators.py (신규)
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 8
|
||||||
|
```
|
||||||
|
|
||||||
|
- `topology` (필수)
|
||||||
|
- `mapper` (선택, default `"leader_only"`)
|
||||||
|
- `validator` (선택)
|
||||||
|
|
||||||
|
알고리즘 모듈 자체에는 mapper/validator/participating_pes/neighbor
|
||||||
|
생성기가 **들어가지 않음**.
|
||||||
|
|
||||||
|
### D5. Mapper + validator — registry key **또는** import path
|
||||||
|
|
||||||
|
Host-side framework가 built-in registry 제공. 커스텀 확장은 dot-import path.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/mappers.py (new)
|
||||||
|
Mapper = Callable[[dict, int], list[tuple[int, int]]]
|
||||||
|
|
||||||
|
def leader_only(spec, rank):
|
||||||
|
"""Single leader PE per SIP. Ring/tree/mesh용."""
|
||||||
|
return [(0, 0)]
|
||||||
|
|
||||||
|
def all_pes(spec, rank):
|
||||||
|
"""Every PE in the SIP. 알고리즘이 intra-SIP 전체 PE를 참여시킬 때 사용
|
||||||
|
(e.g. intra-SIP reduction, intra-SIP broadcast, hierarchical collective
|
||||||
|
의 낮은 레벨 등)."""
|
||||||
|
cm = spec["sip"]["cube_mesh"]
|
||||||
|
pl = spec["cube"]["pe_layout"]
|
||||||
|
n_cubes = cm["w"] * cm["h"]
|
||||||
|
n_pes = pl["pe_per_corner"] * len(pl["corners"])
|
||||||
|
return [(c, p) for c in range(n_cubes) for p in range(n_pes)]
|
||||||
|
|
||||||
|
MAPPER_REGISTRY = {"leader_only": leader_only, "all_pes": all_pes}
|
||||||
|
|
||||||
|
def resolve_mapper(key_or_path: str) -> Mapper:
|
||||||
|
if key_or_path in MAPPER_REGISTRY:
|
||||||
|
return MAPPER_REGISTRY[key_or_path]
|
||||||
|
if "." in key_or_path:
|
||||||
|
import importlib
|
||||||
|
mod_path, fn_name = key_or_path.rsplit(".", 1)
|
||||||
|
return getattr(importlib.import_module(mod_path), fn_name)
|
||||||
|
raise ValueError(f"unknown mapper: {key_or_path!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Validator도 동일 패턴 (`src/kernbench/ccl/validators.py`). 입력은 **global
|
||||||
|
TensorHandle** (D8 참고).
|
||||||
|
|
||||||
|
### D6. Host-side install plan builder
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/install_plan.py (new; 기존 install.py의 재구성)
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Mapping
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class NeighborTableEntry:
|
||||||
|
direction: str
|
||||||
|
peer_direction: str # ADR-0025
|
||||||
|
peer_sip: int
|
||||||
|
peer_cube: int
|
||||||
|
peer_pe: int
|
||||||
|
rx_base_pa: int
|
||||||
|
# ... 기타 IPCQ 설정 ...
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PeInstallSpec:
|
||||||
|
cube: int
|
||||||
|
pe: int
|
||||||
|
neighbors: tuple[NeighborTableEntry, ...]
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class SipInstallPlan:
|
||||||
|
algorithm_name: str # human-readable ("ring_allreduce_tcm")
|
||||||
|
sip: int
|
||||||
|
rank: int
|
||||||
|
world_size: int
|
||||||
|
pe_installs: tuple[PeInstallSpec, ...] # per-PE neighbor tables
|
||||||
|
buffer_kind: str
|
||||||
|
n_slots: int
|
||||||
|
slot_size: int
|
||||||
|
kernel_module: str
|
||||||
|
participating_pes: tuple[tuple[int, int], ...]
|
||||||
|
kernel_config: Mapping[str, Any]
|
||||||
|
|
||||||
|
|
||||||
|
def build_install_plans(
|
||||||
|
world_size: int,
|
||||||
|
algorithm: str,
|
||||||
|
algorithm_config: dict,
|
||||||
|
spec: dict,
|
||||||
|
) -> list[SipInstallPlan]:
|
||||||
|
"""Compose topology + mapper + algorithm into per-SIP plan list."""
|
||||||
|
topo_fn = _resolve_topology(algorithm_config["topology"])
|
||||||
|
mapper = resolve_mapper(algorithm_config.get("mapper", "leader_only"))
|
||||||
|
|
||||||
|
# kernel_config: launch 시 kernel_args에 전달할 algorithm-specific params
|
||||||
|
kernel_config = {
|
||||||
|
k: v for k, v in algorithm_config.items()
|
||||||
|
if k in {"n_elem", "reduce_op", "chunk_size"} or k.startswith("kernel_")
|
||||||
|
}
|
||||||
|
|
||||||
|
plans = []
|
||||||
|
for rank in range(world_size):
|
||||||
|
sip = rank # identity mapping (non-identity는 open question)
|
||||||
|
pes = mapper(spec, rank)
|
||||||
|
pe_installs = _build_pe_installs(
|
||||||
|
rank=rank, world_size=world_size, sip=sip,
|
||||||
|
pes=pes, topo_fn=topo_fn, algorithm_config=algorithm_config, spec=spec,
|
||||||
|
)
|
||||||
|
plans.append(SipInstallPlan(
|
||||||
|
algorithm_name=algorithm,
|
||||||
|
sip=sip, rank=rank, world_size=world_size,
|
||||||
|
pe_installs=pe_installs,
|
||||||
|
buffer_kind=algorithm_config["buffer_kind"],
|
||||||
|
n_slots=algorithm_config["n_slots"],
|
||||||
|
slot_size=algorithm_config["slot_size"],
|
||||||
|
kernel_module=algorithm_config["module"],
|
||||||
|
participating_pes=tuple(pes),
|
||||||
|
kernel_config=kernel_config,
|
||||||
|
))
|
||||||
|
return plans
|
||||||
|
```
|
||||||
|
|
||||||
|
`_build_pe_installs`는 기존 `ccl/install.py`의 neighbor 계산 로직을 재활용
|
||||||
|
(ADR-0025의 `reverse_direction` 개선 반영).
|
||||||
|
|
||||||
|
**Multi-PE 매퍼와 neighbor 생성 책임**: mapper가 SIP 내 여러 PE를 반환하는
|
||||||
|
경우 (`all_pes` 등), PE-level neighbor 그래프는 `_build_pe_installs` 내부에
|
||||||
|
형성된다. 즉 topology 모듈은 rank-level 관계만 제공하고, PE-level 연결은
|
||||||
|
builder에서 풀어낸다. 복잡한 multi-level 패턴을 쓰는 알고리즘은 이 책임
|
||||||
|
분산이 관리 부담이 될 수 있음 — 관련 논의는 ADR-0029 참고.
|
||||||
|
|
||||||
|
### D7. Epoch-based collective barrier
|
||||||
|
|
||||||
|
Cross-rank submit 동기화. 각 collective 호출은 독립 epoch. 같은 rank의
|
||||||
|
중복 join은 즉시 에러.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/runtime_api/distributed.py
|
||||||
|
@dataclass
|
||||||
|
class _EpochState:
|
||||||
|
participants: set[int] = field(default_factory=set)
|
||||||
|
pending: list = field(default_factory=list)
|
||||||
|
drained: bool = False
|
||||||
|
returned: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class _CollectiveBarrier:
|
||||||
|
"""Epoch-based barrier.
|
||||||
|
|
||||||
|
Contract:
|
||||||
|
- Each call joins the earliest non-drained epoch.
|
||||||
|
- Each rank may join a given epoch at most once. Duplicate join raises.
|
||||||
|
- Last arriver (participants == world_size) performs drain and advances
|
||||||
|
_next_epoch. Earlier arrivers yield and re-check drained on resume.
|
||||||
|
- Epoch state is GC'd when returned == world_size (success path).
|
||||||
|
- On failure paths, residual state is acceptable; reset() clears it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, world_size: int):
|
||||||
|
self._world_size = world_size
|
||||||
|
self._next_epoch = 0
|
||||||
|
self._state: dict[int, _EpochState] = {}
|
||||||
|
|
||||||
|
def submit_and_drain(self, ctx, rank: int, submit_fn) -> None:
|
||||||
|
epoch = self._next_epoch
|
||||||
|
state = self._state.setdefault(epoch, _EpochState())
|
||||||
|
|
||||||
|
if rank in state.participants:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"rank {rank} attempted duplicate join to epoch {epoch}"
|
||||||
|
)
|
||||||
|
state.participants.add(rank)
|
||||||
|
|
||||||
|
handles = submit_fn()
|
||||||
|
state.pending.extend(handles)
|
||||||
|
|
||||||
|
is_last = len(state.participants) >= self._world_size
|
||||||
|
|
||||||
|
if is_last:
|
||||||
|
for h in state.pending:
|
||||||
|
ctx.wait(h)
|
||||||
|
state.drained = True
|
||||||
|
self._next_epoch = epoch + 1
|
||||||
|
else:
|
||||||
|
from greenlet import getcurrent
|
||||||
|
g = getcurrent()
|
||||||
|
if g.parent is None:
|
||||||
|
raise RuntimeError("barrier requires a bound worker greenlet")
|
||||||
|
while not state.drained:
|
||||||
|
g.parent.switch()
|
||||||
|
|
||||||
|
state.returned += 1
|
||||||
|
if state.returned >= self._world_size:
|
||||||
|
self._state.pop(epoch, None)
|
||||||
|
|
||||||
|
def reset(self) -> None:
|
||||||
|
"""Explicit cleanup on spawn exception unwinding."""
|
||||||
|
self._state.clear()
|
||||||
|
self._next_epoch = 0
|
||||||
|
```
|
||||||
|
|
||||||
|
### D8. Per-rank tensor view + validator contract
|
||||||
|
|
||||||
|
**Validator** (host-side, pre-slice, global handle 기준):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/validators.py
|
||||||
|
Validator = Callable[[TensorHandle, int, dict], None]
|
||||||
|
|
||||||
|
def single_shard_per_rank(handle, world_size, spec):
|
||||||
|
"""Ring 계열: 정확히 world_size개 shard, SIP당 1개."""
|
||||||
|
if len(handle.shards) != world_size:
|
||||||
|
raise ValueError(...)
|
||||||
|
per_sip = {}
|
||||||
|
for s in handle.shards:
|
||||||
|
per_sip[s.sip] = per_sip.get(s.sip, 0) + 1
|
||||||
|
if any(c != 1 for c in per_sip.values()):
|
||||||
|
raise ValueError(...)
|
||||||
|
|
||||||
|
def multi_pe_sip_local(handle, world_size, spec):
|
||||||
|
"""Multi-PE per SIP layout: 각 SIP에 intra-SIP PE 수만큼 shard 존재.
|
||||||
|
Intra-SIP 전체 PE를 참여시키는 알고리즘이 사용."""
|
||||||
|
cm = spec["sip"]["cube_mesh"]
|
||||||
|
pl = spec["cube"]["pe_layout"]
|
||||||
|
per_sip = cm["w"] * cm["h"] * pl["pe_per_corner"] * len(pl["corners"])
|
||||||
|
if len(handle.shards) != world_size * per_sip:
|
||||||
|
raise ValueError(...)
|
||||||
|
|
||||||
|
VALIDATOR_REGISTRY = {...}
|
||||||
|
def resolve_validator(key_or_path): ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Validator는 world 전체의 shard layout 불변량을 본다. Per-rank view는
|
||||||
|
backend가 validator 호출 **후** `_tensor_slice_for_sip`로 생성.
|
||||||
|
|
||||||
|
**Per-rank tensor view** — SIP-local slice:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _tensor_slice_for_sip(handle, sip) -> TensorArg:
|
||||||
|
sip_shards = [s for s in handle.shards if s.sip == sip]
|
||||||
|
if not sip_shards:
|
||||||
|
raise RuntimeError(f"tensor has no shards on SIP {sip}")
|
||||||
|
# Deterministic ordering contract: (cube, pe, offset_bytes) ascending.
|
||||||
|
# Multi-PE mappers (hierarchical 등) rely on this ordering to align
|
||||||
|
# per-PE tensor arg construction with participating_pes enumeration.
|
||||||
|
sip_shards.sort(key=lambda s: (s.cube, s.pe, s.offset_bytes))
|
||||||
|
min_offset = min(s.offset_bytes for s in sip_shards)
|
||||||
|
local_va_base = (handle.va_base + min_offset) if handle.va_base else 0
|
||||||
|
return TensorArg(
|
||||||
|
shards=tuple(TensorArgShard(...) for s in sip_shards),
|
||||||
|
va_base=local_va_base,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ordering invariant**: slice의 shard는 `(cube, pe, offset_bytes)` 오름차순.
|
||||||
|
Backend가 `participating_pes`를 iterate하며 `_tensor_arg_for_pe(view, cube, pe)`를
|
||||||
|
구성할 때, 결정론적 ordering을 전제할 수 있다. 특히 `all_pes` mapper +
|
||||||
|
hierarchical 알고리즘이 per-PE slice 조합을 순서 의존적으로 해석하는 경우에
|
||||||
|
중요.
|
||||||
|
|
||||||
|
### D9. Greenlet-local rank registry (+ debug warning)
|
||||||
|
|
||||||
|
```python
|
||||||
|
class DistributedContext:
|
||||||
|
def __init__(self):
|
||||||
|
self._backend = None
|
||||||
|
self._rank_by_greenlet: dict = {}
|
||||||
|
|
||||||
|
def _bind_rank(self, g, rank: int) -> None:
|
||||||
|
self._rank_by_greenlet[g] = int(rank)
|
||||||
|
|
||||||
|
def get_rank(self) -> int:
|
||||||
|
self._ensure_initialized()
|
||||||
|
from greenlet import getcurrent
|
||||||
|
g = getcurrent()
|
||||||
|
if g not in self._rank_by_greenlet:
|
||||||
|
if os.environ.get("KERNBENCH_DEBUG"):
|
||||||
|
warnings.warn(
|
||||||
|
"get_rank() called outside a bound greenlet — returning 0. "
|
||||||
|
"Likely a bug unless running single-driver."
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
return int(self._rank_by_greenlet[g])
|
||||||
|
```
|
||||||
|
|
||||||
|
### D10. `torch.ahbm.set_device(rank)` — SIP 바인딩
|
||||||
|
|
||||||
|
KernBench 백엔드 이름은 `ahbm` (ADR-0023 D10). Real PyTorch는
|
||||||
|
`torch.cuda.set_device(r)`이지만 우리는 CUDA가 아니므로 honestly-named
|
||||||
|
namespace를 사용한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class _AhbmNamespace:
|
||||||
|
"""torch.ahbm — per-greenlet SIP device binding.
|
||||||
|
|
||||||
|
Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. Since
|
||||||
|
KernBench's backend is 'ahbm' (not CUDA), we expose the equivalent
|
||||||
|
API under ``torch.ahbm`` to avoid pretending to be a CUDA runtime.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self._device_by_greenlet: dict = {}
|
||||||
|
|
||||||
|
def set_device(self, device: int) -> None:
|
||||||
|
from greenlet import getcurrent
|
||||||
|
self._device_by_greenlet[getcurrent()] = int(device)
|
||||||
|
|
||||||
|
def current_device(self) -> int | None:
|
||||||
|
from greenlet import getcurrent
|
||||||
|
return self._device_by_greenlet.get(getcurrent())
|
||||||
|
|
||||||
|
# Attached to RuntimeContext as `self.ahbm = _AhbmNamespace()`.
|
||||||
|
# Bench code: `torch.ahbm.set_device(rank)` mirrors `torch.cuda.set_device`.
|
||||||
|
```
|
||||||
|
|
||||||
|
**PyTorch 2.x style 병행 지원**: 최신 PyTorch는 device-agnostic한
|
||||||
|
`torch.accelerator` 네임스페이스를 지향 (`torch.accelerator.set_device_index(r)`,
|
||||||
|
`torch.accelerator.current_device_index()`). Device vendor에 종속되지 않는
|
||||||
|
코드를 쓰려는 사용자를 위해 KernBench도 이 표면을 병행 지원한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class _AcceleratorNamespace:
|
||||||
|
"""torch.accelerator — device-agnostic API (PyTorch 2.x style).
|
||||||
|
|
||||||
|
Aliases torch.ahbm for bench code that prefers device-neutral idiom:
|
||||||
|
torch.accelerator.set_device_index(rank)
|
||||||
|
torch.accelerator.current_device_index()
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, ahbm: _AhbmNamespace):
|
||||||
|
self._ahbm = ahbm
|
||||||
|
|
||||||
|
def set_device_index(self, device: int) -> None:
|
||||||
|
self._ahbm.set_device(device)
|
||||||
|
|
||||||
|
def current_device_index(self) -> int | None:
|
||||||
|
return self._ahbm.current_device()
|
||||||
|
|
||||||
|
# RuntimeContext
|
||||||
|
self.ahbm = _AhbmNamespace()
|
||||||
|
self.accelerator = _AcceleratorNamespace(self.ahbm) # alias
|
||||||
|
```
|
||||||
|
|
||||||
|
Bench 작성자는 다음 중 하나를 선택 — 둘 다 내부적으로 같은 레지스트리를 공유:
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.ahbm.set_device(rank) # KernBench-native, explicit backend
|
||||||
|
torch.accelerator.set_device_index(rank) # PyTorch 2.x device-agnostic
|
||||||
|
```
|
||||||
|
|
||||||
|
### D11. Tensor placement = structural (sip, cube, pe) 좌표
|
||||||
|
|
||||||
|
`resolve_dp_policy`가 `target_sip`을 직접 받아 구조적 좌표로 placement 생성.
|
||||||
|
세부는 ADR-0026.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# RuntimeContext._create_tensor
|
||||||
|
current_sip = self.ahbm.current_device() # (D10 naming)
|
||||||
|
if current_sip is None:
|
||||||
|
current_sip = 0 # single-driver fallback (D9와 일관)
|
||||||
|
placement = resolve_dp_policy(
|
||||||
|
dp, shape=shape_2d, itemsize=itemsize,
|
||||||
|
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
||||||
|
target_sip=current_sip,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Post-hoc `pe_index` shifting 제거 — ShardSpec이 `(sip, cube, pe)` 구조적
|
||||||
|
좌표 보유.
|
||||||
|
|
||||||
|
### D12. `torch.multiprocessing.spawn`-compat surface
|
||||||
|
|
||||||
|
Bench 작성자 표면은 real PyTorch `mp.spawn`과 동일:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/runtime_api/multiprocessing.py (new)
|
||||||
|
def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method="spawn"):
|
||||||
|
"""Drop-in for torch.multiprocessing.spawn.
|
||||||
|
Internal: greenlet fan-out + epoch-barrier sync + exception propagation.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# torch namespace에 부착
|
||||||
|
torch.multiprocessing = SimpleNamespace(spawn=spawn)
|
||||||
|
```
|
||||||
|
|
||||||
|
Bench:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch.multiprocessing as mp
|
||||||
|
mp.spawn(worker, nprocs=world_size, args=(world_size, torch))
|
||||||
|
```
|
||||||
|
|
||||||
|
### D13. Scheduler + exception handling
|
||||||
|
|
||||||
|
```python
|
||||||
|
def spawn(fn, args, nprocs, ...):
|
||||||
|
dist = torch.distributed
|
||||||
|
gs: list[greenlet] = []
|
||||||
|
errors: dict[int, Exception] = {}
|
||||||
|
|
||||||
|
for rank in range(nprocs):
|
||||||
|
def _entry(r=rank):
|
||||||
|
try:
|
||||||
|
fn(r, *args)
|
||||||
|
except Exception as e:
|
||||||
|
errors[r] = e
|
||||||
|
raise
|
||||||
|
g = greenlet(_entry)
|
||||||
|
dist._bind_rank(g, rank)
|
||||||
|
gs.append(g)
|
||||||
|
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
alive = [g for g in gs if not g.dead]
|
||||||
|
if not alive:
|
||||||
|
break
|
||||||
|
for g in alive:
|
||||||
|
if not g.dead:
|
||||||
|
g.switch()
|
||||||
|
except Exception as outer:
|
||||||
|
for other in gs:
|
||||||
|
if not other.dead:
|
||||||
|
try:
|
||||||
|
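other.throw()  # defaults to GreenletExit (swallowed on death); SystemExit would propagate to the parent and bypass `except Exception`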
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Epoch barrier state 명시적 cleanup
|
||||||
|
backend = getattr(dist, "_backend", None)
|
||||||
|
if backend is not None and hasattr(backend, "_barrier"):
|
||||||
|
backend._barrier.reset()
|
||||||
|
raise SpawnException(errors) from outer
|
||||||
|
```
|
||||||
|
|
||||||
|
**Scheduler contract**:
|
||||||
|
- Deterministic round-robin over insertion order (rank 0, 1, ..., N-1).
|
||||||
|
- 동기화 지점은 epoch barrier (D7)만. Scheduler 순서에 의존하는 correctness 없음.
|
||||||
|
- 예외 발생 시 다른 greenlet 강제 종료 + `SpawnException` 전파.
|
||||||
|
|
||||||
|
**Starvation guideline**:
|
||||||
|
- 일반적으로 collective barrier가 workers를 동기화. 큰 편차 없음.
|
||||||
|
- 극단적 non-collective 루프 대비 cooperative yield 제공:
|
||||||
|
`torch.distributed.cooperative_yield()`.
|
||||||
|
|
||||||
|
### D14. Backward compatibility
|
||||||
|
|
||||||
|
1. **Single-driver 호출**: `get_rank()` 0 반환 (D9).
|
||||||
|
2. **`ccl.yaml` world_size override**: D1 fallback 우회 — legacy "rank = PE"
|
||||||
|
테스트 경로로 사용 가능.
|
||||||
|
3. **`DPPolicy.sip="column_wise"` 명시**: ADR-0026 scope.
|
||||||
|
4. **`install_ipcq()` compatibility wrapper**:
|
||||||
|
|
||||||
|
기존 `ccl/install.py`의 `install_ipcq()` API는 곧바로 제거하지 않는다.
|
||||||
|
Thin compatibility wrapper로 남겨 기존 직접 호출자가 점진적으로 migration할
|
||||||
|
수 있게 한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/install.py (after this ADR)
|
||||||
|
def install_ipcq(engine, spec, merged, *, algo_module=None, rank_to_pe=None):
|
||||||
|
"""DEPRECATED: legacy host-side PE installer.
|
||||||
|
|
||||||
|
Internally delegates to build_install_plans + engine-routed IpcqInitMsg.
|
||||||
|
Use dist.init_process_group() instead.
|
||||||
|
"""
|
||||||
|
from kernbench.ccl.install_plan import build_install_plans
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
"install_ipcq() is deprecated; use dist.init_process_group()",
|
||||||
|
DeprecationWarning, stacklevel=2,
|
||||||
|
)
|
||||||
|
plans = build_install_plans(
|
||||||
|
world_size=merged.get("world_size", 1),
|
||||||
|
algorithm=merged["algorithm"],
|
||||||
|
algorithm_config=merged,
|
||||||
|
spec=spec,
|
||||||
|
)
|
||||||
|
handles = []
|
||||||
|
for plan in plans:
|
||||||
|
for pe_install in plan.pe_installs:
|
||||||
|
h = engine.submit(IpcqInitMsg(
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(pe_install.cube,),
|
||||||
|
target_pe=pe_install.pe,
|
||||||
|
entries=pe_install.neighbors,
|
||||||
|
buffer_kind=plan.buffer_kind,
|
||||||
|
n_slots=plan.n_slots,
|
||||||
|
slot_size=plan.slot_size,
|
||||||
|
))
|
||||||
|
handles.append(h)
|
||||||
|
for h in handles:
|
||||||
|
engine.wait(h)
|
||||||
|
return {"world_size": merged.get("world_size", 1), "plans": plans}
|
||||||
|
```
|
||||||
|
|
||||||
|
Migration 스케줄:
|
||||||
|
- Phase 1: wrapper로 유지 + DeprecationWarning
|
||||||
|
- Phase 2: 직접 호출자 grep-audit → 각각 `dist.init_process_group()` 또는
|
||||||
|
`build_install_plans()` 직접 사용으로 이관
|
||||||
|
- Phase 3: wrapper 제거 (별도 cleanup ADR 또는 PR)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0023** (IPCQ): `IpcqInitMsg` 메시지 타입과 PE_IPCQ 핸들링을 그대로
|
||||||
|
활용. Engine-routed submit으로 전환하는 것이 유일한 변경.
|
||||||
|
- **ADR-0025** (IPCQ direction fix): `_build_pe_installs`의 neighbor 계산이
|
||||||
|
2-rank ring 등에서 정확히 동작하려면 필요.
|
||||||
|
- **ADR-0003 / 0016** (IO_CPU): IO_CPU는 기존 transit 역할 그대로. 본 ADR에서
|
||||||
|
IO_CPU 역할 변경 없음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **IPCQ protocol 수정**: ADR-0023 유지.
|
||||||
|
- **DPPolicy 필드 정리**: ADR-0026.
|
||||||
|
- **Megatron-style TP**: ADR-0027.
|
||||||
|
- **Multi-node (프로세스 간)**: 단일 프로세스.
|
||||||
|
- **IO_CPU SIP control-plane 단일 endpoint 원칙 채택**: 본 ADR 범위 밖. 현재
|
||||||
|
KernBench에 이 원칙이 없고, 도입은 별도 ADR.
|
||||||
|
- **Hierarchical all-reduce 알고리즘 설계**: ADR-0029. 본 ADR은 그 알고리즘이
|
||||||
|
쓸 framework 인프라 (`all_pes` mapper, `multi_pe_sip_local` validator,
|
||||||
|
registry 확장점)만 제공.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🔴 Critical — 구현 blocker 가능성 (integration 전 반드시 검증)
|
||||||
|
|
||||||
|
- **`IpcqInitMsg`의 engine routing — primary implementation risk**: 현재
|
||||||
|
sideband만 쓰여서 engine routing path가 실사용 검증되지 않은 상태. **본
|
||||||
|
ADR 전체가 "engine routing이 동작한다"는 가정 위에 서 있다**. 이것이
|
||||||
|
실제로 안 되면 D2, D14, T3 등이 전부 영향 받음. 반드시 **ADR 구현 착수
|
||||||
|
전 스파이크 검증**:
|
||||||
|
- `engine.submit(IpcqInitMsg(target_sips=..., target_cubes=..., target_pe=...))`
|
||||||
|
가 PE_IPCQ로 정확히 배달되는지 (기존 `MmuMapMsg` / `MemoryWriteMsg` 라우팅
|
||||||
|
패턴과 비교)
|
||||||
|
- 미지원 시 minor hook: engine의 message-type → component-kind 매핑 테이블에
|
||||||
|
`IpcqInitMsg → "pe_ipcq"` 등록 (localized change, topology builder /
|
||||||
|
message schema 영향 없음)
|
||||||
|
- 결과에 따라 D2 채택 여부가 달라질 수 있음 — 만약 routing 불가 시 sideband
|
||||||
|
path 유지로 fallback 후 본 ADR 범위 재조정
|
||||||
|
|
||||||
|
- **Engine-routed install vs sideband equivalence** (D2 검증점 1-5): T3의
|
||||||
|
equivalence test가 실제 동작하는지 스파이크. 특히 ordering independence와
|
||||||
|
idempotency는 기존 테스트에 없는 속성이라 신규 검증 필요.
|
||||||
|
|
||||||
|
- **`install_ipcq()` 직접 호출자 audit** (구현 전 필수): deprecated wrapper
|
||||||
|
전략은 적절하지만 실제 migration 리스크는 호출자 목록에 따라 다름. 착수 전
|
||||||
|
grep audit:
|
||||||
|
- Pattern: `install_ipcq(` (cwd 전체)
|
||||||
|
- Scope: `src/`, `tests/`, `benches/`, `scripts/`, `src/kernbench/cli/`
|
||||||
|
- 각 호출자의 예상 migration path (→ `dist.init_process_group` vs
|
||||||
|
`build_install_plans` 직접)를 정리한 후 wrapper 도입
|
||||||
|
|
||||||
|
### 🟡 Nice-to-have — scope 경계 관련
|
||||||
|
|
||||||
|
- **Install timing 허용치**: SimPy 시간 상 install이 몇 ns~us 소모. 기존
|
||||||
|
sideband는 0ns. 기존 테스트가 t=0 시작을 전제로 하는지 확인 (audit 결과에
|
||||||
|
따라 테스트 교정 필요).
|
||||||
|
|
||||||
|
- **`IpcqInitMsg` 배치 가능성**: MmuMapMsg처럼 `target_pe="all"` 브로드캐스트
|
||||||
|
는 IPCQ에서는 부적합 (PE마다 neighbor가 다름). 현재는 per-PE 개별 submit.
|
||||||
|
Per-PE payload를 담는 batched IpcqInitMsg 타입은 future optimization.
|
||||||
|
|
||||||
|
- **`_rank_to_sip` 매핑**: 현재 identity. Non-trivial mapping 요구 시 별도.
|
||||||
|
|
||||||
|
- **Cooperative yield API 위치**: `torch.distributed.cooperative_yield()`로
|
||||||
|
노출 예정. 실제 필요성은 Phase 2 이후 벤치 추가 시 판단.
|
||||||
|
|
||||||
|
(PE-level topology 일원화 관련 중장기 방향은 **ADR-0029** 참고 — 복잡한
|
||||||
|
multi-level 알고리즘이 driving force가 되는 framework 진화 방향.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. Launcher infrastructure
|
||||||
|
|
||||||
|
`tests/test_ccl_ddp_launcher.py`:
|
||||||
|
- `test_world_size_equals_sip_count` — D1
|
||||||
|
- `test_ahbm_set_device_binds_tensor_to_single_sip` — D10/D11
|
||||||
|
- `test_get_rank_is_greenlet_local` — D9
|
||||||
|
- `test_run_spawns_one_worker_per_rank` — D12/D13
|
||||||
|
- `test_get_rank_debug_warning` — D9 warning path
|
||||||
|
|
||||||
|
### T2. Install plan builder
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (new):
|
||||||
|
- `build_install_plans` — ring_1d × leader_only 조합 (단일 PE per rank)
|
||||||
|
- `build_install_plans` — ring_1d × all_pes 조합 (multi-PE per rank; mapper
|
||||||
|
framework 동작 확인, 알고리즘-무관)
|
||||||
|
- Mapper / validator registry resolution (built-in key vs import path vs
|
||||||
|
unknown)
|
||||||
|
- Import path fallback (`"pkg.mod.fn"` 형식) 동작 검증
|
||||||
|
|
||||||
|
### T3. Engine-routed IpcqInitMsg (equivalence — 핵심 검증)
|
||||||
|
|
||||||
|
`tests/test_ipcq_init_routing.py` (new):
|
||||||
|
- **Routing**: `engine.submit(IpcqInitMsg)` → 지정 PE_IPCQ가 실제 설치 수행
|
||||||
|
- **Equivalence**: 동일한 IpcqInitMsg를 (a) sideband `_install_neighbors`
|
||||||
|
직접 호출, (b) engine.submit 두 경로로 보낸 뒤 PE_IPCQ 최종 state
|
||||||
|
(`_queue_pairs`, `_installed` 등) 동일성 비교
|
||||||
|
- **Ordering independence**: 서로 다른 PE의 install msg를 engine 큐에 임의
|
||||||
|
순서로 넣어도 최종 state가 동일
|
||||||
|
- **Idempotency (duplicate install)**: 동일 PE에 두 번 install msg → 두
|
||||||
|
번째는 에러 raise (policy: explicit error; D2 검증점 4 참고)
|
||||||
|
- **Multi-PE 병렬 install**: per-PE submit이 interference 없이 완료
|
||||||
|
- **Install 후 send 성공**: 설치 직후 `IpcqSendCmd` 실행해서 neighbor table
|
||||||
|
state가 실제로 유효한지 확인
|
||||||
|
|
||||||
|
### T4. Barrier correctness
|
||||||
|
|
||||||
|
`tests/test_collective_barrier.py` (new):
|
||||||
|
- Single collective 정상
|
||||||
|
- 다중 collective 연속 호출 (epoch 격리)
|
||||||
|
- 동일 rank의 duplicate join → RuntimeError
|
||||||
|
- Rank 1이 all_reduce 전 종료 → SpawnException + barrier.reset()
|
||||||
|
- Conditional branch 시 모든 rank 도달하면 정상
|
||||||
|
|
||||||
|
### T5. E2E
|
||||||
|
|
||||||
|
`tests/test_ccl_allreduce_matrix.py`:
|
||||||
|
- `ring_tcm` / `ring_hbm` / `ring_sram` @ ws=SIP_count
|
||||||
|
|
||||||
|
### T6. 회귀
|
||||||
|
|
||||||
|
기존 `test_ccl_framework`, `test_ccl_install`, `test_ccl_topologies`,
|
||||||
|
`test_ccl_mock_runtime`, `test_pe_ipcq`, `test_ipcq_e2e`, 기타 non-CCL
|
||||||
|
모두 통과.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **새 message 타입 0개**: 기존 `IpcqInitMsg` + `KernelLaunchMsg`만으로 구현.
|
||||||
|
- **IO_CPU / engine 변경 최소**: 기존 routing 그대로 (단, `IpcqInitMsg` 라우팅에 minor hook이 필요할 수 있음 — Negative 항목 참고).
|
||||||
|
- **Sideband install convention 제거**: MmuMapMsg 등과 동일 패턴으로 일원화.
|
||||||
|
- **Plan state stale 문제 소멸**: Plan은 host 단일 소유.
|
||||||
|
- **Bench = real PyTorch DDP** (공개 API 관점).
|
||||||
|
- **Algorithm ABI 경량**: `kernel` + `kernel_args`만 필수.
|
||||||
|
- **Epoch-based barrier**: interleaved collective 안전.
|
||||||
|
- **Control/data plane 분리**: data plane(PE_IPCQ)은 ADR-0023 유지, control
|
||||||
|
plane은 host-driven.
|
||||||
|
- 장기 확장성: Megatron TP, DTensor 기반.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- 신규 모듈: `install_plan.py`, `mappers.py`, `validators.py`,
|
||||||
|
`multiprocessing.py`.
|
||||||
|
- Engine이 `IpcqInitMsg`를 엔진-path로 라우팅할 수 있는지 구현 시 확인 필요
|
||||||
|
(minor hook 가능성).
|
||||||
|
- Install이 SimPy 시간을 소모 (positive로도 볼 수 있으나, 기존 sideband 시점
|
||||||
|
0ns 전제인 테스트가 있으면 교정 필요).
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- IPCQ PE-level protocol (ADR-0023) 불변.
|
||||||
|
- `DPPolicy` 필드 변경은 ADR-0026.
|
||||||
|
- IO_CPU 역할 불변 (기존 transit 그대로).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/runtime_api/distributed.py` | D1/D2/D7/D9: world_size fallback, rank_to_sip, plan 소유, engine-routed install/launch, epoch barrier |
|
||||||
|
| `src/kernbench/runtime_api/context.py` | D10/D11: `_AhbmNamespace`, `ctx.ahbm`, `_create_tensor`가 `target_sip` 전달 |
|
||||||
|
| `src/kernbench/runtime_api/multiprocessing.py` (new) | D12/D13: `spawn` + scheduler + exception |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` (new) | D6: `build_install_plans`, `SipInstallPlan`, `PeInstallSpec`, `NeighborTableEntry` |
|
||||||
|
| `src/kernbench/ccl/mappers.py` (new) | D5: `leader_only`, `all_pes`, registry + resolver |
|
||||||
|
| `src/kernbench/ccl/validators.py` (new) | D5: validator registry + resolver |
|
||||||
|
| `src/kernbench/ccl/install.py` | Thin deprecated compat wrapper (D14) |
|
||||||
|
| `src/kernbench/ccl/algorithms/ring_allreduce.py` | D4: `kernel` + `kernel_args` 유지 (큰 변화 없음) |
|
||||||
|
| `src/kernbench/ccl/algorithms/mesh_allreduce.py` | D4 동일 |
|
||||||
|
| `src/kernbench/ccl/algorithms/tree_allreduce.py` | D4 동일 |
|
||||||
|
| `ccl.yaml` | 각 알고리즘에 `mapper` / `validator` 선언 추가 |
|
||||||
|
| `src/kernbench/sim_engine/engine.py` | (If needed) `IpcqInitMsg` → PE_IPCQ 라우팅 확인 hook |
|
||||||
|
| `benches/ccl_allreduce.py` | 새 launcher 기반 rewrite |
|
||||||
|
| `tests/test_ccl_ddp_launcher.py` (new) | T1 |
|
||||||
|
| `tests/test_ccl_install_plan.py` (new) | T2 |
|
||||||
|
| `tests/test_ipcq_init_routing.py` (new) | T3 |
|
||||||
|
| `tests/test_collective_barrier.py` (new) | T4 |
|
||||||
|
| `tests/test_ccl_allreduce_matrix.py` | T5: ws=SIP_count 단순화 |
|
||||||
@@ -0,0 +1,365 @@
|
|||||||
|
# ADR-0025: IPCQ Direction Addressing — address-based matching
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed (Revision 2 — Address-based matching; peer_direction field dropped)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
ADR-0023의 IPCQ protocol에서 **"어느 direction pair를 통한 전송인가"의 식별**을
|
||||||
|
topology / dict-order에 의존하지 않고 **주소 기반**으로 일관되게 한다.
|
||||||
|
2-rank bidirectional ring (또는 여러 direction이 동일 peer를 가리키는
|
||||||
|
topology 일반)에서 정확히 동작하도록 한다.
|
||||||
|
|
||||||
|
### 현재 상태 (ADR-0023 D9 구현)
|
||||||
|
|
||||||
|
`src/kernbench/components/builtin/pe_ipcq.py` — `_handle_meta_arrival`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
|
||||||
|
token = msg.token
|
||||||
|
sender_key = (token.src_sip, token.src_cube, token.src_pe)
|
||||||
|
for d, qp in self._queue_pairs.items():
|
||||||
|
p = qp["peer"]
|
||||||
|
if (p.sip, p.cube, p.pe) == sender_key:
|
||||||
|
qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
|
||||||
|
# ... wake recv waiters ...
|
||||||
|
return
|
||||||
|
```
|
||||||
|
|
||||||
|
`_credit_worker`도 동일한 "sender-coord-first-match" 패턴.
|
||||||
|
|
||||||
|
`src/kernbench/ccl/install.py` — `reverse_direction`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def reverse_direction(my_rank: int, peer_rank: int) -> str | None:
|
||||||
|
for d, target in neighbor_table[peer_rank].items():
|
||||||
|
if target == my_rank:
|
||||||
|
return d
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
### 드러난 버그 — 2-rank bidirectional ring
|
||||||
|
|
||||||
|
`ring_1d(rank, world_size=2)` → `{"E": 1, "W": 1}` (rank 0). 양쪽 방향이 같은 peer.
|
||||||
|
|
||||||
|
**버그 1 (install)**:
|
||||||
|
- `reverse_direction(0, 1)` → dict order로 "E" 반환 (틀림, "W"가 맞음 — opposite
|
||||||
|
direction convention)
|
||||||
|
- rank 0의 E entry가 `peer.rx_base_pa = rx_base(sip1, cube0, pe0, d="E")`로 설정
|
||||||
|
- tl.send(E) → data가 sip1의 E-rx buffer로 landing (should be W-rx)
|
||||||
|
|
||||||
|
**버그 2 (runtime)**:
|
||||||
|
- 설령 install이 올바른 주소로 설정해도, receiver의 `_handle_meta_arrival`이
|
||||||
|
sender 좌표만으로 direction 매칭 → 첫 direction (E) 승
|
||||||
|
- peer_head_cache[E] 증가, peer_head_cache[W]는 불변
|
||||||
|
- Kernel의 tl.recv(W)는 peer_head_cache[W] 대기 → 영원히 블록 → IpcqDeadlock
|
||||||
|
|
||||||
|
### 근본 원인
|
||||||
|
|
||||||
|
두 축에서 동일 문제:
|
||||||
|
1. **Install-time pairing**: "내 direction과 peer의 어느 direction이 짝인가"
|
||||||
|
결정이 dict-iteration-order에 의존 → 여러 direction이 같은 peer를 가리킬 때
|
||||||
|
fragile
|
||||||
|
2. **Runtime identification**: "어느 qp를 업데이트해야 하는가" 결정이 sender
|
||||||
|
좌표만으로 이루어짐 → direction 중복 시 ambiguous
|
||||||
|
|
||||||
|
### 해결 방향 — address-based matching
|
||||||
|
|
||||||
|
각 PE의 rx buffer는 **direction별로 고유한 주소 range**에 위치 (rx_base_pa +
|
||||||
|
direction_idx × bytes_per_direction). 따라서:
|
||||||
|
|
||||||
|
- **Runtime**: sender coord 대신 **dst_addr 범위**로 매칭 → unambiguous
|
||||||
|
- **Install**: opposite-direction 우선 선택 heuristic (ring / mesh의 자연스러운
|
||||||
|
대칭성)
|
||||||
|
- `peer_direction` 같은 이중 메타데이터 불필요 — **주소가 single source of
|
||||||
|
truth**
|
||||||
|
|
||||||
|
이 설계는 **PhysAddr 전환 (ADR-0030)과 독립적**으로 작동. 현재 synthetic
|
||||||
|
주소든 PhysAddr든 direction별 range 유일성만 지켜지면 동일하게 적용 가능.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Install — `reverse_direction` opposite-preference
|
||||||
|
|
||||||
|
`src/kernbench/ccl/install.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_OPPOSITE_DIR = {"E": "W", "W": "E", "N": "S", "S": "N"}
|
||||||
|
|
||||||
|
def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
|
||||||
|
"""Find peer's direction that reciprocates my_dir→peer_rank.
|
||||||
|
|
||||||
|
Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
|
||||||
|
pointing back to us. This matters in 2-rank bidirectional rings
|
||||||
|
where both E and W on one side point to the same peer — without
|
||||||
|
the preference, the first-match-wins iteration would route data
|
||||||
|
into the wrong rx slot. Falls back to any direction pointing back
|
||||||
|
for topologies without an opposite convention (tree_binary's
|
||||||
|
parent/child).
|
||||||
|
"""
|
||||||
|
nt = neighbor_table[peer_rank]
|
||||||
|
opp = _OPPOSITE_DIR.get(my_dir)
|
||||||
|
if opp is not None and nt.get(opp) == my_rank:
|
||||||
|
return opp
|
||||||
|
for d, target in nt.items():
|
||||||
|
if target == my_rank:
|
||||||
|
return d
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
호출부:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for d, peer_rank in nbrs.items():
|
||||||
|
peer_dir = reverse_direction(r, peer_rank, d) # my_dir 전달
|
||||||
|
if peer_dir is None:
|
||||||
|
continue
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### D2. Runtime — `_handle_meta_arrival` dst_addr 매칭
|
||||||
|
|
||||||
|
`src/kernbench/components/builtin/pe_ipcq.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
|
||||||
|
"""Match incoming token to the receiver-side direction by dst_addr range.
|
||||||
|
|
||||||
|
Each direction has a unique rx buffer address range
|
||||||
|
[my_rx_base_pa, my_rx_base_pa + n_slots * slot_size). The token's dst_addr (set by
|
||||||
|
the sender's IPCQ when computing peer's slot address) falls within
|
||||||
|
exactly one such range. This address-based matching is unambiguous
|
||||||
|
even when multiple directions have the same peer (2-rank ring).
|
||||||
|
"""
|
||||||
|
token = msg.token
|
||||||
|
dst_addr = token.dst_addr
|
||||||
|
for d, qp in self._queue_pairs.items():
|
||||||
|
base = qp["my_rx_base_pa"]
|
||||||
|
size = qp["n_slots"] * qp["slot_size"]
|
||||||
|
if base <= dst_addr < base + size:
|
||||||
|
qp["peer_head_cache"] = max(qp["peer_head_cache"],
|
||||||
|
token.sender_seq + 1)
|
||||||
|
self._arrived_tokens.setdefault(d, []).append(token)
|
||||||
|
waiters = self._recv_waiters.get(d, [])
|
||||||
|
self._recv_waiters[d] = []
|
||||||
|
for ev in waiters:
|
||||||
|
if not ev.triggered:
|
||||||
|
ev.succeed()
|
||||||
|
any_waiters = self._any_recv_waiters
|
||||||
|
self._any_recv_waiters = []
|
||||||
|
for ev in any_waiters:
|
||||||
|
if not ev.triggered:
|
||||||
|
ev.succeed()
|
||||||
|
return
|
||||||
|
# Unknown dst_addr — diagnostic log (should not happen under correct install)
|
||||||
|
```
|
||||||
|
|
||||||
|
Sender 좌표 검사는 **제거**. `dst_addr`가 이미 direction을 결정.
|
||||||
|
|
||||||
|
### D3. Credit — `dst_rx_base_pa` 필드 추가
|
||||||
|
|
||||||
|
`src/kernbench/common/ipcq_types.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class IpcqCreditMetadata:
|
||||||
|
consumer_seq: int
|
||||||
|
dst_rx_base_pa: int # NEW: 원 sender의 peer.rx_base_pa와 매칭용
|
||||||
|
# 기존 필드 (diagnostic / log 용도로 유지)
|
||||||
|
src_sip: int
|
||||||
|
src_cube: int
|
||||||
|
src_pe: int
|
||||||
|
src_direction: str
|
||||||
|
```
|
||||||
|
|
||||||
|
Credit 생성 시 (`_delayed_credit_send`): 자기 direction의 `my_rx_base_pa`를
|
||||||
|
`dst_rx_base_pa`로 실어 보냄 (이게 상대방이 send 당시 썼던 `peer.rx_base_pa`).
|
||||||
|
|
||||||
|
수신 측 (`_credit_worker`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _credit_worker(self, env):
|
||||||
|
while True:
|
||||||
|
credit = yield self._credit_inbox.get()
|
||||||
|
for d, qp in self._queue_pairs.items():
|
||||||
|
# peer의 rx_base_pa와 credit의 dst_rx_base_pa가 일치하는 qp 찾기
|
||||||
|
if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
|
||||||
|
qp["peer_tail_cache"] = max(qp["peer_tail_cache"],
|
||||||
|
credit.consumer_seq)
|
||||||
|
waiters = self._send_waiters.get(d, [])
|
||||||
|
self._send_waiters[d] = []
|
||||||
|
for ev in waiters:
|
||||||
|
if not ev.triggered:
|
||||||
|
ev.succeed()
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
Sender 좌표 검사 제거. `dst_rx_base_pa` 매칭으로 unambiguous.
|
||||||
|
|
||||||
|
### D4. `IpcqInitEntry`에 `peer_direction` 필드를 **추가하지 않음**
|
||||||
|
|
||||||
|
ADR-0025 rev 1에서 제안했던 `IpcqInitEntry.peer_direction`은 **불필요**.
|
||||||
|
이유:
|
||||||
|
- Meta arrival은 dst_addr로 매칭 (D2)
|
||||||
|
- Credit은 dst_rx_base_pa로 매칭 (D3)
|
||||||
|
- qp에 peer_direction 저장 필요 없음
|
||||||
|
- Install은 rx_base_pa 계산 시 내부적으로만 peer_dir 사용 (`reverse_direction`)
|
||||||
|
|
||||||
|
IpcqInitEntry schema 변경 없음. Rev 1 대비 **단순화**.
|
||||||
|
|
||||||
|
### D5. `IpcqDmaToken.src_direction` 유지 (diagnostic only)
|
||||||
|
|
||||||
|
기존 `src_direction` 필드는 제거하지 않는다. 다음 용도로 유지:
|
||||||
|
- Logging / trace: `KERNBENCH_CCL_TRACE=1` 출력의 `(rank, t, dir, nbytes)`
|
||||||
|
- Diagnostics: pointer_dump 등에서 direction 표시
|
||||||
|
- 미래 확장 여지
|
||||||
|
|
||||||
|
Runtime matching은 `dst_addr`만 사용.
|
||||||
|
|
||||||
|
### D6. Invariants (ADR-0023 I3 강화)
|
||||||
|
|
||||||
|
**I3 (엄격)**: 각 방향 pair `(my_direction, peer_direction)`에 대해 my
|
||||||
|
rx_base와 peer rx_base는 **별개의 direction slot**을 가리켜야 함. Install은
|
||||||
|
이를 보장해야 한다 (reverse_direction opposite-preference).
|
||||||
|
|
||||||
|
**I3.1 (신규)**: 모든 qp에 대해 `qp["my_rx_base_pa"]`와 `qp["peer"].rx_base_pa`는
|
||||||
|
서로 disjoint한 주소 range를 점유한다 (다른 direction의 buffer는 절대 겹치지
|
||||||
|
않음). 이것이 D2/D3의 주소-기반 매칭의 전제.
|
||||||
|
|
||||||
|
Install time에 검증 가능:
|
||||||
|
```python
|
||||||
|
# ccl/install_plan.py: build_install_plans 끝에 assertion
|
||||||
|
all_rx_ranges = set()
|
||||||
|
for plan in plans:
|
||||||
|
for pe_install in plan.pe_installs:
|
||||||
|
for entry in pe_install.neighbors:
|
||||||
|
r = (entry.my_rx_base_pa,
|
||||||
|
entry.my_rx_base_pa + plan.n_slots * plan.slot_size)
|
||||||
|
overlap = any(_ranges_overlap(r, e) for e in all_rx_ranges)
|
||||||
|
assert not overlap
|
||||||
|
all_rx_ranges.add(r)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023의 runtime 매칭 로직 수정
|
||||||
|
(D2, D3) + install heuristic 개선 (D1). IPCQ 프로토콜의 semantic layer
|
||||||
|
변경은 없음.
|
||||||
|
- **ADR-0024** (launcher): 2-rank bidirectional ring이 실제 쓰이는 경우가
|
||||||
|
ADR-0024의 ws=SIP_count 모델. 본 ADR이 그 케이스를 작동시킴.
|
||||||
|
- **ADR-0030** (PhysAddr transition, stub): **독립적** — ADR-0025의
|
||||||
|
주소-기반 매칭은 현재 synthetic 주소든 PhysAddr이든 동일하게 작동.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **IPCQ 주소 체계를 PhysAddr로 전환**: ADR-0030 scope. 본 ADR은 주소가 어떻게
|
||||||
|
인코딩되는가와 무관.
|
||||||
|
- **Multi-hop routing**: ADR-0023 D5의 single-hop DMA write 전제 유지.
|
||||||
|
- **Unidir ring 특수화**: `ring_1d_unidir`는 direction 하나만 있으므로 본 버그
|
||||||
|
무관.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **주소 매칭 성능**: `_handle_meta_arrival`과 `_credit_worker`가 qp를 선형
|
||||||
|
순회 (max 4 direction). 성능 영향 무시 가능 수준. 문제 시 dict lookup으로
|
||||||
|
전환 가능 (`_qp_by_rx_base`).
|
||||||
|
- **`IpcqDmaToken.src_direction` 필요성 재평가**: diagnostic 용도로만 남긴
|
||||||
|
필드를 계속 유지할지, 또는 logging 외부로 분리할지. 현재는 유지.
|
||||||
|
- **Install-time invariant 검증 cost**: D6의 I3.1 검증은 O((N_PE × N_direction)²).
|
||||||
|
대형 topology에서 느려질 수 있음 → interval tree 등 자료구조로 개선 가능.
|
||||||
|
단순 구현 먼저.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. Unit — `reverse_direction` opposite-preference
|
||||||
|
|
||||||
|
`tests/test_ccl_install.py` (확장):
|
||||||
|
- Ring ws=2: `reverse_direction(0, 1, "E")` → "W", `reverse_direction(0, 1, "W")` → "E"
|
||||||
|
- Ring ws=4: `reverse_direction(0, 1, "E")` → "W" (자연스러운 opposite)
|
||||||
|
- Mesh 2×2: `reverse_direction(r, peer, "N")` → "S", "E" ↔ "W"
|
||||||
|
- Tree binary: opposite 없는 direction (parent) → fallback 경로
|
||||||
|
- Non-symmetric topology: opposite가 peer에 없고 다른 direction만 있는 경우
|
||||||
|
|
||||||
|
### T2. Runtime — `_handle_meta_arrival` dst_addr 매칭
|
||||||
|
|
||||||
|
`tests/test_pe_ipcq.py` (확장):
|
||||||
|
- 2-rank pair install 후, E direction dst_addr로 meta arrival → E의 `peer_head_cache`
|
||||||
|
증가 (W는 불변)
|
||||||
|
- W direction dst_addr로 meta arrival → W의 `peer_head_cache` 증가
|
||||||
|
- 잘못된 dst_addr (어느 rx range에도 속하지 않음) → 에러 또는 silent drop
|
||||||
|
(결정 후 명시)
|
||||||
|
|
||||||
|
### T3. Credit — `dst_rx_base_pa` 매칭
|
||||||
|
|
||||||
|
`tests/test_pe_ipcq.py` (확장):
|
||||||
|
- E direction send 후 peer가 consume → credit에 자기 W의 `my_rx_base_pa`
|
||||||
|
담아 송신 → sender의 E direction `peer_tail_cache` 증가
|
||||||
|
- W direction도 동일
|
||||||
|
|
||||||
|
### T4. E2E — 2-rank bidirectional ring
|
||||||
|
|
||||||
|
`tests/test_ipcq_e2e.py`:
|
||||||
|
- 2-rank ring_1d로 tl.send(E) + tl.recv(W) pattern이 양방향으로 작동
|
||||||
|
- ADR-0024의 `test_ccl_allreduce_matrix.py`에서 ring at ws=2가 통과
|
||||||
|
|
||||||
|
### T5. Install invariant — rx_base range disjointness
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (확장):
|
||||||
|
- I3.1 검증: `build_install_plans` 결과에서 모든 qp의 rx_base range가 disjoint
|
||||||
|
|
||||||
|
### T6. 회귀
|
||||||
|
|
||||||
|
- 기존 ws≥3 ring / mesh / tree 테스트 그대로 통과
|
||||||
|
- `test_pe_ipcq`, `test_ipcq_e2e` 기존 케이스 회귀
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **단순함**: `peer_direction` 이중 메타데이터 제거. 주소가 single source of truth.
|
||||||
|
- **Unambiguous matching**: 모든 topology (direction 중복 포함)에서 동작.
|
||||||
|
- **Schema 변경 최소**: `IpcqInitEntry` 불변, `IpcqCreditMetadata`에 1 필드 추가.
|
||||||
|
- **PhysAddr 전환 (ADR-0030) 독립**: 주소-기반 매칭은 주소 인코딩 방식과 무관.
|
||||||
|
- **Diagnostic 유지**: `IpcqDmaToken.src_direction`은 로깅 용도로 존치.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- Runtime 매칭이 주소 비교로 바뀌어서 디버깅 시 "왜 peer_head_cache[E]가 아닌
|
||||||
|
W가 업데이트됐나" 같은 질문에 address range를 추적해야 함 (기존엔 direction
|
||||||
|
이름으로 충분). 해결: pointer_dump에 "direction ↔ rx_base_pa" 매핑 포함.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- IPCQ protocol의 semantic layer (sender가 dst_addr 계산, receiver가 수신)는
|
||||||
|
불변.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/ccl/install.py` | D1: `reverse_direction`에 `my_dir` 인자 추가, opposite-preference |
|
||||||
|
| `src/kernbench/components/builtin/pe_ipcq.py` | D2: `_handle_meta_arrival` dst_addr 매칭 / D3: `_credit_worker` dst_rx_base_pa 매칭 / `_delayed_credit_send`가 `dst_rx_base_pa` 필드 채움 |
|
||||||
|
| `src/kernbench/common/ipcq_types.py` | D3: `IpcqCreditMetadata`에 `dst_rx_base_pa` 필드 추가 |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` (ADR-0024 신규) | D6: I3.1 invariant 검증 (optional) |
|
||||||
|
| `docs/adr/ADR-0023-ipcq-pe-collective.md` | Reference note: runtime 매칭 방식이 ADR-0025에서 바뀜 |
|
||||||
|
| `tests/test_ccl_install.py` | T1 |
|
||||||
|
| `tests/test_pe_ipcq.py` | T2, T3 |
|
||||||
|
| `tests/test_ipcq_e2e.py` | T4 |
|
||||||
|
| `tests/test_ccl_install_plan.py` | T5 |
|
||||||
@@ -0,0 +1,476 @@
|
|||||||
|
# ADR-0026: DPPolicy = Intra-Device Only — sip/num_sips 필드 제거
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted (Revision 5 — Phase 2 landed 2026-04-14, 523 passed + 1 strict xfail)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
`DPPolicy`를 **한 device(SIP) 내부의 cube × PE 분산**만 표현하는 순수한
|
||||||
|
intra-device 추상화로 명확화한다. SIP 간 분산(TP)은 별도 레이어로 분리
|
||||||
|
(ADR-0024의 `torch.ahbm.set_device(rank)` 또는 ADR-0027의 Megatron parallel
|
||||||
|
layers가 담당).
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
`src/kernbench/policy/placement/dp.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class DPPolicy:
|
||||||
|
sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
num_pes: int | None = None
|
||||||
|
num_cubes: int | None = None
|
||||||
|
num_sips: int | None = None # ← 제거 대상
|
||||||
|
```
|
||||||
|
|
||||||
|
`sip` / `num_sips` 필드는 텐서를 SIP 경계 **너머**로 분산하는 경로를 제공함.
|
||||||
|
이는:
|
||||||
|
|
||||||
|
- **ADR-0024의 launcher 모델과 충돌**: ADR-0024는 "rank = SIP = 1 worker per SIP"
|
||||||
|
모델. 각 worker가 자기 SIP에 텐서를 생성. 텐서가 여러 SIP에 걸치는 경우는
|
||||||
|
Megatron-style TP가 개별 primitive로 처리해야 함.
|
||||||
|
- **사용자 의도와 불일치**: "DPPolicy는 한 디바이스 내에서 PE들로 분산하는 방법"
|
||||||
|
(사용자 진술).
|
||||||
|
- **개념 혼동**: `DPPolicy.sip="column_wise"`는 실제로 **TP**. 이름이 DP인데
|
||||||
|
하는 일은 TP → 신규 사용자에게 혼란.
|
||||||
|
|
||||||
|
### 영향받는 call site (rollback 시점 grep 결과)
|
||||||
|
|
||||||
|
**생성 사이트** (`DPPolicy(sip=...` 또는 `num_sips=...`):
|
||||||
|
- `tests/test_runtime_api_tensor.py`
|
||||||
|
- `benches/ccl_allreduce.py` (ADR-0024 scope 내에서 이미 개편됨)
|
||||||
|
- `tests/test_va_offset.py`
|
||||||
|
- `benches/va_offset_verify.py`
|
||||||
|
- `tests/test_sip_parallel.py`
|
||||||
|
|
||||||
|
**참조 사이트** (`dp.sip`, `policy.sip`, `num_sips` 등):
|
||||||
|
- `src/kernbench/runtime_api/context.py` (`_create_tensor`, `launch`)
|
||||||
|
- `src/kernbench/components/builtin/pe_cpu.py`
|
||||||
|
- `src/kernbench/components/legacy/builtin/pe_cpu.py`
|
||||||
|
- `src/kernbench/policy/placement/dp.py` (구현 자체)
|
||||||
|
- `tests/test_tensor.py`, `tests/test_ipcq_types.py`
|
||||||
|
|
||||||
|
**핵심 테스트**: `test_sip_parallel.py`는 이름 그대로 "SIP 병렬성을 DPPolicy로
|
||||||
|
표현하는" 테스트. 이 ADR 이후 **새 launcher 모델로 재작성** 필요.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. `DPPolicy`에서 `sip` + `num_sips` 필드 제거
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class DPPolicy:
|
||||||
|
"""Intra-device (cube × PE) data-parallel policy.
|
||||||
|
|
||||||
|
SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
|
||||||
|
(ADR-0024 D10) and, for model-level TP, by Megatron-style parallel
|
||||||
|
layers (ADR-0027). DPPolicy does not cross SIP boundaries.
|
||||||
|
"""
|
||||||
|
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
num_pes: int | None = None
|
||||||
|
num_cubes: int | None = None
|
||||||
|
```
|
||||||
|
|
||||||
|
제거되는 필드: `sip`, `num_sips`.
|
||||||
|
|
||||||
|
### D2. `ShardSpec` — structural (sip, cube, pe) 좌표, `pe_index` 완전 제거
|
||||||
|
|
||||||
|
현재 `ShardSpec.pe_index`는 **global flat index** (`sip × cubes × pes + cube ×
|
||||||
|
pes + pe`). 이는 ADR-0024 D11이 "abstraction leakage"로 지적한 형태.
|
||||||
|
|
||||||
|
본 ADR에서 ShardSpec을 **structural 좌표로 재정의**하고, `pe_index`는
|
||||||
|
property로도 **남기지 않는다**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/policy/placement/dp.py (after)
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ShardSpec:
|
||||||
|
"""Structural shard placement — intra-SIP (cube × PE) coord.
|
||||||
|
|
||||||
|
Global-flat `pe_index` was removed in ADR-0026. Callers must use
|
||||||
|
structural coords (sip, cube, pe) directly. If a flat integer key is
|
||||||
|
needed (e.g. dict lookup), compute it explicitly at the call site.
|
||||||
|
"""
|
||||||
|
sip: int # structural — which SIP this shard lives on
|
||||||
|
cube: int # local within SIP
|
||||||
|
pe: int # local within cube
|
||||||
|
offset_bytes: int
|
||||||
|
nbytes: int
|
||||||
|
```
|
||||||
|
|
||||||
|
**핵심 원칙**:
|
||||||
|
- ShardSpec의 정체성은 `(sip, cube, pe)` 3튜플.
|
||||||
|
- **`pe_index` property도 없음** — silent semantics drift 차단.
|
||||||
|
- Global flat을 기대한 기존 호출자는 `.pe_index` 접근 시 **즉시
|
||||||
|
`AttributeError`** → 반드시 구조적 좌표로 migration.
|
||||||
|
- Flat integer key가 필요한 국소 문맥 (예: 내부 dict lookup)은 호출자가
|
||||||
|
명시적으로 `spec.sip * N_CUBES * N_PE + spec.cube * N_PE + spec.pe`를 계산.
|
||||||
|
|
||||||
|
**Property 제거 정당화**: KernBench는 사내 프로젝트로 call site가 한정되어
|
||||||
|
있음. Silent drift 위험 (의미만 바뀌고 타입은 같은 int) 대비 explicit breakage
|
||||||
|
(AttributeError)가 훨씬 안전.
|
||||||
|
|
||||||
|
### D3. `resolve_dp_policy`가 `target_sip`을 받아 structural 좌표 생성
|
||||||
|
|
||||||
|
ADR-0024 D11의 계약 구현. Post-hoc shifting 없음.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/policy/placement/dp.py (after)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _LocalPeShard:
|
||||||
|
"""Internal — PE resolver의 반환. Cube 내 local PE 식별자 + payload."""
|
||||||
|
local_pe: int # cube-local PE index (0..num_pe-1)
|
||||||
|
offset_bytes: int
|
||||||
|
nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_dp_policy(
|
||||||
|
policy: DPPolicy,
|
||||||
|
*,
|
||||||
|
shape: tuple[int, int],
|
||||||
|
itemsize: int,
|
||||||
|
num_pe: int,
|
||||||
|
num_cubes: int = 1,
|
||||||
|
target_sip: int, # NEW — 어느 SIP에 배치할지 명시
|
||||||
|
) -> list[ShardSpec]:
|
||||||
|
"""2-level resolution (cube × PE) on a specified SIP.
|
||||||
|
|
||||||
|
Returns ShardSpecs with structural coords (sip=target_sip, cube, pe).
|
||||||
|
No SIP-level split — DPPolicy is intra-device only.
|
||||||
|
"""
|
||||||
|
resolver = _PE_RESOLVERS[policy.pe]
|
||||||
|
all_shards: list[ShardSpec] = []
|
||||||
|
|
||||||
|
# Level 1: cube within SIP
|
||||||
|
cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
|
||||||
|
|
||||||
|
for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
|
||||||
|
# Level 2: PE within cube — resolver returns _LocalPeShard (local_pe)
|
||||||
|
local_shards = resolver(shape=cube_shape, itemsize=itemsize,
|
||||||
|
num_pe=num_pe)
|
||||||
|
|
||||||
|
for ls in local_shards:
|
||||||
|
all_shards.append(ShardSpec(
|
||||||
|
sip=target_sip, # from caller (current_device)
|
||||||
|
cube=cube_id, # local within SIP
|
||||||
|
pe=ls.local_pe, # local within cube (explicit name)
|
||||||
|
offset_bytes=cube_offset + ls.offset_bytes,
|
||||||
|
nbytes=ls.nbytes,
|
||||||
|
))
|
||||||
|
|
||||||
|
return all_shards
|
||||||
|
```
|
||||||
|
|
||||||
|
**내부 resolver** (`column_wise`, `row_wise`, `replicate`)는 `_LocalPeShard`
|
||||||
|
리스트 반환 — `local_pe` 필드명으로 **"cube-local PE identifier"임이 명시적**.
|
||||||
|
과거 `ShardSpec.pe_index`와 이름이 혼동되던 문제 해소.
|
||||||
|
|
||||||
|
**이름 규약 정리** (전체 ADR):
|
||||||
|
- `ShardSpec.pe`: 최종 외부 API — cube-local PE (structural coord)
|
||||||
|
- `_LocalPeShard.local_pe`: 내부 resolver 단계의 동일 의미
|
||||||
|
- `pe_index`: **제거**. 외부/내부 어디에도 남기지 않는다 (silent drift 차단의
|
||||||
|
부가 효과: 이름 재등장 없음).
|
||||||
|
|
||||||
|
### D4. `_create_tensor` — 구조적 좌표로 직접 placement
|
||||||
|
|
||||||
|
ADR-0024 D11 연속선. Post-hoc shifting 제거, 구조적 좌표를 `resolve_dp_policy`
|
||||||
|
호출 시점에 직접 지정.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# context.py _create_tensor (after)
|
||||||
|
current_sip = self.ahbm.current_device()
|
||||||
|
if current_sip is None:
|
||||||
|
# Single-driver fallback (ADR-0024 D9와 일관).
|
||||||
|
# Launcher 기반 코드가 set_device()를 빼먹으면 조용히 SIP 0에 박히는
|
||||||
|
# 문제가 있음 → debug mode에서 경고.
|
||||||
|
if os.environ.get("KERNBENCH_DEBUG"):
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
"torch.ahbm.current_device() is None; defaulting to SIP 0. "
|
||||||
|
"If this is a multi-rank launcher context, you likely forgot "
|
||||||
|
"torch.ahbm.set_device(rank) inside the worker.",
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
current_sip = 0
|
||||||
|
|
||||||
|
placement = resolve_dp_policy(
|
||||||
|
dp,
|
||||||
|
shape=shape_2d,
|
||||||
|
itemsize=itemsize,
|
||||||
|
num_pe=eff_num_pe,
|
||||||
|
num_cubes=eff_num_cubes,
|
||||||
|
target_sip=current_sip, # ← 구조적 좌표 일차 지정
|
||||||
|
)
|
||||||
|
|
||||||
|
# placement의 각 ShardSpec은 이미 (sip=current_sip, cube=local, pe=local) 포함.
|
||||||
|
# 과거의 post-hoc shifting 블록은 완전히 제거.
|
||||||
|
```
|
||||||
|
|
||||||
|
**모든** 텐서가 current device SIP에 배치됨. Multi-SIP 텐서를 만들고 싶으면
|
||||||
|
ADR-0027의 TP primitive 사용.
|
||||||
|
|
||||||
|
**Single-driver fallback의 trade-off**: set_device 없는 호출에서 SIP 0으로
|
||||||
|
default는 기존 single-driver 테스트 호환을 위해 유지. `KERNBENCH_DEBUG=1`
|
||||||
|
환경에서는 launcher 컨텍스트의 실수로 set_device 누락 시 조용히 잘못된 SIP에
|
||||||
|
배치되는 것을 감지할 수 있도록 warning.
|
||||||
|
|
||||||
|
### D5. Downstream — allocator lookup은 구조적 tuple key로
|
||||||
|
|
||||||
|
기존 `deploy_tensor` (`src/kernbench/runtime_api/tensor.py`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
for spec in placement:
|
||||||
|
alloc = allocators[spec.pe_index] # ← AttributeError (property 제거됨)
|
||||||
|
```
|
||||||
|
|
||||||
|
`pe_index`가 없어졌으므로 구조적 좌표로 **강제** migration:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for spec in placement:
|
||||||
|
alloc = allocators[(spec.sip, spec.cube, spec.pe)]
|
||||||
|
```
|
||||||
|
|
||||||
|
`_ensure_allocators`의 dict population도 tuple key로:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# context.py _ensure_allocators (after)
|
||||||
|
for sip_id in sip_range:
|
||||||
|
for cube_id in range(cubes_per_sip):
|
||||||
|
for pe_id in range(pes_per_cube):
|
||||||
|
self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
|
||||||
|
rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`_free_tensor`도 동일: 기존 `flat_idx = sip * ... + cube * ... + pe` 계산
|
||||||
|
블록 제거, `(shard.sip, shard.cube, shard.pe)` 직접 사용.
|
||||||
|
|
||||||
|
**Tuple vs dataclass `PEIdentity`**: Tuple이 단순하고 hashable로 바로 써서
|
||||||
|
권고. `PEIdentity` 값객체는 명시적 타입 장점은 있지만 boilerplate가 크고 현재
|
||||||
|
allocator dict의 유일한 key라 오버엔지니어링. Tuple 유지.
|
||||||
|
|
||||||
|
### D6. Migration — 기존 call site
|
||||||
|
|
||||||
|
**(A) `DPPolicy(sip=..., num_sips=..., ...)` 사용하던 코드**:
|
||||||
|
|
||||||
|
- `DPPolicy(sip="column_wise", cube=..., pe=...)` 패턴 → **해당 bench를 ADR-0024
|
||||||
|
launcher로 재작성**. worker가 `set_device(rank)`로 SIP 선택, DPPolicy는
|
||||||
|
cube/PE만.
|
||||||
|
- `DPPolicy(sip="replicate", num_sips=1, ...)` 패턴 → `DPPolicy(cube=..., pe=...)`로
|
||||||
|
축소 (필드가 사라지니 자연스럽게).
|
||||||
|
|
||||||
|
**(B) `dp.sip`, `dp.num_sips` 읽던 코드**:
|
||||||
|
|
||||||
|
- 제거. `launch()`의 `_compute_local_shape`에서 `dp.sip` 분기 삭제.
|
||||||
|
- `pe_cpu.py`가 `dp.sip`을 참조하던 곳도 정리.
|
||||||
|
|
||||||
|
**(C) `ShardSpec.pe_index`를 사용하던 코드 — 전부 수정 필요**:
|
||||||
|
|
||||||
|
- `.pe_index` 접근은 이제 `AttributeError` 발생 → 모든 call site 수정 필수.
|
||||||
|
- Allocator lookup: `allocators[spec.pe_index]` →
|
||||||
|
`allocators[(spec.sip, spec.cube, spec.pe)]`
|
||||||
|
- Flat integer가 꼭 필요한 국소 문맥: `spec.sip * N_CUBES * N_PE + spec.cube *
|
||||||
|
N_PE + spec.pe` 명시적 계산. **국소 변수로만 사용하고 공개 API에 노출하지
|
||||||
|
않는다**.
|
||||||
|
|
||||||
|
**구현 착수 전 grep audit 체크리스트**:
|
||||||
|
|
||||||
|
1. **Property 참조**:
|
||||||
|
- `\.pe_index\b` — 필드/property 접근 모두 (regex)
|
||||||
|
- `pe_index=` — 생성 시점의 키워드 인자
|
||||||
|
- `pe_index:` — dataclass 필드 선언
|
||||||
|
2. **Allocator / dict indexing**:
|
||||||
|
- `allocators\[` — dict lookup 패턴. `allocators[spec.pe_index]` 같은
|
||||||
|
것이 걸리는지
|
||||||
|
- `_allocators\[` — 같은 패턴 (prefix _)
|
||||||
|
3. **Flat index 수동 계산 블록**:
|
||||||
|
- `flat_idx =`
|
||||||
|
- `pe_index =` (좌변)
|
||||||
|
- `* pes_per_cube +` (전형적 flat 계산 패턴)
|
||||||
|
- `* self._num_cubes * self._pes_per_cube` (global flat 계산)
|
||||||
|
4. **Serialization / logging**:
|
||||||
|
- `asdict(.*shard` — dataclass 직렬화 시 `pe_index` 자동 포함 여부
|
||||||
|
- `repr(.*ShardSpec` — 로그 포맷에서 의존하는지
|
||||||
|
- JSON/YAML 저장 포맷에서 `pe_index` 키 사용 여부
|
||||||
|
5. **Tests asserting integer PE identity**:
|
||||||
|
- `assert .*pe_index` — 정수 동일성 주장
|
||||||
|
- `spec.pe_index ==` — 비교 (`pe_index` 제거 후에는 `AttributeError`로 테스트가 깨지므로 구조적 좌표 비교로 교체)
|
||||||
|
|
||||||
|
각 match마다 "이 호출자가 global flat / SIP-local / 내부 lookup 중 무엇을
|
||||||
|
기대했나"를 판단한 뒤 구조적 좌표로 교체.
|
||||||
|
|
||||||
|
**(D) `test_sip_parallel.py`**:
|
||||||
|
|
||||||
|
- 이름 유지, 내용은 ADR-0024의 multi-greenlet launcher 기반 재작성.
|
||||||
|
- "SIP 병렬성 = rank 별 worker × 각자 DPPolicy" 로 검증.
|
||||||
|
|
||||||
|
**(E) `test_va_offset.py`, `benches/va_offset_verify.py`**:
|
||||||
|
|
||||||
|
- `num_sips=1`만 쓰는 경우가 대부분. 단순히 필드 제거.
|
||||||
|
- SIP offset 테스트가 핵심이면 `set_device(rank)` + 구조적 좌표 관찰로 이식.
|
||||||
|
|
||||||
|
### D7. 하위 호환 — 불가 (cleanup ADR)
|
||||||
|
|
||||||
|
이 ADR은 **breaking change**.
|
||||||
|
|
||||||
|
1. `DPPolicy(sip=...)` 또는 `DPPolicy(num_sips=...)` 호출 → `TypeError`
|
||||||
|
2. `ShardSpec.pe_index` 접근 → `AttributeError`
|
||||||
|
|
||||||
|
모두 **즉시 명시적 breakage**. Deprecation warning / fallback 경로 없음.
|
||||||
|
KernBench는 사내 프로젝트로 call site가 한정되어 있어 한 번에 migration.
|
||||||
|
|
||||||
|
**Silent drift 차단**이 property 완전 제거의 주된 이점: global flat을 기대한
|
||||||
|
코드가 SIP-local 결과를 받아 조용히 잘못된 인덱싱을 할 가능성 제거.
|
||||||
|
|
||||||
|
### D8. 문서 업데이트
|
||||||
|
|
||||||
|
- `ADR-0008` (tensor deploy) — DPPolicy 의미 갱신 note, ShardSpec 구조적 좌표
|
||||||
|
전환 명시
|
||||||
|
- DPPolicy docstring에 "intra-device only" 명시 (D1 코드 스니펫의 docstring)
|
||||||
|
- ShardSpec docstring에 **structural coordinates `(sip, cube, pe)`를 직접
|
||||||
|
사용하며, `pe_index`는 더 이상 제공되지 않음**을 명시 (D2)
|
||||||
|
- `docs/ccl-author-guide` 등 튜토리얼에서 `sip=...` 예시 제거
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0024** (launcher): `set_device(rank)` 및 current-device scoping이
|
||||||
|
SIP 배치 메커니즘 제공. 본 ADR은 그 위에 서서 DPPolicy를 순수 intra-device로
|
||||||
|
좁힘.
|
||||||
|
- **ADR-0027** (Megatron TP): 다중 SIP에 걸친 텐서가 필요한 경우의 대안 경로.
|
||||||
|
이 ADR 적용 후 multi-SIP use case는 ADR-0027로 이관.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **`DPPolicy.cube` / `pe` 재설계**: 기존 replicate/column_wise/row_wise 의미
|
||||||
|
유지.
|
||||||
|
- **Tiling 정책 통합**: `tiled_column_major` / `tiled_row_major`는 그대로.
|
||||||
|
- **Multi-device 텐서 추상화 신규**: DTensor-like는 ADR-0028.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **`_create_tensor`의 current_sip 기본값**: set_device 없는 호출에서 rank=0
|
||||||
|
(SIP 0)로 fallback할지, 아니면 error 낼지. 권고는 fallback (기존 single-driver
|
||||||
|
테스트와의 호환).
|
||||||
|
- **`test_sip_parallel.py` 재작성 범위**: 기존 단위 테스트의 의도를 유지하며
|
||||||
|
launcher 기반으로 옮기려면 추가 fixture 필요. 별도 작업으로 scope.
|
||||||
|
- **`DPPolicy`의 `num_sips=None` 의미**: 필드가 없어지면 `num_sips` 개념 자체가
|
||||||
|
사라짐. Multi-SIP을 표현하고 싶으면 ADR-0027의 TP primitive를 쓰라는 것이
|
||||||
|
명시적 답.
|
||||||
|
|
||||||
|
**Resolved (이전 rev에서 open이었던 것들)**:
|
||||||
|
- ~~`ShardSpec.pe_index` property 존치 여부~~ → **완전 제거** (D2)
|
||||||
|
- ~~`_ensure_allocators` dict key 형식~~ → **tuple `(sip, cube, pe)`** (D5)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. 단위 테스트 갱신
|
||||||
|
|
||||||
|
- `tests/test_tensor.py`, `tests/test_ipcq_types.py`, `tests/test_runtime_api_tensor.py`
|
||||||
|
— DPPolicy 생성자 인자 정리, ShardSpec 구조적 좌표 검증
|
||||||
|
- `tests/test_va_offset.py` — `num_sips=1` 제거 후 동작 유지
|
||||||
|
|
||||||
|
### T2. `resolve_dp_policy` 구조적 좌표 반환
|
||||||
|
|
||||||
|
`tests/test_dp_policy.py` (new 또는 확장):
|
||||||
|
- `resolve_dp_policy(dp, ..., target_sip=1)` 결과의 모든 ShardSpec이 `sip=1`
|
||||||
|
- 각 spec의 `(cube, pe)`가 local (0..num_cubes-1, 0..num_pe-1)
|
||||||
|
- 같은 topology에서 `target_sip=0`과 `target_sip=1` 결과가 sip 필드만 다름
|
||||||
|
|
||||||
|
### T3. `test_sip_parallel.py` 재작성
|
||||||
|
|
||||||
|
SIP 병렬성 검증을 launcher 기반으로:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_sip_parallel_via_launcher(topology):
|
||||||
|
...
|
||||||
|
def worker(rank, ws, torch):
|
||||||
|
torch.ahbm.set_device(rank)
|
||||||
|
t = torch.zeros((1, 128), dtype="f16",
|
||||||
|
dp=DPPolicy(cube="column_wise", pe="column_wise"))
|
||||||
|
# verify shard.sip == rank (structural coord)
|
||||||
|
|
||||||
|
spawn(worker, nprocs=n_sips, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
### T4. Allocator key migration
|
||||||
|
|
||||||
|
`tests/test_allocator_structural_key.py` (new 또는 기존 확장):
|
||||||
|
- `PEMemAllocator` dict이 `(sip, cube, pe)` tuple key로 작동
|
||||||
|
- `deploy_tensor`가 구조적 좌표로 allocator lookup
|
||||||
|
- `_free_tensor`도 동일
|
||||||
|
|
||||||
|
### T5. E2E 회귀
|
||||||
|
|
||||||
|
ADR-0024의 `test_ccl_allreduce_matrix.py` 그대로 통과.
|
||||||
|
|
||||||
|
### T6. 오류 검증
|
||||||
|
|
||||||
|
- `DPPolicy(sip="column_wise")` 호출 → `TypeError`. 테스트로 명시.
|
||||||
|
- `DPPolicy(num_sips=2)` 호출 → `TypeError`.
|
||||||
|
- `spec.pe_index` 접근 → `AttributeError` (property 완전 제거 검증).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **개념 분리 명확**: DPPolicy = intra-device, TP = inter-device.
|
||||||
|
- **API 단순화**: DPPolicy 생성자 필드 ~33% 축소.
|
||||||
|
- **Structural 좌표 일관성**: ShardSpec이 `(sip, cube, pe)` 튜플로 표현 →
|
||||||
|
abstraction leakage 해소 (ADR-0024 D11 계약 충족).
|
||||||
|
- **`pe_index` 제거로 의미 혼동 해소**: 구조적 좌표 `(sip, cube, pe)`가 단일 해석. Global flat이 필요하면 호출자가 명시적으로 계산.
|
||||||
|
- **Launcher 모델 일관성**: ADR-0024의 "1 worker per SIP" 모델이 유일한 SIP
|
||||||
|
경계 제어 메커니즘.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- **Breaking change (explicit)**: `DPPolicy(sip=...)` → `TypeError`,
|
||||||
|
`spec.pe_index` → `AttributeError`. 모든 호출자 한 번에 수정 필요.
|
||||||
|
- **ShardSpec schema 변경**: `pe_index` 단일 필드 → `sip`/`cube`/`pe` 세 필드.
|
||||||
|
Downstream (`deploy_tensor`, `_free_tensor`, `_ensure_allocators`,
|
||||||
|
`allocators` dict key 등) 연쇄 수정.
|
||||||
|
- **Silent drift 없음**: property 완전 제거로 runtime에서 즉시 실패 →
|
||||||
|
migration leakage 원천 차단. (Negative가 아니라 explicit tradeoff)
|
||||||
|
- `test_sip_parallel.py` 재작성 비용.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- 기존 `cube` / `pe` 필드 의미 불변.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/policy/placement/dp.py` | D1: `sip`/`num_sips` 제거 / D2: `ShardSpec`에 `sip`/`cube`/`pe` structural fields 추가, **`pe_index` property 제거** / D3: `resolve_dp_policy`에 `target_sip`, SIP-level 루프 제거 / 내부 resolver의 반환 타입을 `_LocalPeShard`(필드 `local_pe`)로 명확화 (이름 충돌 방지) |
|
||||||
|
| `src/kernbench/runtime_api/context.py` | D4: `_create_tensor` `target_sip` 전달 / D5: `_ensure_allocators` dict key → `(sip, cube, pe)` tuple / `launch`의 `dp.sip` 분기 제거 |
|
||||||
|
| `src/kernbench/runtime_api/tensor.py` | D5: `deploy_tensor`가 구조적 좌표로 allocator lookup |
|
||||||
|
| `src/kernbench/components/builtin/pe_cpu.py` | D6: `dp.sip` 참조 제거 |
|
||||||
|
| `src/kernbench/components/legacy/builtin/pe_cpu.py` | D6: 동일 |
|
||||||
|
| `benches/ccl_allreduce.py` | ADR-0024 scope에서 이미 처리 |
|
||||||
|
| `benches/va_offset_verify.py` | D6: `num_sips=1` 제거 |
|
||||||
|
| `tests/test_runtime_api_tensor.py` | D6 |
|
||||||
|
| `tests/test_va_offset.py` | D6 |
|
||||||
|
| `tests/test_tensor.py`, `tests/test_ipcq_types.py` | D6 |
|
||||||
|
| `tests/test_sip_parallel.py` | T3: launcher 기반 재작성 |
|
||||||
|
| `tests/test_dp_policy.py` (new 또는 확장) | T2 |
|
||||||
|
| `tests/test_allocator_structural_key.py` (new) | T4 |
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,171 @@
|
|||||||
|
# ADR-0028: DTensor Support — 선언적 분산 텐서 (Stub / Future)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Stub (Future Work)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
**선언적 분산 텐서 추상화**(PyTorch 2.x `DTensor` 스타일)를 KernBench에
|
||||||
|
도입하기 위한 **디자인 공간 preliminary exploration**. 본 ADR은 **구현 계획이
|
||||||
|
아닌 future 작업의 파일 플레이스홀더 + 초기 질문 목록**이다.
|
||||||
|
|
||||||
|
### Megatron-style TP와의 차이 (Why DTensor)
|
||||||
|
|
||||||
|
| 관점 | Megatron (ADR-0027) | DTensor (이 ADR) |
|
||||||
|
|---|---|---|
|
||||||
|
| 표현 | 명시적 parallel layer | 텐서 + placement spec |
|
||||||
|
| 호출 형태 | `ColumnParallelLinear(...)` | `distribute_tensor(x, mesh, [Shard(1)])` |
|
||||||
|
| Collective 삽입 | 레이어 내부 명시 | 연산 dispatch가 자동 |
|
||||||
|
| Learning curve | 낮음 (명시적) | 중~높음 (선언적 의미 이해) |
|
||||||
|
| 유연성 | 레이어 단위로 고정 | 레이어 경계 무관, 어디서나 |
|
||||||
|
| KernBench에 선행 필요한 것 | launcher (ADR-0024) + TP (ADR-0027) | 좌측 항목 전부 + operator dispatch overhaul |
|
||||||
|
|
||||||
|
DTensor는 operator-level에서 "텐서의 placement를 보고 자동으로 collective
|
||||||
|
삽입". KernBench가 이를 지원하려면 **operator dispatch layer에 placement-aware
|
||||||
|
rewriting**이 들어가야 한다. 이는 비-trivial.
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
- KernBench는 operator dispatch 레이어가 없음 (`torch.matmul`은 없음; kernel
|
||||||
|
launch로 대체).
|
||||||
|
- DPPolicy는 정적 placement metadata를 보유 (ADR-0026 후: intra-device only).
|
||||||
|
- ADR-0024 launcher가 rank / device 개념 제공.
|
||||||
|
- Megatron-style TP (ADR-0027)가 명시적 대안으로 기능할 것.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Preliminary decision space
|
||||||
|
|
||||||
|
### DQ1. PyTorch DTensor API 수용 범위
|
||||||
|
|
||||||
|
- `DeviceMesh`: rank들의 논리적 grid.
|
||||||
|
- `Placements`: `Shard(dim)`, `Replicate()`, `Partial(reduce_op)`.
|
||||||
|
- `distribute_tensor(tensor, device_mesh, placements)`: local tensor → DTensor.
|
||||||
|
- Redistribute: `dt.redistribute(new_placements)`로 collective 자동 삽입.
|
||||||
|
- Operator forward: `dt @ dt`, `dt + dt` 등 → 적절한 collective 자동 dispatch.
|
||||||
|
|
||||||
|
KernBench가 어느 수준까지 지원할지 결정 필요. 최소: `distribute_tensor` +
|
||||||
|
`redistribute`. 최대: 모든 operator overloading.
|
||||||
|
|
||||||
|
### DQ2. Operator dispatch 레이어
|
||||||
|
|
||||||
|
KernBench에서 `dt @ dt`를 정의하려면 Tensor의 `__matmul__`이 placement를
|
||||||
|
보고 적절한 action 수행:
|
||||||
|
|
||||||
|
- 둘 다 replicated → local matmul
|
||||||
|
- A column-sharded, B row-sharded → local matmul + all-reduce (RowParallel)
|
||||||
|
- A replicated, B column-sharded → local matmul (ColumnParallel)
|
||||||
|
- etc.
|
||||||
|
|
||||||
|
이는 Megatron-style의 **자동화된 버전**. Kernel은 기존 matmul kernel 사용.
|
||||||
|
|
||||||
|
### DQ3. DeviceMesh와 기존 topology
|
||||||
|
|
||||||
|
KernBench topology는 이미 SIP/cube/PE 계층. DTensor의 DeviceMesh는 추상
|
||||||
|
`(tp_size, dp_size, ...)` grid. 매핑:
|
||||||
|
|
||||||
|
- 1D mesh of size = SIP count → rank = SIP
|
||||||
|
- 2D mesh (tp × dp) → SIP을 그룹 분할 (pure TP 대신 mixed parallelism)
|
||||||
|
|
||||||
|
초기엔 1D mesh만, DP × TP 2D는 future.
|
||||||
|
|
||||||
|
### DQ4. Placement의 intra-device (DP) 통합
|
||||||
|
|
||||||
|
KernBench 특이점: 한 rank 내부에서 DPPolicy로 cube/PE에 분산. DTensor는
|
||||||
|
device 내부를 보지 않음. 통합:
|
||||||
|
|
||||||
|
- DTensor placement = rank (SIP) 간 분산
|
||||||
|
- 각 rank의 local tensor는 여전히 DPPolicy로 cube/PE 배치
|
||||||
|
- → DTensor wrapper가 local tensor의 DPPolicy도 보관
|
||||||
|
|
||||||
|
### DQ5. Collective 자동 삽입 지점
|
||||||
|
|
||||||
|
`redistribute` 또는 operator forward 시. ADR-0024의 submit+yield+wait 패턴을
|
||||||
|
자동으로 호출하는 형태. `_launch_submit` 내부화.
|
||||||
|
|
||||||
|
### DQ6. Autograd
|
||||||
|
|
||||||
|
DTensor는 autograd와 상호작용 (backward에서 reverse collective). KernBench가
|
||||||
|
backward 지원하기 전까지는 **forward-only DTensor**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions (to resolve before real design)
|
||||||
|
|
||||||
|
1. **우선순위**: Megatron-style(ADR-0027)이 먼저 안착한 후 DTensor를 위에
|
||||||
|
얹는가, 아니면 공통 lower-layer를 먼저 설계하는가?
|
||||||
|
2. **호환성 목표**: PyTorch DTensor API와 몇 %까지 일치시키는가? 독자 API vs
|
||||||
|
거의 동일?
|
||||||
|
3. **Operator dispatch**: KernBench `Tensor` 클래스에 `__matmul__` 등 연산자
|
||||||
|
overloading을 도입하는가? (현재는 kernel launch만)
|
||||||
|
4. **Redistribute 정책**: `Shard(0) → Replicate()` 변환 시 어떤 collective
|
||||||
|
사용? `all_gather`가 없으면 구현 전까지 제약.
|
||||||
|
5. **Mesh × DPPolicy interaction**: 하나의 DTensor가 2개 layer 분산을 갖는
|
||||||
|
경우의 metadata 표현.
|
||||||
|
6. **Partial placement의 reduce 시점**: 자동 vs 명시 `redistribute` 호출.
|
||||||
|
7. **Bench authoring impact**: 기존 Megatron-style bench가 DTensor 기반으로
|
||||||
|
얼마나 쉽게 포팅되는가?
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals (for future real ADR)
|
||||||
|
|
||||||
|
- 이번 stub에서 API 확정. Future ADR에서 구체화.
|
||||||
|
- Implementation timeline. 이번 round에서는 **설계 공간 매핑만**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies (potential)
|
||||||
|
|
||||||
|
- **ADR-0024** (launcher): rank / device 기반
|
||||||
|
- **ADR-0026** (DPPolicy cleanup): DTensor placement와의 분리 명확화
|
||||||
|
- **ADR-0027** (Megatron TP): 실용 TP 패턴 경험을 DTensor 설계로 환류
|
||||||
|
- **Future ADR** (operator dispatch layer): KernBench Tensor에 operator
|
||||||
|
overloading 도입
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Expected consequences (hypothetical)
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- PyTorch training code 이식이 **매우 쉬워짐** (DTensor 코드 그대로).
|
||||||
|
- TP + DP + 더 복잡한 parallelism을 **하나의 추상화**로 표현.
|
||||||
|
- Collective 삽입이 자동 → bench 작성자 부담 감소.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- Operator dispatch layer 신규 구축 → 상당한 엔지니어링.
|
||||||
|
- Implicit behavior 증가 → 디버깅 / 성능 분석 복잡.
|
||||||
|
- KernBench의 "명시적 kernel launch" 철학과 tension.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Action
|
||||||
|
|
||||||
|
- **Phase 1 (현재)**: 본 stub 유지. Megatron-style (ADR-0027) 먼저 구현 +
|
||||||
|
사용 경험 축적.
|
||||||
|
- **Phase 2 (future)**: 사용 경험을 바탕으로 본 ADR을 real design으로 승격.
|
||||||
|
위 Open questions에 대한 답을 제시.
|
||||||
|
- **Phase 3 (future)**: Implementation.
|
||||||
|
|
||||||
|
현재 구현 작업은 **없음**. 디자인 공간 매핑만.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
본 ADR은 **stub**이므로 production 변경 없음. Future real ADR에서 갱신될
|
||||||
|
파일 후보:
|
||||||
|
|
||||||
|
| File | 예상 변경 (future) |
|
||||||
|
|------|---|
|
||||||
|
| `src/kernbench/dtensor/__init__.py` | 신규 패키지 |
|
||||||
|
| `src/kernbench/dtensor/device_mesh.py` | DeviceMesh |
|
||||||
|
| `src/kernbench/dtensor/placements.py` | Shard/Replicate/Partial |
|
||||||
|
| `src/kernbench/dtensor/api.py` | distribute_tensor, redistribute |
|
||||||
|
| `src/kernbench/dtensor/ops/*.py` | Operator dispatch (matmul 등) |
|
||||||
|
| `src/kernbench/runtime_api/tensor.py` | Tensor에 `__matmul__` 등 추가 |
|
||||||
@@ -0,0 +1,419 @@
|
|||||||
|
# ADR-0029: Hierarchical All-Reduce — 3-level intra/inter-SIP 알고리즘
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
"Rank = SIP" 모델 (ADR-0024) 위에서 각 SIP 내부의 모든 PE를 참여시키는
|
||||||
|
**3-level 계층 all-reduce** 알고리즘을 정의한다. 각 레벨이 서로 다른 물리
|
||||||
|
연결(intra-cube ring, inter-cube NoC, inter-SIP UCIe)을 활용해 대역폭을
|
||||||
|
극대화한다.
|
||||||
|
|
||||||
|
### 왜 hierarchical인가
|
||||||
|
|
||||||
|
단순 ring/mesh/tree all-reduce는 SIP당 1 PE만 참여 (ADR-0024의 `leader_only`
|
||||||
|
mapper). 이는 inter-SIP 단계는 잘 모델링하지만:
|
||||||
|
|
||||||
|
- **Intra-SIP PE가 노는 시간이 발생**. Leader PE가 inter-SIP 통신 중이면
|
||||||
|
나머지 7 PE / 16 cube는 유휴.
|
||||||
|
- **Intra-cube/inter-cube 연결 대역폭 미활용**. Cube NoC는 매우 빠르지만
|
||||||
|
단일 leader 사용 시 이 자원이 노출되지 않음.
|
||||||
|
- **실제 NCCL 등은 hierarchical**: NVLink(intra-node) + InfiniBand(inter-node)
|
||||||
|
의 bandwidth 차이를 활용. KernBench 토폴로지도 동일 구조
|
||||||
|
(intra-cube / inter-cube / inter-SIP의 bandwidth·latency 차이).
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
- `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` 이미 존재
|
||||||
|
(git log `10b33b4` — "Tensor indexing + hierarchical 3-level all-reduce
|
||||||
|
kernel"). PE-level로 world_size = total PE를 가정하는 옛 모델 기반 구현.
|
||||||
|
- ADR-0024에 의해 launcher는 rank = SIP로 바뀜.
|
||||||
|
- Hierarchical 커널은 **재해석 필요**: 이제 각 worker(1 per SIP)가 자기 SIP의
|
||||||
|
모든 PE를 참여시키고, kernel은 intra-cube → inter-cube → inter-SIP 순으로
|
||||||
|
3-level reduce + broadcast.
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **ADR-0024 framework 위에 hierarchical 알고리즘 맞추기**
|
||||||
|
- Mapper: `all_pes` (ADR-0024 D5 제공)
|
||||||
|
- Validator: `multi_pe_sip_local` (ADR-0024 D8 제공)
|
||||||
|
- Kernel: 기존 `hierarchical_allreduce.py` 수정 — rank 계산 방식을 SIP 내
|
||||||
|
local (cube, pe)로 바꿈
|
||||||
|
2. **PE-level neighbor graph 생성**
|
||||||
|
- Intra-cube: `(sip, cube, pe) ↔ (sip, cube, pe±1 mod N_PE)` (ring 내부)
|
||||||
|
- Inter-cube: `(sip, cube, 0) ↔ (sip, cube±1 mod N_CUBE, 0)` (cube leader만)
|
||||||
|
- Inter-SIP: `(sip, 0, 0) ↔ (sip±1 mod N_SIP, 0, 0)` (SIP leader만)
|
||||||
|
3. **Tensor layout**: 각 PE가 1 tile을 소유하고 시작 (`multi_pe_sip_local`
|
||||||
|
validator가 이 layout 강제). DPPolicy(cube="column_wise",
|
||||||
|
pe="column_wise")로 달성 가능.
|
||||||
|
4. **PE-level topology 표현 부족** (ADR-0024 D6의 "책임 분산" 이슈 구체화)
|
||||||
|
- Ring/mesh/tree 같은 단순 패턴은 rank-level topology_fn + mapper 조합으로
|
||||||
|
충분.
|
||||||
|
- Hierarchical은 레벨마다 다른 peer 매핑이라 `_build_pe_installs`에서
|
||||||
|
multi-level 해석을 해야 함.
|
||||||
|
- 장기적으로는 topology 모듈이 PE-level을 직접 표현하는 편이 명시적.
|
||||||
|
|
||||||
|
### Non-problem (이 ADR 밖)
|
||||||
|
|
||||||
|
- Launcher / barrier / rank-to-SIP / mapper-validator registry → ADR-0024
|
||||||
|
- IPCQ direction addressing → ADR-0025
|
||||||
|
- DPPolicy 필드 정리 → ADR-0026
|
||||||
|
- Megatron TP → ADR-0027
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. 알고리즘 구조 — 3-level reduce + 역순 broadcast
|
||||||
|
|
||||||
|
```
|
||||||
|
Level 1 (intra-cube, E/W ring):
|
||||||
|
각 cube의 N_PE개 PE가 bidirectional ring reduce → cube 내 PE 0에 부분합 집중
|
||||||
|
Level 2 (inter-cube within SIP, N/S ring, PE 0만 참여):
|
||||||
|
N_CUBE개 cube-leader가 bidirectional ring reduce → SIP 내 (cube 0, PE 0)에
|
||||||
|
SIP 전체 부분합 집중
|
||||||
|
Level 3 (inter-SIP, N_SIP peers, (cube 0, PE 0)만 참여):
|
||||||
|
Ring 또는 pair exchange로 전역 합산 완료
|
||||||
|
Broadcast:
|
||||||
|
역순 — Level 3 결과를 (cube 0, PE 0)에서 SIP 내 모든 cube-leader로, 다시
|
||||||
|
각 cube 내 모든 PE로 전파
|
||||||
|
```
|
||||||
|
|
||||||
|
세부는 기존 `hierarchical_allreduce.py`의 커널 구현과 일치. ADR-0024 이후
|
||||||
|
변경점은 **rank 계산 방식**과 **n_elem 해석**뿐:
|
||||||
|
|
||||||
|
- 기존 (rank=PE 모델): `rank = cube_id * pes_per_cube + local_pe`, `pe_addr =
|
||||||
|
t_ptr + rank * nbytes`
|
||||||
|
- 신규 (rank=SIP 모델): 커널은 SIP-local 좌표 `(cube_id, local_pe)`로만 동작.
|
||||||
|
텐서의 per-PE slice는 backend가 per-PE `TensorArg`로 전달 (ADR-0024 D3).
|
||||||
|
커널 내부 rank 계산 자체가 불필요해짐 — `tl.program_id(0/1)`로 충분.
|
||||||
|
|
||||||
|
### D2. Framework integration — ADR-0024 infrastructure 재활용
|
||||||
|
|
||||||
|
`ccl.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
algorithms:
|
||||||
|
hierarchical_allreduce:
|
||||||
|
module: kernbench.ccl.algorithms.hierarchical_allreduce
|
||||||
|
topology: hierarchical_3level # NEW — D3 참고
|
||||||
|
mapper: all_pes # ADR-0024 D5 built-in
|
||||||
|
validator: multi_pe_sip_local # ADR-0024 D8 built-in
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 128
|
||||||
|
```
|
||||||
|
|
||||||
|
Framework 관점에서 hierarchical은 **특별한 알고리즘이 아니라, 특정
|
||||||
|
topology / mapper / validator 조합**. 본 ADR은 그 조합과 topology 패턴을
|
||||||
|
정의.
|
||||||
|
|
||||||
|
### D3. `hierarchical_3level` topology (신규)
|
||||||
|
|
||||||
|
`kernbench/ccl/topologies.py`에 신규 추가:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def hierarchical_3level(rank: int, world_size: int, spec: dict) -> dict:
|
||||||
|
"""3-level hierarchical neighbor pattern.
|
||||||
|
|
||||||
|
Returns a nested structure describing intra-cube + inter-cube + inter-SIP
|
||||||
|
neighbors. Unlike ring_1d / mesh_2d which are rank → {dir: peer_rank},
|
||||||
|
hierarchical is PE-level and requires spec for cube_mesh / pe_layout.
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
반환 스키마 (초안):
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"intra_cube": {
|
||||||
|
# 각 cube 내 ring neighbors: (cube, pe) → {"E": (cube, pe_e), "W": (cube, pe_w)}
|
||||||
|
...
|
||||||
|
},
|
||||||
|
"inter_cube": {
|
||||||
|
# cube-leader 간 ring: (cube, 0) → {"N": (cube_n, 0), "S": (cube_s, 0)}
|
||||||
|
...
|
||||||
|
},
|
||||||
|
"inter_sip": {
|
||||||
|
# SIP-leader 간: rank → {"parent": peer_rank} (또는 ring 방식)
|
||||||
|
...
|
||||||
|
},
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
이 구조는 `_build_pe_installs`가 해석하여 각 PE의 neighbor table 엔트리
|
||||||
|
(4-direction)에 대응시킨다.
|
||||||
|
|
||||||
|
**Rank-level `topologies.py` 현 API와의 관계**: 기존 단순 패턴은
|
||||||
|
`(rank → {dir: peer_rank})` 단일 레벨. Hierarchical은 multi-level이므로
|
||||||
|
기존 API와 schema가 다름. `_resolve_topology`는 **알고리즘이 어떤 schema를
|
||||||
|
쓰는지 선언**하고, builder가 그에 맞춰 해석하도록 확장 필요 (open question).
|
||||||
|
|
||||||
|
### D4. PE-level neighbor graph — `_build_pe_installs` 확장
|
||||||
|
|
||||||
|
기존 (ring/mesh/tree): topology_fn이 반환한 `(rank → {dir: peer_rank})`를
|
||||||
|
각 참여 PE에 그대로 매핑 (leader_only일 경우 peer PE도 leader).
|
||||||
|
|
||||||
|
신규 (hierarchical): `hierarchical_3level`의 3단 구조를 per-PE neighbor
|
||||||
|
table로 펼침:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _build_pe_installs_hierarchical(rank, world_size, sip, pes, topo, spec):
|
||||||
|
"""Hierarchical 전용 PE neighbor table 빌더."""
|
||||||
|
result = []
|
||||||
|
for (cube, pe) in pes:
|
||||||
|
entries = []
|
||||||
|
# Level 1: intra-cube ring (E/W)
|
||||||
|
for d, peer in topo["intra_cube"][(cube, pe)].items():
|
||||||
|
entries.append(NeighborTableEntry(direction=d, ...))
|
||||||
|
# Level 2: inter-cube ring (N/S) — cube leader (pe == 0)만
|
||||||
|
if pe == 0:
|
||||||
|
for d, peer in topo["inter_cube"][(cube, 0)].items():
|
||||||
|
entries.append(NeighborTableEntry(direction=d, ...))
|
||||||
|
# Level 3: inter-SIP — SIP leader (cube == 0 and pe == 0)만
|
||||||
|
if cube == 0 and pe == 0:
|
||||||
|
for d, peer_rank in topo["inter_sip"][rank].items():
|
||||||
|
# peer_rank → peer SIP의 (0, 0)
|
||||||
|
entries.append(NeighborTableEntry(
|
||||||
|
direction=d, peer_sip=peer_rank, peer_cube=0, peer_pe=0, ...))
|
||||||
|
result.append(PeInstallSpec(cube=cube, pe=pe, neighbors=tuple(entries)))
|
||||||
|
return tuple(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
`build_install_plans`에서 algorithm_config의 `topology`에 따라 적절한 builder
|
||||||
|
선택 (기존 simple builder vs hierarchical builder).
|
||||||
|
|
||||||
|
### D5. Kernel 재해석 — SIP-local 좌표로
|
||||||
|
|
||||||
|
`src/kernbench/ccl/algorithms/hierarchical_allreduce.py`를 ADR-0024 D3에
|
||||||
|
맞춰 수정:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def kernel_args(*, n_elem: int, world_size: int, pes_per_cube: int,
|
||||||
|
cubes_per_sip: int, num_sips: int, **kw) -> tuple:
|
||||||
|
"""world_size (= num_sips), pes_per_cube, cubes_per_sip를 스칼라로."""
|
||||||
|
return (n_elem, pes_per_cube, cubes_per_sip, num_sips)
|
||||||
|
|
||||||
|
def kernel(t_ptr, n_elem, pes_per_cube, cubes_per_sip, num_sips, tl):
|
||||||
|
"""SIP-local 좌표 기반.
|
||||||
|
|
||||||
|
이전 (rank=PE 모델):
|
||||||
|
rank = cube_id * pes_per_cube + local_pe
|
||||||
|
pe_addr = t_ptr + rank * nbytes
|
||||||
|
현재 (rank=SIP 모델):
|
||||||
|
per-PE tensor slice는 backend가 TensorArg로 전달 → t_ptr은 이미 local.
|
||||||
|
intra-cube ring은 tl.program_id(0) 사용.
|
||||||
|
inter-cube ring은 pe_id == 0 조건으로 제한.
|
||||||
|
inter-SIP reduce는 cube_id == 0 and pe_id == 0 조건으로 제한.
|
||||||
|
"""
|
||||||
|
local_pe = tl.program_id(axis=0)
|
||||||
|
cube_id = tl.program_id(axis=1)
|
||||||
|
|
||||||
|
# Level 1: intra-cube ring
|
||||||
|
for _ in range(intra_rounds(pes_per_cube)):
|
||||||
|
tl.send(dir="E", src=acc)
|
||||||
|
recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Level 2: inter-cube (cube leader only)
|
||||||
|
if local_pe == 0:
|
||||||
|
for _ in range(inter_cube_rounds(cubes_per_sip)):
|
||||||
|
tl.send(dir="N", src=acc)
|
||||||
|
recv = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Level 3: inter-SIP (SIP leader only)
|
||||||
|
if local_pe == 0 and cube_id == 0:
|
||||||
|
for _ in range(inter_sip_rounds(num_sips)):
|
||||||
|
tl.send(dir="parent", src=acc)
|
||||||
|
recv = tl.recv(dir="parent", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Broadcast (reverse chain)
|
||||||
|
# ...
|
||||||
|
tl.store(t_ptr, acc)
|
||||||
|
```
|
||||||
|
|
||||||
|
`kernel_args`는 ADR-0024 D4의 keyword-only signature 계약을 따른다.
|
||||||
|
|
||||||
|
### D6. Validator — `multi_pe_sip_local`
|
||||||
|
|
||||||
|
ADR-0024 D8의 built-in 그대로 활용. `ccl.yaml`에서 `validator:
|
||||||
|
multi_pe_sip_local` 지정 시 backend가 각 SIP에 `cubes × pes_per_cube`개
|
||||||
|
shard가 있는지 검증.
|
||||||
|
|
||||||
|
### D7. Bench — 기본 all-reduce bench 확장
|
||||||
|
|
||||||
|
`benches/ccl_allreduce.py`의 worker는 `ccl.yaml`이 `hierarchical_allreduce`를
|
||||||
|
선택하면 자동으로:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Worker 예
|
||||||
|
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||||
|
tensor = torch.zeros((1, intra_sip_pes * n_elem), dp=dp, name="in")
|
||||||
|
# tensor는 각 SIP의 모든 PE에 1 tile씩 분산 (multi_pe_sip_local validator 통과)
|
||||||
|
dist.all_reduce(tensor, op="sum")
|
||||||
|
```
|
||||||
|
|
||||||
|
Worker 코드 자체는 알고리즘 종류를 모름 (`ccl.yaml` 선택에 의존). 단,
|
||||||
|
**DPPolicy가 hierarchical 요구와 일치해야** 함 — `cube/pe="column_wise"`
|
||||||
|
같은 SIP-내 분산을 하는 DPPolicy여야 `multi_pe_sip_local` 검증 통과. 이
|
||||||
|
DPPolicy 선택은 bench 설정 또는 sample bench에서 결정.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0024**: Launcher, `all_pes` mapper, `multi_pe_sip_local` validator,
|
||||||
|
registry + import path. 본 ADR 구현의 전제.
|
||||||
|
- **ADR-0025**: IPCQ direction addressing — cube/pe/SIP 간 다중 direction을
|
||||||
|
동시 사용하므로 정확한 direction 매칭 필수.
|
||||||
|
- **ADR-0023**: IPCQ protocol (neighbor table, send/recv, credit return).
|
||||||
|
- **기존 `hierarchical_allreduce.py`**: 본 ADR은 그 커널의 재해석 + 주변
|
||||||
|
framework integration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **ADR-0024 framework 변경**: 재활용만.
|
||||||
|
- **Alternative reduce topology (tree-in-tree 등)**: 3-level ring이 첫 구현.
|
||||||
|
- **Dynamic level count**: 현재 SIP/cube/PE 3단 고정. 2단 (SIP + PE, cube
|
||||||
|
skip) 또는 4단 이상은 future.
|
||||||
|
- **Bandwidth-optimal schedule tuning**: reduce round 수 / chunk size 조정
|
||||||
|
같은 tuning은 별도.
|
||||||
|
- **Pipelined hierarchical**: 여러 chunk를 파이프라인으로 겹쳐서 돌리는
|
||||||
|
NCCL-style 최적화는 future.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🟠 중간 영향 — 구현 시 결정 필요
|
||||||
|
|
||||||
|
- **`topologies.py` 스키마 확장**: 기존 `ring_1d` 등은 단일 레벨 `(rank →
|
||||||
|
{dir: peer})`. `hierarchical_3level`은 multi-level. `_resolve_topology`가
|
||||||
|
둘을 모두 반환할 수 있도록 schema를 일반화할지, 아니면 hierarchical 전용
|
||||||
|
return type을 두고 builder가 분기할지.
|
||||||
|
- Option A: 모든 topology를 neighbor-list 형태로 단일화
|
||||||
|
(`[{direction, peer_sip, peer_cube, peer_pe}, ...]`)
|
||||||
|
- Option B: topology 모듈이 `kind` 필드 제공, builder가 분기
|
||||||
|
- 권장: Option A (single source of truth, ADR-0024 Open Q의
|
||||||
|
"PE-level topology 일원화" 방향과 일치)
|
||||||
|
|
||||||
|
- **`hierarchical_3level` vs algorithm별 topology 모듈**: 향후 mesh-based
|
||||||
|
hierarchical 등 variant가 생기면? `hierarchical_3level` 같은 이름이 이미
|
||||||
|
topology-specific. 변형은 새 key 추가 (`hierarchical_mesh_3level` 등) 또는
|
||||||
|
알고리즘 모듈에서 topology 생성 override.
|
||||||
|
|
||||||
|
### 🟡 Nice-to-have
|
||||||
|
|
||||||
|
- **Reduce round 수 최적화**: Bidirectional ring은 `ceil((N-1)/2)` round.
|
||||||
|
Non-power-of-2 group size에서 idle PE 발생 가능.
|
||||||
|
- **Non-uniform topology 대응**: cube_mesh가 w != h일 때 inter-cube ring
|
||||||
|
balance.
|
||||||
|
- **Single SIP 케이스**: world_size = 1 (SIP 1개)일 때 Level 3 skip. Degenerate
|
||||||
|
case 검증.
|
||||||
|
|
||||||
|
### 🟢 Framework evolution 시사점 (ADR-0024로부터 이관)
|
||||||
|
|
||||||
|
- **PE-level topology 일원화 (중장기)**: 현 설계는
|
||||||
|
- topology (rank graph 또는 level-separated)
|
||||||
|
- mapper (per-SIP PE set)
|
||||||
|
- `_build_pe_installs` (actual edges)
|
||||||
|
|
||||||
|
의 3단 분산. Hierarchical이 이 분산을 가장 스트레스 받는 케이스. 중장기로는
|
||||||
|
`topologies.py`가 PE-level neighbor list를 직접 반환하고 mapper는 단순히
|
||||||
|
"어느 PE가 참여하느냐"만 결정, `_build_pe_installs`는 flat
|
||||||
|
mapping으로 단순화되는 방향이 자연스러움. **본 ADR에서 Option A를 채택**하면
|
||||||
|
이 방향으로 이미 정합.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. Topology generator
|
||||||
|
|
||||||
|
`tests/test_hierarchical_topology.py` (new):
|
||||||
|
- `hierarchical_3level(rank, world_size, spec)` → 각 level의 neighbor set이
|
||||||
|
예상 구조인지 (intra-cube는 ring, inter-cube는 cube-leader만 참여, inter-SIP은
|
||||||
|
SIP-leader만 참여)
|
||||||
|
- 2 SIP × 4 cubes × 4 PEs 같은 작은 토폴로지로 수작업 검증 가능
|
||||||
|
- Symmetry: PE `(cube, pe)`의 E neighbor가 peer에서 W로 역포인팅
|
||||||
|
|
||||||
|
### T2. Install plan — hierarchical × all_pes
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (확장):
|
||||||
|
- `build_install_plans(algorithm="hierarchical_allreduce", mapper="all_pes",
|
||||||
|
validator="multi_pe_sip_local")` 호출 시
|
||||||
|
- 각 SIP의 모든 PE가 `participating_pes`에 포함
|
||||||
|
- PE 0 (cube leader)만 inter-cube neighbor를 가짐
|
||||||
|
- (cube 0, pe 0) (SIP leader)만 inter-SIP neighbor를 가짐
|
||||||
|
- Non-leader PE는 intra-cube neighbor만
|
||||||
|
|
||||||
|
### T3. Kernel unit — mock runtime
|
||||||
|
|
||||||
|
`tests/test_hierarchical_mock_runtime.py` (new):
|
||||||
|
- `run_kernel_in_mock` (kernbench.ccl.testing)을 확장해 multi-level 지원
|
||||||
|
- 2 SIP × 2 cubes × 4 PEs (총 16 PE) 토폴로지에서 초기 tile을 global PE index+1 (1..16)로 채우고
|
||||||
|
hierarchical all-reduce 실행
|
||||||
|
- 모든 PE의 최종 결과가 `sum(1..16)`인지
|
||||||
|
|
||||||
|
### T4. E2E — 실제 SimPy backend
|
||||||
|
|
||||||
|
`tests/test_ccl_allreduce_matrix.py` (확장):
|
||||||
|
- `hierarchical @ ws=SIP_count`: multi_pe_sip_local layout + 3-level 알고리즘
|
||||||
|
전체 stack 통과 검증
|
||||||
|
|
||||||
|
### T5. Validator enforcement
|
||||||
|
|
||||||
|
- `multi_pe_sip_local` validator가 wrong layout (예: leader_only 스타일 1
|
||||||
|
shard per rank) 입력에 raise
|
||||||
|
|
||||||
|
### T6. 회귀
|
||||||
|
|
||||||
|
기존 ring/mesh/tree 알고리즘 모두 그대로 통과. 본 ADR은 그들을 건드리지 않음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **Intra-SIP PE 활용도 증가**: Inter-SIP 통신 중에도 intra-cube / inter-cube
|
||||||
|
reduce가 진행되어 전체 PE 가동률 향상.
|
||||||
|
- **Multi-level bandwidth 활용**: cube NoC, UCIe 모두 작동 → 더 정확한 HW 모델.
|
||||||
|
- **ADR-0024 framework 검증**: `all_pes` mapper + `multi_pe_sip_local`
|
||||||
|
validator의 첫 non-trivial use case. Framework 설계 타당성 확인.
|
||||||
|
- **기존 커널 재활용**: `hierarchical_allreduce.py` 큰 구조 유지, SIP-local
|
||||||
|
좌표만 재해석.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- **`topologies.py` schema 확장 필요**: Single-level vs multi-level 표현.
|
||||||
|
해결안(Option A)은 기존 ring/mesh/tree의 마이그레이션 비용 유발.
|
||||||
|
- **Validator / mapper 조합 요구**: 사용자가 DPPolicy를
|
||||||
|
`multi_pe_sip_local`에 맞춰 선택해야 함 (bench 설정 복잡도 증가).
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- 본 ADR 구현 전까지 `hierarchical_allreduce.py`는 deprecated 상태 유지 또는
|
||||||
|
ADR-0024 matrix test에서 제외. 현재 파일을 곧바로 삭제하지는 않음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/ccl/topologies.py` | D3: `hierarchical_3level` topology 함수 추가. (Option A 채택 시) 기존 topology 출력 format 통일 |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` | D4: hierarchical builder 분기 (또는 단일 builder가 level 개수로 dispatch) |
|
||||||
|
| `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` | D5: SIP-local 좌표로 kernel 재작성, `kernel_args` keyword-only signature |
|
||||||
|
| `ccl.yaml` | D2: `hierarchical_allreduce` 엔트리 추가 (`mapper: all_pes`, `validator: multi_pe_sip_local`, `topology: hierarchical_3level`) |
|
||||||
|
| `tests/test_hierarchical_topology.py` (new) | T1 |
|
||||||
|
| `tests/test_ccl_install_plan.py` | T2 확장 |
|
||||||
|
| `tests/test_hierarchical_mock_runtime.py` (new) | T3 |
|
||||||
|
| `tests/test_ccl_allreduce_matrix.py` | T4: hierarchical row 추가 |
|
||||||
@@ -0,0 +1,347 @@
|
|||||||
|
# ADR-0030: IPCQ Physical Addressing — PhysAddr integration
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed (Blocked on ADR-0031 — PhysAddr PE-resource extension)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
IPCQ ring buffer의 주소 체계를 ADR-0023의 **synthetic parallel namespace**
|
||||||
|
(`_IPCQ_BASE = 1<<60`)에서 **ADR-0001의 PhysAddr**로 이관한다. Routing /
|
||||||
|
allocator / MemoryStore의 정합성을 회복하고, buffer_kind (tcm/hbm/sram)별
|
||||||
|
physical backing을 구조적 좌표로 표현한다.
|
||||||
|
|
||||||
|
### 현재 상태 (ADR-0023 D2.5)
|
||||||
|
|
||||||
|
`src/kernbench/ccl/install.py:52-56`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_IPCQ_BASE = 1 << 60
|
||||||
|
def _ipcq_base_for_pe(sip, cube, pe):
|
||||||
|
return _IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
|
||||||
|
|
||||||
|
def rx_base(s, c, p, d):
|
||||||
|
return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
|
||||||
|
```
|
||||||
|
|
||||||
|
- **bit 60** 사용 → ADR-0001의 51-bit PhysAddr 공간 밖 (`MAX_51 = (1 << 51) - 1`)
|
||||||
|
- `PhysAddr.decode(addr)` → `PhysAddrError("addr must be a 51-bit value")`
|
||||||
|
- `IpcqEndpoint.rx_base_pa: int` — 타입이 raw int, 구조 없음
|
||||||
|
- `buffer_kind` (tcm/hbm/sram)와 synthetic 주소 사이에 coupling 없음
|
||||||
|
- Allocator (`PEMemAllocator`) 우회 — synthetic unique id per (sip, cube, pe,
|
||||||
|
direction). 진짜 physical allocation이 아님
|
||||||
|
|
||||||
|
ADR-0023 D2.5 원문:
|
||||||
|
|
||||||
|
> This bypasses the topology's address resolver / PhysAddr encoding and
|
||||||
|
> treats IPCQ buffers as a separate, parallel address namespace. Real PA
|
||||||
|
> encoding can be plugged in later without changing the rest of the design.
|
||||||
|
|
||||||
|
"later"가 이 ADR.
|
||||||
|
|
||||||
|
### 왜 지금 다루는가
|
||||||
|
|
||||||
|
- ADR-0025 (direction addressing)은 주소-기반 매칭으로 전환. 주소가 correctness에
|
||||||
|
직접 기여 → 주소 체계가 설계 관점에서 더 중요해짐
|
||||||
|
- ADR-0001의 "Routing consumes decoded domains, not raw bit-fields" 계약 위반
|
||||||
|
지속 → 기술 부채
|
||||||
|
- Routing fabric (cube_noc / UCIe)은 PhysAddr.decode()로 destination을 정함.
|
||||||
|
IPCQ의 synthetic 주소가 fabric routing에서 실제로 어떻게 처리되는지 **검증되지
|
||||||
|
않음** (별도 경로로 배달되는 것으로 추정)
|
||||||
|
- TCM / HBM / SRAM의 실제 memory layout과 IPCQ ring buffer 위치가 **disjoint**
|
||||||
|
→ allocator가 IPCQ 영역을 모르므로 실수로 겹칠 가능성 (현재는 bit 60으로 완전
|
||||||
|
분리되어 문제 없지만 설계 원칙상 건강하지 않음)
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **IPCQ ring buffer의 PhysAddr 표현**: buffer_kind별로 어떤 PhysAddr factory를
|
||||||
|
쓸지.
|
||||||
|
2. **PhysAddr 공간 부족 가능성**: 51-bit 공간에 IPCQ 버퍼를 담을 여유가 있는지.
|
||||||
|
3. **Allocator 통합**: `PEMemAllocator`에 IPCQ buffer 영역 예약 기능 추가, 또는
|
||||||
|
기존 pool에서 정상 allocation.
|
||||||
|
4. **MemoryStore space naming 정리**: 현재는 `{"tcm", "hbm", "sram"}` 문자열로
|
||||||
|
space 구분. IPCQ buffer도 이 space에 속하면 일반 data와 주소 겹침 방지 필요.
|
||||||
|
5. **Routing fabric 통합**: PhysAddr 기반 routing이 IPCQ 토큰을 올바른 SIP의
|
||||||
|
올바른 메모리로 배달.
|
||||||
|
6. **ADR-0025와의 정합**: 주소-기반 매칭이 PhysAddr에서도 동일하게 작동.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. IPCQ ring buffer = PhysAddr factory 사용
|
||||||
|
|
||||||
|
각 `buffer_kind`가 해당하는 PhysAddr factory를 호출:
|
||||||
|
|
||||||
|
| buffer_kind | PhysAddr factory | 필요한 인자 |
|
||||||
|
|---|---|---|
|
||||||
|
| `tcm` | `PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)` | PE-local TCM |
|
||||||
|
| `hbm` | `PhysAddr.pe_hbm_addr(rack_id, sip_id, cube_id, pe_id, pe_local_hbm_offset, slice_size_bytes)` | PE-local HBM slice |
|
||||||
|
| `sram` | `PhysAddr.cube_sram_addr(rack_id, sip_id, cube_id, sram_offset)` | Cube-shared SRAM |
|
||||||
|
|
||||||
|
Install plan builder (`build_install_plans` in ADR-0024)가 각 PE의 rx_base를
|
||||||
|
계산할 때:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ADR-0030 후 install_plan.py (pseudocode)
|
||||||
|
def _compute_rx_base(sip, cube, pe, direction_idx, buffer_kind, n_slots, slot_size,
|
||||||
|
allocator_pool, rack_id=0) -> PhysAddr:
|
||||||
|
bytes_per_direction = n_slots * slot_size
|
||||||
|
offset = direction_idx * bytes_per_direction
|
||||||
|
|
||||||
|
if buffer_kind == "tcm":
|
||||||
|
# TCM base (per-PE) + direction offset
|
||||||
|
tcm_base = allocator_pool.reserve_pe_tcm_for_ipcq(sip, cube, pe,
|
||||||
|
total_bytes=N_DIR * bytes_per_direction)
|
||||||
|
return PhysAddr.pe_tcm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
|
||||||
|
pe_id=pe, tcm_offset=tcm_base + offset)
|
||||||
|
elif buffer_kind == "hbm":
|
||||||
|
hbm_base = allocator_pool.reserve_pe_hbm_for_ipcq(sip, cube, pe,
|
||||||
|
total_bytes=...)
|
||||||
|
return PhysAddr.pe_hbm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
|
||||||
|
pe_id=pe, pe_local_hbm_offset=hbm_base + offset,
|
||||||
|
slice_size_bytes=slice_size)
|
||||||
|
elif buffer_kind == "sram":
|
||||||
|
sram_base = allocator_pool.reserve_cube_sram_for_ipcq(sip, cube,
|
||||||
|
total_bytes=...)
|
||||||
|
return PhysAddr.cube_sram_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
|
||||||
|
sram_offset=sram_base + offset)
|
||||||
|
```
|
||||||
|
|
||||||
|
`IpcqEndpoint.rx_base_pa`의 타입을 `PhysAddr` (또는 encoded `int`)로 변경:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class IpcqEndpoint:
|
||||||
|
sip: int
|
||||||
|
cube: int
|
||||||
|
pe: int
|
||||||
|
buffer_kind: str
|
||||||
|
rx_base_pa: int # PhysAddr.encode() 결과 (51-bit)
|
||||||
|
rx_base_va: int
|
||||||
|
n_slots: int
|
||||||
|
slot_size: int
|
||||||
|
```
|
||||||
|
|
||||||
|
타입은 int 유지 (encoded form), 단 **반드시 PhysAddr.decode()로 복원 가능**한
|
||||||
|
값임을 invariant로 둔다. 디코더 호출자는 `PhysAddr.decode(rx_base_pa)`로
|
||||||
|
구조적 좌표 획득.
|
||||||
|
|
||||||
|
### D2. Allocator 확장 — IPCQ 예약 API
|
||||||
|
|
||||||
|
`PEMemAllocator`에 IPCQ 전용 예약 기능 추가:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PEMemAllocator:
|
||||||
|
def reserve_ipcq_tcm(self, total_bytes: int) -> int:
|
||||||
|
"""Reserve TCM region for IPCQ ring buffers at this PE.
|
||||||
|
Returns tcm_offset (to be used in PhysAddr.pe_tcm_addr)."""
|
||||||
|
# TCM에서 `total_bytes` 연속 영역 예약.
|
||||||
|
# Tensor allocation과 겹치지 않도록.
|
||||||
|
|
||||||
|
def reserve_ipcq_hbm(self, total_bytes: int) -> int: ...
|
||||||
|
# cube-level allocator도 유사
|
||||||
|
```
|
||||||
|
|
||||||
|
Install plan 빌더가 각 PE allocator에서 예약. 예약 결과(offset)를 PhysAddr
|
||||||
|
factory에 전달.
|
||||||
|
|
||||||
|
**기존 `_ipcq_base_for_pe` / `_IPCQ_BASE` 제거**.
|
||||||
|
|
||||||
|
### D3. MemoryStore space 통합
|
||||||
|
|
||||||
|
현재 `MemoryStore`는 `{space_name: {addr: ndarray}}` 구조. IPCQ buffer는 일반
|
||||||
|
tensor 데이터와 같은 space (tcm/hbm/sram)를 공유하게 됨. 주소 유일성은 ADR-0001의
|
||||||
|
PhysAddr 계층 보장.
|
||||||
|
|
||||||
|
Backward compatibility: 기존 IPCQ address (synthetic)을 쓰는 code path는
|
||||||
|
**제거**하고, 모두 PhysAddr.encode() 결과만 사용. 이 자체는 API 변경이 아니라
|
||||||
|
값 변경.
|
||||||
|
|
||||||
|
### D4. Routing fabric 통합
|
||||||
|
|
||||||
|
IPCQ DMA write (`IpcqDmaToken`의 `src_addr → dst_addr`)가 PhysAddr encoding을
|
||||||
|
사용하므로 **routing fabric이 `PhysAddr.decode(dst_addr)`로 destination
|
||||||
|
SIP/cube/PE를 정확히 찾을 수 있음**. Fabric routing 로직 변경 없음 (기존에도
|
||||||
|
PhysAddr.decode를 쓰는 것으로 추정).
|
||||||
|
|
||||||
|
**검증 필요**: 현재 fabric이 bit 60 synthetic 주소를 어떻게 라우팅하는지 확인.
|
||||||
|
별도 경로가 있다면 제거, PhysAddr 경로로 통합.
|
||||||
|
|
||||||
|
### D5. ADR-0025와의 정합
|
||||||
|
|
||||||
|
ADR-0025의 주소-기반 매칭 (dst_addr로 direction 식별)은 PhysAddr.encode()
|
||||||
|
결과를 비교하는 것으로 자연스럽게 호환. 변경 없음.
|
||||||
|
|
||||||
|
다만 debug / diagnostic 향상 가능:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# pointer_dump 등에서
|
||||||
|
print(f"E: rx_base_pa={PhysAddr.decode(qp.peer.rx_base_pa)}")
|
||||||
|
# 출력 예: PhysAddr(sip=1, cube=0, pe=0, kind="pe_resource", unit_type=PE, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
이전 synthetic 주소는 decode 불가 → diagnostic 질 저하. PhysAddr 전환으로 개선.
|
||||||
|
|
||||||
|
### D6. ADR-0023 D2.5 amendment
|
||||||
|
|
||||||
|
ADR-0023의 "bypasses PhysAddr encoding" 문구를 **Accepted fallback → now
|
||||||
|
replaced by ADR-0030**으로 수정. 본 ADR이 적용되면 ADR-0023 D2.5의 "Real PA
|
||||||
|
encoding can be plugged in later" 약속이 이행된 것.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Migration strategy
|
||||||
|
|
||||||
|
단계적 전환 (한 PR로 하지 않는다):
|
||||||
|
|
||||||
|
### Phase 1: PhysAddr 공간 재검토
|
||||||
|
- 51-bit PhysAddr 공간에 IPCQ ring buffer가 실제로 들어갈 수 있는지 확인.
|
||||||
|
- 각 buffer_kind (tcm/hbm/sram)별 factory가 제공하는 `local_offset` 범위가
|
||||||
|
IPCQ 요구 (4 direction × n_slots × slot_size)를 수용 가능한지.
|
||||||
|
- 부족하면 PhysAddr layout 자체 확장 (ADR-0001 amendment 별도 필요).
|
||||||
|
|
||||||
|
### Phase 2: Allocator API 확장
|
||||||
|
- `PEMemAllocator.reserve_ipcq_*` 메소드 추가.
|
||||||
|
- 기존 tensor allocation과 영역 충돌 방지.
|
||||||
|
|
||||||
|
### Phase 3: Install plan builder 전환
|
||||||
|
- `_ipcq_base_for_pe` 제거, PhysAddr factory 호출로 대체.
|
||||||
|
- `IpcqEndpoint.rx_base_pa`가 PhysAddr.encode() 결과 (51-bit).
|
||||||
|
|
||||||
|
### Phase 4: Routing fabric 검증
|
||||||
|
- IPCQ DMA token이 fabric 정상 경로로 배달되는지 확인.
|
||||||
|
- 별도 fast-path가 있다면 제거, 통합.
|
||||||
|
|
||||||
|
### Phase 5: MemoryStore space 검증
|
||||||
|
- IPCQ buffer 주소가 기존 tensor 주소와 겹치지 않는지.
|
||||||
|
- Allocator 레벨에서 이미 예약했으므로 정상적으로 분리되어야 함.
|
||||||
|
|
||||||
|
### Phase 6: ADR-0023 D2.5 업데이트 + 기존 sideband path 제거 (전환 마무리)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0031** (PhysAddr PE-resource extension) — **Blocker**: PhysAddr가 PE
|
||||||
|
resource (특히 IPCQ ring buffer)를 충분히 표현할 수 있도록 schema 확장이
|
||||||
|
선행되어야 함. 본 ADR은 ADR-0031 완료 후에만 실행 가능.
|
||||||
|
- **ADR-0001** (PhysAddr layout): 본 ADR의 기반. 51-bit 공간 / factory API의
|
||||||
|
ADR-0031 확장본을 사용.
|
||||||
|
- **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023 D2.5의 "later" 약속 이행.
|
||||||
|
D9 piggyback / credit return 프로토콜 자체는 불변.
|
||||||
|
- **ADR-0024** (launcher + install_plan.py): `build_install_plans`가 PhysAddr
|
||||||
|
factory를 호출하게 됨.
|
||||||
|
- **ADR-0025** (direction addressing): 주소-기반 매칭이 PhysAddr에서도 동일하게
|
||||||
|
작동. 변경 없음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **ADR-0001 PhysAddr layout 자체 변경**: 51-bit 공간과 segment 구조는 유지.
|
||||||
|
부족 시 별도 ADR.
|
||||||
|
- **IPCQ protocol semantic 변경**: ADR-0023 D9 piggyback 등 프로토콜 로직 유지.
|
||||||
|
- **Allocator 전반 재설계**: IPCQ 예약 API 추가만.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🔴 Critical — Migration 전 반드시 검증
|
||||||
|
|
||||||
|
- **PhysAddr 51-bit 공간에 IPCQ 버퍼가 실제로 들어가는가**: 각 PE의 TCM
|
||||||
|
영역에서 `4 direction × n_slots (default 4) × slot_size (default 4KB)` =
|
||||||
|
64KB가 PE TCM 공간에 수용 가능. TCM size (e.g., 16MB) 대비 충분. HBM도 여유
|
||||||
|
많음. SRAM은 cube 공유라 direction × PE 곱이 있음 — 별도 검증 필요.
|
||||||
|
- **Routing fabric의 현재 IPCQ 주소 처리**: 현재 synthetic 주소가 fabric에서
|
||||||
|
어떻게 routing되는지 trace 필요. `PhysAddr.decode()`로 판독 불가한 값이
|
||||||
|
fabric에서 정상 배달된다면 어떤 경로를 쓰는지 조사.
|
||||||
|
|
||||||
|
### 🟡 Nice-to-have
|
||||||
|
|
||||||
|
- **IPCQ 전용 kind / sub_offset 인코딩**: `UnitType.PE`의 sub_offset 공간을
|
||||||
|
IPCQ와 공유. 충돌 방지를 위해 IPCQ 전용 sub-space 정의할지 여부.
|
||||||
|
- **Debug tool**: `pointer_dump`를 PhysAddr 포매팅으로 개선.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. PhysAddr round-trip
|
||||||
|
|
||||||
|
`tests/test_ipcq_physaddr.py` (new):
|
||||||
|
- `PhysAddr.pe_tcm_addr(...)` → encode → decode → 동일 필드 복원
|
||||||
|
- TCM / HBM / SRAM 각 factory에 대해
|
||||||
|
|
||||||
|
### T2. Allocator 예약
|
||||||
|
|
||||||
|
`tests/test_ipcq_alloc.py` (new):
|
||||||
|
- `PEMemAllocator.reserve_ipcq_tcm` → 반환된 offset이 valid TCM 영역
|
||||||
|
- 중복 예약 → 에러 또는 non-overlapping offset
|
||||||
|
- Tensor allocation과 충돌 없음
|
||||||
|
|
||||||
|
### T3. Install plan PhysAddr integration
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (확장):
|
||||||
|
- `build_install_plans` 결과의 `rx_base_pa`가 PhysAddr.decode() 가능
|
||||||
|
- Decoded 좌표가 plan의 (sip, cube, pe)와 일치
|
||||||
|
- I3.1 invariant (ADR-0025 D6) — rx_base range disjointness가 PhysAddr에서도 성립
|
||||||
|
|
||||||
|
### T4. Routing — IPCQ DMA fabric traversal
|
||||||
|
|
||||||
|
`tests/test_ipcq_routing.py` (new):
|
||||||
|
- Cross-SIP IPCQ send → fabric이 `PhysAddr.decode(dst_addr)`로 destination SIP
|
||||||
|
정확히 판단 → 올바른 MemoryStore에 write
|
||||||
|
- UCIe 경로 / cube_noc 경로 모두 검증
|
||||||
|
|
||||||
|
### T5. 회귀
|
||||||
|
|
||||||
|
- 기존 IPCQ E2E 테스트 (ring, mesh, tree) 모두 통과
|
||||||
|
- ADR-0024, ADR-0025 통합 테스트 통과
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **ADR-0001 정합성 회복**: routing과 addressing이 단일 체계.
|
||||||
|
- **buffer_kind 명확**: TCM/HBM/SRAM이 구조적 좌표로 구분.
|
||||||
|
- **Debug 향상**: PhysAddr.decode()로 사람이 읽을 수 있는 좌표.
|
||||||
|
- **Allocator 통합**: IPCQ 영역이 정상 예약 → tensor와의 충돌 리스크 사전 차단.
|
||||||
|
- **Fabric routing 일원화**: 별도 경로 없이 기존 PhysAddr-based routing 재활용.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- **Migration 복잡도**: 6 Phase 단계적 전환 필요. 각 Phase마다 regression 리스크.
|
||||||
|
- **PhysAddr 공간 검증 부담**: Phase 1에서 TCM/HBM/SRAM 공간이 IPCQ 요구를
|
||||||
|
수용하는지 실측 필요.
|
||||||
|
- **Routing fabric 검증**: 현재 fabric이 synthetic 주소를 어떻게 처리하는지
|
||||||
|
조사 필요.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- IPCQ protocol semantic (ADR-0023 D9 등) 불변.
|
||||||
|
- ADR-0025의 direction addressing 로직 불변.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/ccl/install.py` | `_IPCQ_BASE`, `_ipcq_base_for_pe` 제거 |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` (ADR-0024) | D1: PhysAddr factory 호출로 rx_base 계산 |
|
||||||
|
| `src/kernbench/policy/address/allocator.py` (or similar) | D2: IPCQ 예약 API (`reserve_ipcq_tcm` 등) |
|
||||||
|
| `src/kernbench/common/ipcq_types.py` | D1: `IpcqEndpoint.rx_base_pa` 문서화 — PhysAddr.encode 결과 |
|
||||||
|
| `src/kernbench/sim_engine/memory_store.py` | D3: IPCQ buffer가 기존 space와 공유되는지 검증 |
|
||||||
|
| `src/kernbench/sim_engine/engine.py` | D4: IPCQ token routing이 PhysAddr-based fabric 경로 사용 |
|
||||||
|
| `src/kernbench/ccl/diagnostics.py` | D5: pointer_dump를 PhysAddr 포매팅으로 개선 |
|
||||||
|
| `docs/adr/ADR-0023-ipcq-pe-collective.md` | D6: D2.5 amendment note |
|
||||||
|
| `tests/test_ipcq_physaddr.py` (new) | T1 |
|
||||||
|
| `tests/test_ipcq_alloc.py` (new) | T2 |
|
||||||
|
| `tests/test_ccl_install_plan.py` | T3 확장 |
|
||||||
|
| `tests/test_ipcq_routing.py` (new) | T4 |
|
||||||
@@ -0,0 +1,257 @@
|
|||||||
|
# ADR-0031: PhysAddr PE-Resource Extension
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Stub (Blocker for ADR-0030 — specific range allocations TBD)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
ADR-0001의 `PhysAddr` schema를 **PE 내부의 다양한 resource**를 체계적으로
|
||||||
|
표현할 수 있도록 확장한다. ADR-0030 (IPCQ PhysAddr integration) 및 향후의
|
||||||
|
PE-local resource 추가 (scratchpad, register file, status register, 등)의
|
||||||
|
기반을 제공한다.
|
||||||
|
|
||||||
|
### 현재 상태 (ADR-0001)
|
||||||
|
|
||||||
|
51-bit PhysAddr layout:
|
||||||
|
|
||||||
|
```
|
||||||
|
[50:47] rack_id (4)
|
||||||
|
[46:43] sip_id (4)
|
||||||
|
[42:38] sip_seg (5) # cube_id
|
||||||
|
[37:0] local_offset (38)
|
||||||
|
```
|
||||||
|
|
||||||
|
`local_offset` (38 bits) 내부:
|
||||||
|
|
||||||
|
- `[37]` selector: 1 = HBM window (128GB), 0 = PE resource window
|
||||||
|
- PE resource window는 `unit_type` (3 bits: PE | MCPU | SRAM) +
|
||||||
|
`pe_id` (4 bits) + `ext` (1 bit) + `sub_offset` (29 bits)
|
||||||
|
|
||||||
|
Factory API:
|
||||||
|
- `PhysAddr.hbm_addr(...)` — HBM generic
|
||||||
|
- `PhysAddr.pe_hbm_addr(...)` — PE-local HBM slice
|
||||||
|
- `PhysAddr.pe_tcm_addr(...)` — PE TCM (via `UnitType.PE` + `sub_offset`)
|
||||||
|
- `PhysAddr.cube_sram_addr(...)` — Cube-shared SRAM
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **PE 내부 resource 구분의 명시적 체계 부재**: 현재 `local_offset` (38 bits)
|
||||||
|
이 평면 공간으로 취급되고, PE TCM / IPCQ ring / scratchpad / 향후 register
|
||||||
|
file 등이 관습적 offset 범위로만 구분됨. Schema 레벨에서 명확하지 않음.
|
||||||
|
2. **IPCQ 주소의 PhysAddr 표현 부재**: ADR-0030이 IPCQ ring buffer를 PhysAddr로
|
||||||
|
표현하려면 "이 주소가 IPCQ 영역"을 decode 가능해야 함. 현재는 불가.
|
||||||
|
3. **향후 PE resource 확장 경로**: register file, performance counter 등
|
||||||
|
추가 시 일관된 위치 할당 규칙 필요.
|
||||||
|
|
||||||
|
### 설계 방향 — local_offset을 PE 컴포넌트별 range로 분할
|
||||||
|
|
||||||
|
`local_offset` (38 bits = 256GB per cube segment)을 **PE 컴포넌트마다 고정
|
||||||
|
range**로 나누어 할당한다. 각 range는 해당 컴포넌트 전용 주소 공간이며,
|
||||||
|
`PhysAddr.decode()`가 주소가 어느 range에 속하는지 판별해 해당하는 `kind` /
|
||||||
|
`unit_type` / `sub_type` 필드를 채운다.
|
||||||
|
|
||||||
|
개념적 구조 (구체적 bit 할당은 **TBD**):
|
||||||
|
|
||||||
|
```
|
||||||
|
local_offset [37:0] (38 bits total)
|
||||||
|
├── HBM window [37] = 1 (기존 128GB)
|
||||||
|
├── PE component ranges [37] = 0
|
||||||
|
│ ├── TCM [range_1]
|
||||||
|
│ ├── IPCQ rings [range_2]
|
||||||
|
│ ├── Scratchpad [range_3]
|
||||||
|
│ ├── Register file [range_4]
|
||||||
|
│ ├── (reserved) ...
|
||||||
|
│ └── Sideband / status [range_N]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 왜 range-based partition인가
|
||||||
|
|
||||||
|
- **Schema-level 명시성**: 주소 하나 보고 어느 컴포넌트의 자원인지 decode 가능.
|
||||||
|
"Routing consumes decoded domains" (ADR-0001 D5) 계약 충족.
|
||||||
|
- **Unit type enum 확장보다 유연**: 3-bit `UnitType` 공간을 고갈시키지 않고
|
||||||
|
세분화 가능. 미래 추가 컴포넌트도 빈 range 할당.
|
||||||
|
- **Allocator 통합 자연**: 각 PE-level allocator가 관리하는 하위 pool을
|
||||||
|
address range와 1:1 매칭 (e.g., `reserve_ipcq_tcm()` → IPCQ range 안에서만
|
||||||
|
할당).
|
||||||
|
- **Decode routing 단순**: `PhysAddr.decode(addr)`가 range table을 참조해
|
||||||
|
`kind` + sub-field를 채움. 기존 HBM selector bit 패턴의 일반화.
|
||||||
|
|
||||||
|
### 왜 지금 다루는가
|
||||||
|
|
||||||
|
- ADR-0030 (IPCQ PhysAddr 통합)이 이 확장에 **의존**. ADR-0030 단독 진행 시
|
||||||
|
`sub_offset` 공간을 불투명하게 재사용하게 되어 ADR-0001 계약 미충족.
|
||||||
|
- PE 내부 자원이 더 추가될 가능성 — 지금 구조를 정리해두면 일관된 확장 경로 확보.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision (pending specific range allocation)
|
||||||
|
|
||||||
|
### D1. Range-based local_offset partition — approach
|
||||||
|
|
||||||
|
`local_offset`을 고정 byte range로 분할하고, 각 range를 PE 컴포넌트에 할당한다.
|
||||||
|
주소의 어느 range에 속하는가로 `kind` / component type을 결정.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/policy/address/phyaddr.py (conceptual, post-extension)
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PeResourceRange:
|
||||||
|
name: str # e.g. "tcm", "ipcq", "scratchpad", "regfile"
|
||||||
|
start_offset: int # local_offset 내 시작
|
||||||
|
end_offset: int # exclusive
|
||||||
|
byte_size: int # end - start
|
||||||
|
|
||||||
|
PE_RESOURCE_MAP: tuple[PeResourceRange, ...] = (
|
||||||
|
# TBD — 구체적 range 할당은 사용자가 별도 업데이트
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`PhysAddr.decode(addr)`의 PE resource 경로는:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def decode_pe_resource(local_offset: int) -> dict:
|
||||||
|
for r in PE_RESOURCE_MAP:
|
||||||
|
if r.start_offset <= local_offset < r.end_offset:
|
||||||
|
return {
|
||||||
|
"kind": "pe_resource",
|
||||||
|
"component": r.name, # NEW: "tcm"/"ipcq"/...
|
||||||
|
"component_offset": local_offset - r.start_offset, # within range
|
||||||
|
}
|
||||||
|
raise PhysAddrError(f"local_offset {local_offset} not in any PE range")
|
||||||
|
```
|
||||||
|
|
||||||
|
### D2. Specific range allocations — **TBD**
|
||||||
|
|
||||||
|
> 사용자가 구체적 byte 할당을 별도로 정의한 뒤 본 ADR에 업데이트.
|
||||||
|
>
|
||||||
|
> 필요 정보:
|
||||||
|
> - 각 컴포넌트 (TCM, IPCQ, scratchpad, regfile, ...)의 이름 / byte size
|
||||||
|
> - `local_offset` 내 시작 offset (align 고려)
|
||||||
|
> - 현재 하드웨어 사양 / 시뮬레이션 요구 반영
|
||||||
|
|
||||||
|
이 섹션이 채워진 뒤 ADR status: **Stub → Proposed → Accepted** 승격.
|
||||||
|
|
||||||
|
### D3. Factory API — per-component 함수
|
||||||
|
|
||||||
|
기존 `PhysAddr.pe_tcm_addr(...)` 패턴을 일반화:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 기존 (이미 존재)
|
||||||
|
PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)
|
||||||
|
|
||||||
|
# 신규 (ADR-0031 후 추가)
|
||||||
|
PhysAddr.pe_ipcq_addr(rack_id, sip_id, cube_id, pe_id, ipcq_offset)
|
||||||
|
PhysAddr.pe_scratchpad_addr(...)
|
||||||
|
PhysAddr.pe_regfile_addr(...)
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
각 factory는 해당 컴포넌트의 range 내에서 `component_offset`만 받아 최종
|
||||||
|
PhysAddr encoding. 호출자는 어느 range인지 몰라도 됨.
|
||||||
|
|
||||||
|
### D4. Backward compatibility
|
||||||
|
|
||||||
|
- 기존 `pe_tcm_addr()` signature / semantic 유지.
|
||||||
|
- 내부 인코딩만 신규 range table을 참조하도록 변경.
|
||||||
|
- 기존 `UnitType.PE` decoding 경로는 `PE_RESOURCE_MAP`에서 "tcm" range를
|
||||||
|
대응하도록 매핑 → 기존 코드 transparent.
|
||||||
|
- 기존 코드가 `PhysAddr.decode(addr).unit_type == UnitType.PE`를 체크하는
|
||||||
|
경우는 여전히 유효 (TCM 주소는 계속 PE unit_type).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🔴 Pending user input (ADR 승격 blocker)
|
||||||
|
|
||||||
|
- **D2의 specific range allocation**: 사용자가 구체적 byte 할당 테이블을
|
||||||
|
제공해야 Stub → Proposed 승격 가능. 필요 정보:
|
||||||
|
- 컴포넌트 목록 (TCM, IPCQ, scratchpad, regfile 등)
|
||||||
|
- 각 컴포넌트의 byte size / 시작 offset
|
||||||
|
- Alignment 요구사항 (4KB / page-aligned 등)
|
||||||
|
|
||||||
|
### 🟡 설계 세부 — range allocation 결정 과정에서 함께 결정
|
||||||
|
|
||||||
|
- **총 local_offset space 배분**: HBM window (bit 37 = 1, 128GB)을 유지할지,
|
||||||
|
아니면 PE resource space를 확장하기 위해 HBM window 축소할지.
|
||||||
|
- **Range padding / reserved space**: 미래 컴포넌트 추가를 위한 "reserved"
|
||||||
|
range 몇 개를 미리 확보할지.
|
||||||
|
- **Address alignment**: 각 range의 시작 offset이 특정 alignment (page /
|
||||||
|
cache line) 만족해야 하는지.
|
||||||
|
- **Diagnostic / debug 포맷**: `PhysAddr.decode()` 출력에서 component 이름 +
|
||||||
|
component_offset을 사람이 읽기 좋게 표시 (e.g., "IPCQ ring sip=0 cube=0 pe=3
|
||||||
|
offset=0x1234").
|
||||||
|
- **기존 `UnitType` enum의 role**: Range-based 접근 후에도 `unit_type` 필드
|
||||||
|
유지할지 (decode 결과에 `component` 추가), 또는 enum 대체할지.
|
||||||
|
|
||||||
|
### 🟢 ADR-0030 연동 질문
|
||||||
|
|
||||||
|
- **IPCQ range 내 direction/slot 표현**: PhysAddr는 `component_offset` 단위
|
||||||
|
까지만 표현. "direction=E, slot=2"는 IPCQ range 내 offset 계산으로 도출
|
||||||
|
(`direction_idx * slot_region_size + slot_idx * slot_size`) — 이 공식은
|
||||||
|
ADR-0030 scope에서 구체화.
|
||||||
|
- **Allocator pool 구조**: `PEMemAllocator`가 여러 range (TCM, IPCQ,
|
||||||
|
scratchpad)를 개별 pool로 관리할지, 단일 pool에서 kind별 reserved만 관리
|
||||||
|
할지. Range-based schema면 개별 pool이 자연스러움.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals (this ADR)
|
||||||
|
|
||||||
|
- **51-bit 전체 layout 재작성**: 본 ADR은 `local_offset` (38 bits) 내부의
|
||||||
|
subdivision만 다룬다. Rack / SIP / cube segment 같은 상위 bit 구조는
|
||||||
|
불변.
|
||||||
|
- **`UnitType` enum 재설계**: range-based 접근으로 대체 가능하지만, 기존 enum
|
||||||
|
(PE / MCPU / SRAM)은 backward compat 위해 유지.
|
||||||
|
- **Dynamic range allocation**: runtime에 range 크기 바꾸는 기능 불필요. 모든
|
||||||
|
range는 컴파일 / 설정 시점에 고정.
|
||||||
|
- **Multi-process / multi-rack partitioning**: PE 내부 resource만 다룸.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Action
|
||||||
|
|
||||||
|
### Phase 1 — User 입력: specific range allocation (**Blocker**)
|
||||||
|
- 사용자가 정의한 PE 컴포넌트별 byte range를 D2에 기입:
|
||||||
|
- `PE_RESOURCE_MAP` 테이블 내용 (name, start_offset, byte_size per 컴포넌트)
|
||||||
|
- 각 컴포넌트의 hardware spec 근거 note
|
||||||
|
|
||||||
|
### Phase 2 — ADR Stub → Proposed 승격
|
||||||
|
- D2 채워지면 status 변경.
|
||||||
|
- Open questions의 "🔴 Pending user input" 블록 제거.
|
||||||
|
- ADR-0001에 amendment note 초안 작성.
|
||||||
|
|
||||||
|
### Phase 3 — 구현
|
||||||
|
- `PhysAddr` range-based decode 구현.
|
||||||
|
- 신규 factory 함수 (`pe_ipcq_addr`, `pe_scratchpad_addr` 등 컴포넌트별)
|
||||||
|
추가.
|
||||||
|
- 기존 `pe_tcm_addr` 내부 인코딩만 신규 range table 참조하도록 수정
|
||||||
|
(signature 불변).
|
||||||
|
- 기존 코드 경로 회귀 확인.
|
||||||
|
|
||||||
|
### Phase 4 — ADR-0030 unblock
|
||||||
|
- ADR-0030 "Blocked" 상태 해제.
|
||||||
|
- Install_plan builder가 `pe_ipcq_addr(...)` 등 확장된 factory 호출하도록
|
||||||
|
수정.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0001** (PhysAddr layout): 본 ADR은 ADR-0001의 확장.
|
||||||
|
- **ADR-0023** (IPCQ protocol): IPCQ ring buffer의 주소 체계를 PhysAddr로
|
||||||
|
통합할 수 있게 하는 기반.
|
||||||
|
- **ADR-0030** (IPCQ PhysAddr integration): 본 ADR에 blocked.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files (future, after promotion to Proposed)
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/policy/address/phyaddr.py` | Range table (`PE_RESOURCE_MAP`), range-based decode, 신규 component-specific factory들 (`pe_ipcq_addr` 등), 기존 `pe_tcm_addr` 내부 인코딩 갱신 |
|
||||||
|
| `src/kernbench/policy/address/allocator.py` | Range-aware pool 분리 (TCM pool / IPCQ pool / scratchpad pool 등 per-PE) |
|
||||||
|
| `docs/adr/ADR-0001-physaddr-layout.md` | Amendment note: range-based PE resource partition |
|
||||||
|
| `tests/test_phyaddr.py` | Range table 검증, 각 factory의 encode/decode round-trip, 기존 `pe_tcm_addr` 회귀 |
|
||||||
@@ -0,0 +1,592 @@
|
|||||||
|
# CCL Algorithm Author Guide (English)
|
||||||
|
|
||||||
|
This document is a step-by-step guide for engineers writing CCL
|
||||||
|
(Collective Communication Library) algorithms in kernbench. The
|
||||||
|
internal system design and component structure live in
|
||||||
|
[ADR-0023](adr/ADR-0023-ipcq-pe-collective.md).
|
||||||
|
|
||||||
|
The goal here is to clearly separate **what an algorithm author has to
|
||||||
|
touch** from **what they can leave alone**, and to get a first
|
||||||
|
algorithm running through the shortest possible path.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Five-minute tour
|
||||||
|
|
||||||
|
| Things you touch | Location |
|
||||||
|
|------------------|----------|
|
||||||
|
| Algorithm module (kernel + optional `neighbors()`) | `src/kernbench/ccl/algorithms/<algo>.py` |
|
||||||
|
| Algorithm registration | `ccl.yaml` |
|
||||||
|
| Host bench (rank count, init, launch, verify) | `benches/<your_bench>.py` |
|
||||||
|
| (Optional) unit test | `tests/test_<algo>.py` |
|
||||||
|
|
||||||
|
| Things you do NOT touch | Location |
|
||||||
|
|--------------------------|----------|
|
||||||
|
| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
|
||||||
|
| Framework (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
|
||||||
|
| PE_IPCQ / PE_DMA components | `src/kernbench/components/builtin/` |
|
||||||
|
| Backend implementation (`install_ipcq`) | `src/kernbench/runtime_api/distributed.py` and `src/kernbench/ccl/install.py` |
|
||||||
|
|
||||||
|
Workflow:
|
||||||
|
1. Write a `kernel` function in the algorithm module.
|
||||||
|
2. Register an entry in `ccl.yaml`.
|
||||||
|
3. Write a host bench using `torch.distributed.init_process_group` /
|
||||||
|
`torch.distributed.all_reduce` (the unified `benches/ccl_allreduce.py`
|
||||||
|
handles the common case).
|
||||||
|
4. (Optional) Run the mock runtime for fast unit tests (a few ms).
|
||||||
|
5. `kernbench run --bench <name> --verify-data` for full SimPy verification.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Hello World — the simplest send/recv
|
||||||
|
|
||||||
|
Each PE sends its tile to its E neighbor once and receives a tile from
|
||||||
|
its W neighbor once. The reference code lives in
|
||||||
|
[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py).
|
||||||
|
|
||||||
|
### Step 1: write the kernel
|
||||||
|
|
||||||
|
New file `src/kernbench/ccl/algorithms/hello_send.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Hello world: send your tile to the next rank, receive from the previous one."""
|
||||||
|
|
||||||
|
|
||||||
|
def kernel(t_ptr, n_elem, tl):
|
||||||
|
# Global rank is computed from program_id(0/1) (ADR-0022).
|
||||||
|
local_pe = tl.program_id(axis=0)
|
||||||
|
cube_id = tl.program_id(axis=1)
|
||||||
|
pes_per_cube = tl.num_programs(axis=0)
|
||||||
|
rank = cube_id * pes_per_cube + local_pe
|
||||||
|
|
||||||
|
nbytes = n_elem * 2 # f16
|
||||||
|
pe_addr = t_ptr + rank * nbytes
|
||||||
|
|
||||||
|
# Load our slice and send it east.
|
||||||
|
src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
|
||||||
|
tl.send(dir="E", src=src)
|
||||||
|
|
||||||
|
# Receive from west and store directly back into our slice.
|
||||||
|
recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||||
|
tl.store(pe_addr, recv)
|
||||||
|
|
||||||
|
|
||||||
|
def kernel_args(world_size: int, n_elem: int) -> tuple:
|
||||||
|
"""Positional kernel args used by the ahbm backend (after t_ptr)."""
|
||||||
|
return (n_elem,)
|
||||||
|
```
|
||||||
|
|
||||||
|
Key points:
|
||||||
|
|
||||||
|
- **Global rank is derived from `program_id(axis=0)` and `program_id(axis=1)`** (`rank = cube_id * pes_per_cube + local_pe`).
|
||||||
|
TL has no contractually-supported `tl.rank` / `tl.world_size`. If the
|
||||||
|
host needs to pass `world_size` or anything else as an algorithm
|
||||||
|
parameter, it goes through ordinary `torch.launch` arguments.
|
||||||
|
- **`tl.send` takes a `TensorHandle`.** PE_IPCQ reads
|
||||||
|
`addr`/`space`/`shape`/`dtype`/`nbytes` from the handle to issue an
|
||||||
|
`IpcqDmaToken` to PE_DMA.
|
||||||
|
- **`tl.recv` requires `shape` and `dtype`.** The returned TensorHandle
|
||||||
|
points at the IPCQ ring slot and can be used directly as a `dst`
|
||||||
|
handle (e.g. `tl.store(pe_addr, recv)`). Phase 2's `dma_write` replay
|
||||||
|
handles the (slot → hbm) copy, so user code never has to touch
|
||||||
|
`recv.data`.
|
||||||
|
|
||||||
|
### Step 2: register in `ccl.yaml`
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
algorithms:
|
||||||
|
hello_send:
|
||||||
|
module: kernbench.ccl.algorithms.hello_send
|
||||||
|
topology: ring_1d
|
||||||
|
buffer_kind: tcm
|
||||||
|
world_size: 8
|
||||||
|
```
|
||||||
|
|
||||||
|
`world_size` here is optional. If absent, `AhbmCCLBackend` derives it
|
||||||
|
from the topology spec (`sips × cubes_per_sip × pes_per_cube`).
|
||||||
|
|
||||||
|
### Step 3: write a host bench (optional — the unified bench may suffice)
|
||||||
|
|
||||||
|
For most CCL benchmarks the existing `benches/ccl_allreduce.py` is
|
||||||
|
sufficient: it reads `ccl.yaml`, picks the algorithm, sets up the
|
||||||
|
process group, and runs the collective. If your algorithm needs custom
|
||||||
|
host logic, write a new bench file along the same lines.
|
||||||
|
|
||||||
|
The host code looks like a real PyTorch DDP worker:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""benches/ccl_hello.py"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
|
|
||||||
|
N_ELEM = 8
|
||||||
|
|
||||||
|
|
||||||
|
def worker(rank: int, world_size: int, torch) -> None:
|
||||||
|
"""Per-rank business logic — mirrors a real PyTorch DDP worker."""
|
||||||
|
dp = DPPolicy(
|
||||||
|
cube="replicate", pe="column_wise",
|
||||||
|
num_cubes=1, num_pes=world_size,
|
||||||
|
)
|
||||||
|
tensor = torch.zeros(
|
||||||
|
(1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Per-rank initialization via the real PyTorch idiom.
|
||||||
|
init = np.zeros((1, world_size * N_ELEM), dtype=np.float16)
|
||||||
|
for r in range(world_size):
|
||||||
|
init[0, r * N_ELEM : (r + 1) * N_ELEM] = float(r + 1)
|
||||||
|
tensor.copy_(torch.from_numpy(init))
|
||||||
|
|
||||||
|
# The collective itself.
|
||||||
|
torch.distributed.all_reduce(tensor, op="sum")
|
||||||
|
|
||||||
|
# Verify on rank 0 (real PyTorch DDP idiom).
|
||||||
|
if rank == 0:
|
||||||
|
result = tensor.numpy()
|
||||||
|
for r in range(world_size):
|
||||||
|
expected = float(((r - 1) % world_size) + 1)
|
||||||
|
slice_r = result[0, r * N_ELEM : (r + 1) * N_ELEM]
|
||||||
|
print(
|
||||||
|
f" rank {r}: got {float(slice_r.mean()):.1f}, "
|
||||||
|
f"expected {expected:.1f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run(torch) -> None:
|
||||||
|
"""CLI entry point. Initializes dist, dispatches to worker."""
|
||||||
|
dist = torch.distributed
|
||||||
|
dist.init_process_group(backend="ahbm")
|
||||||
|
worker(
|
||||||
|
rank=dist.get_rank(),
|
||||||
|
world_size=dist.get_world_size(),
|
||||||
|
torch=torch,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: unit test (optional but strongly recommended)
|
||||||
|
|
||||||
|
`tests/test_hello_send.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.ccl.algorithms.hello_send import kernel
|
||||||
|
from kernbench.ccl.testing import run_kernel_in_mock
|
||||||
|
|
||||||
|
|
||||||
|
def test_hello_send_4_ranks():
|
||||||
|
n_elem = 8
|
||||||
|
inputs = [
|
||||||
|
np.full((n_elem,), float(r + 1), dtype=np.float16)
|
||||||
|
for r in range(4)
|
||||||
|
]
|
||||||
|
outputs = run_kernel_in_mock(
|
||||||
|
kernel_fn=kernel,
|
||||||
|
world_size=4,
|
||||||
|
topology="ring_1d",
|
||||||
|
inputs=inputs,
|
||||||
|
kernel_args=(n_elem,),
|
||||||
|
)
|
||||||
|
# rank r should now hold rank (r-1) % 4's data.
|
||||||
|
for r in range(4):
|
||||||
|
assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
|
||||||
|
```
|
||||||
|
|
||||||
|
`run_kernel_in_mock` runs every rank concurrently in pure Python (no
|
||||||
|
SimPy), so a unit test like this finishes in **milliseconds**. It only
|
||||||
|
verifies algorithmic correctness — no latency, no DMA, no fabric.
|
||||||
|
|
||||||
|
### Step 5: SimPy validation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kernbench run --topology topology.yaml --bench ccl_hello --verify-data
|
||||||
|
```
|
||||||
|
|
||||||
|
Phase 1 runs the SimPy simulation + MemoryStore data movement, Phase 2
|
||||||
|
replays the op_log for correctness. The bench's `print` lines should
|
||||||
|
show matching `got` and `expected` values for every rank.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Ring all-reduce — the second algorithm
|
||||||
|
|
||||||
|
Slightly more complex. Each PE runs `world_size - 1` rounds, sending
|
||||||
|
its current tile east and accumulating the tile received from the west.
|
||||||
|
After all rounds, every PE holds the global sum.
|
||||||
|
|
||||||
|
The reference implementation lives in
|
||||||
|
[`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py).
|
||||||
|
The core flow:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Ring all-reduce."""
|
||||||
|
|
||||||
|
|
||||||
|
def kernel(t_ptr, n_elem, world_size, tl):
|
||||||
|
local_pe = tl.program_id(axis=0)
|
||||||
|
cube_id = tl.program_id(axis=1)
|
||||||
|
pes_per_cube = tl.num_programs(axis=0)
|
||||||
|
rank = cube_id * pes_per_cube + local_pe
|
||||||
|
nbytes = n_elem * 2
|
||||||
|
pe_addr = t_ptr + rank * nbytes
|
||||||
|
|
||||||
|
# The handle points at HBM[pe_addr]. In greenlet mode .data is
|
||||||
|
# populated, but the kernel never has to touch .data directly.
|
||||||
|
acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
|
||||||
|
current = acc # source for the first send
|
||||||
|
|
||||||
|
for _step in range(world_size - 1):
|
||||||
|
tl.send(dir="E", src=current)
|
||||||
|
recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||||
|
# TensorHandle operator overload → MathCmd → PE_MATH dispatch.
|
||||||
|
# Phase 1 only models timing; Phase 2 DataExecutor replays the
|
||||||
|
# actual numpy accumulation.
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv # forward the received slot to the next round
|
||||||
|
|
||||||
|
# Store the final accumulator back to HBM. Source is acc (a PE-local
|
||||||
|
# scratch addr); dst is HBM. The op_log dma_write entry records both
|
||||||
|
# ends so Phase 2 copies the math result into HBM at verify time.
|
||||||
|
tl.store(pe_addr, acc)
|
||||||
|
|
||||||
|
|
||||||
|
def kernel_args(world_size: int, n_elem: int) -> tuple:
|
||||||
|
return (n_elem, world_size)
|
||||||
|
```
|
||||||
|
|
||||||
|
Four key points:
|
||||||
|
|
||||||
|
1. **Accumulation goes through TensorHandle operators.** `acc + recv`
|
||||||
|
emits a `MathCmd` and dispatches it through PE_MATH — i.e. the
|
||||||
|
real hardware path, so the latency model stays accurate. Per
|
||||||
|
ADR-0020 D3, Phase 1 only simulates timing; Phase 2's `DataExecutor`
|
||||||
|
replays the op_log and runs the actual numpy accumulation.
|
||||||
|
2. **Use `current = recv` to forward.** Each round must update the send
|
||||||
|
source to the just-received slot handle so the same data circulates
|
||||||
|
exactly once around the ring. Setting `current = acc` would resend
|
||||||
|
the cumulative sum, inflating the result.
|
||||||
|
3. **`tl.store(pe_addr, acc)` exactly once at the end.** Do not use a
|
||||||
|
store→reload pattern in the middle. `acc` lives in PE-local scratch;
|
||||||
|
the op_log records `(src=scratch, dst=hbm)` and Phase 2 first runs
|
||||||
|
math (filling scratch) then copies via the dma_write snapshot.
|
||||||
|
4. **`world_size` is passed by the host explicitly.** TL only knows the
|
||||||
|
topology slot count (e.g. `num_programs(axis=0)` is "PEs per cube"),
|
||||||
|
not the participating CCL group size. The host bench knows
|
||||||
|
`world_size` and forwards it as an explicit kernel argument.
|
||||||
|
|
||||||
|
For registration in `ccl.yaml` and wiring through the unified bench,
|
||||||
|
look at the existing `ring_allreduce_tcm/_hbm/_sram` entries plus
|
||||||
|
[`benches/ccl_allreduce.py`](../benches/ccl_allreduce.py). Mock unit
|
||||||
|
tests live in
|
||||||
|
[`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
|
||||||
|
and follow the `kernel_args=(n_elem, world_size)` convention.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. `neighbors()` override — custom topology
|
||||||
|
|
||||||
|
Most algorithms are happy with the builtin topologies (`ring_1d`,
|
||||||
|
`mesh_2d`, `tree_binary`, `ring_1d_unidir`, `none`). If you want to
|
||||||
|
modify a builtin or define a brand-new connectivity pattern, define a
|
||||||
|
`neighbors()` function in your algorithm module.
|
||||||
|
|
||||||
|
### Signature
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(
|
||||||
|
rank: int, world_size: int, neighbor_map: dict[str, int],
|
||||||
|
) -> dict[str, int] | None:
|
||||||
|
"""Override the neighbor map produced by the builtin topology.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
neighbor_map: the mapping the ccl.yaml ``topology`` field built.
|
||||||
|
For ring_1d this is {"E": (rank+1)%ws, "W": (rank-1)%ws}.
|
||||||
|
The dict is mutable — modify in place if you want.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: the new neighbor map (or the modified-in-place dict).
|
||||||
|
None: do not override; use neighbor_map as-is.
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern A: tweak a builtin
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
# Only even ranks use W; remove W from odd ranks.
|
||||||
|
if rank % 2 == 1:
|
||||||
|
neighbor_map.pop("W", None)
|
||||||
|
return neighbor_map
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern B: replace entirely (skip-connection ring)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
return {"E": (rank + 2) % world_size}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern C: keep builtin
|
||||||
|
|
||||||
|
Either omit `neighbors` entirely or return None:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
return None # explicit "use the builtin"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. PE kernel API reference (ADR-0023 D4)
|
||||||
|
|
||||||
|
### IPCQ API
|
||||||
|
|
||||||
|
| API | Description | Blocking? |
|
||||||
|
|-----|-------------|-----------|
|
||||||
|
| `tl.send(dir, src=TensorHandle)` | Send to a peer in the given direction. | Yes (waits if peer slots are full) |
|
||||||
|
| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | Same, keyword form. | Yes |
|
||||||
|
| `tl.recv(dir, shape=..., dtype=...)` | Blocking recv from one direction. | Yes |
|
||||||
|
| `tl.recv(shape=..., dtype=...)` | Round-robin recv across all four directions. | Yes |
|
||||||
|
| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | Non-blocking recv. | No |
|
||||||
|
| `tl.wait(future)` | Wait for a non-blocking recv future → returns the resolved TensorHandle. | Yes |
|
||||||
|
|
||||||
|
### Existing TL API (ADR-0020/0022, unchanged)
|
||||||
|
|
||||||
|
| API | Description |
|
||||||
|
|-----|-------------|
|
||||||
|
| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; in greenlet mode `.data` carries the ndarray. |
|
||||||
|
| `tl.store(addr, handle)` | DMA write — when `handle.data` is set the runner propagates it to MemoryStore. |
|
||||||
|
| `tl.composite(op, ...)` | Submit a GEMM/Math composite (non-blocking). |
|
||||||
|
| `tl.program_id(axis=0)` | Local PE id within the cube. |
|
||||||
|
| `tl.program_id(axis=1)` | Cube id (ADR-0022). |
|
||||||
|
| `tl.num_programs(axis=0/1)` | Topology slot counts (NOT the participating-rank count). |
|
||||||
|
|
||||||
|
### Two recv modes
|
||||||
|
|
||||||
|
The default is `return_slot` (zero-copy): the IPCQ slot address is
|
||||||
|
returned in `handle.addr`. To force a copy into a custom destination,
|
||||||
|
pass `dst_addr` + `dst_space`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
recv = tl.recv(
|
||||||
|
dir="W", shape=(8,), dtype="f16",
|
||||||
|
dst_addr=my_scratch_addr,
|
||||||
|
dst_space="hbm",
|
||||||
|
)
|
||||||
|
# After this call recv.addr == my_scratch_addr (copy_to_dst mode).
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Helpers (`kernbench.ccl.helpers`)
|
||||||
|
|
||||||
|
Convenience helpers to keep algorithm code short:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from kernbench.ccl.helpers import chunked, ring_step, tree_step
|
||||||
|
```
|
||||||
|
|
||||||
|
### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
|
||||||
|
|
||||||
|
Split a tile of `n_elem` elements into `n_chunks` equal-size views.
|
||||||
|
Each `Chunk` has `addr`, `n_elem`, `nbytes` fields.
|
||||||
|
|
||||||
|
```python
|
||||||
|
chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
|
||||||
|
# chunks[0..3] are 16-element views with consecutive addresses.
|
||||||
|
```
|
||||||
|
|
||||||
|
### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
|
||||||
|
|
||||||
|
Per-step chunk indices for a ring algorithm (reduce-scatter / all-gather):
|
||||||
|
|
||||||
|
```python
|
||||||
|
for step in range(world_size - 1):
|
||||||
|
send_idx, recv_idx = ring_step(rank, step, world_size)
|
||||||
|
tl.send(
|
||||||
|
dir="E", src_addr=chunks[send_idx].addr,
|
||||||
|
nbytes=chunks[send_idx].nbytes,
|
||||||
|
shape=(chunks[send_idx].n_elem,), dtype="f16",
|
||||||
|
)
|
||||||
|
recv = tl.recv(
|
||||||
|
dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16",
|
||||||
|
)
|
||||||
|
# accumulate ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
|
||||||
|
|
||||||
|
Parent / children rank ids for a binary tree:
|
||||||
|
|
||||||
|
```python
|
||||||
|
info = tree_step(rank, world_size)
|
||||||
|
if info["parent"] is None:
|
||||||
|
print(f"rank {rank} is the root")
|
||||||
|
for child in info["children"]:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Unit testing — Mock runtime
|
||||||
|
|
||||||
|
`kernbench.ccl.testing.run_kernel_in_mock` runs an algorithm without
|
||||||
|
SimPy for fast feedback.
|
||||||
|
|
||||||
|
### Basic usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.ccl.testing import run_kernel_in_mock
|
||||||
|
from kernbench.ccl.algorithms.my_algo import kernel
|
||||||
|
|
||||||
|
|
||||||
|
def test_my_algo():
|
||||||
|
n_elem = 16
|
||||||
|
inputs = [np.arange(n_elem, dtype=np.float16) + r for r in range(4)]
|
||||||
|
expected = sum(inputs)
|
||||||
|
outputs = run_kernel_in_mock(
|
||||||
|
kernel_fn=kernel,
|
||||||
|
world_size=4,
|
||||||
|
topology="ring_1d",
|
||||||
|
inputs=inputs,
|
||||||
|
kernel_args=(n_elem, 4), # positional args after t_ptr
|
||||||
|
)
|
||||||
|
for r in range(4):
|
||||||
|
assert np.allclose(outputs[r], expected, rtol=1e-3)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Behavior
|
||||||
|
|
||||||
|
- All ranks run their kernels concurrently as cooperative greenlets.
|
||||||
|
- `tl.send` / `tl.recv` are serviced by in-memory FIFOs (no DMA, no
|
||||||
|
latency).
|
||||||
|
- Each rank's last `store` is what the helper returns as a numpy array.
|
||||||
|
|
||||||
|
### Limitations
|
||||||
|
|
||||||
|
- No latency or performance numbers (it is not a simulation).
|
||||||
|
- No PE_DMA, fabric, or BW model.
|
||||||
|
- Correctness only.
|
||||||
|
- One cube assumed: `program_id(axis=1)` is always 0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Debugging
|
||||||
|
|
||||||
|
### CCL trace
|
||||||
|
|
||||||
|
```bash
|
||||||
|
KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
|
||||||
|
--bench ccl_allreduce --verify-data
|
||||||
|
```
|
||||||
|
|
||||||
|
Per-rank send/recv events appear on stdout:
|
||||||
|
|
||||||
|
```
|
||||||
|
[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
|
||||||
|
[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pointer dump
|
||||||
|
|
||||||
|
`kernbench.ccl.diagnostics.pointer_dump(engine)` returns a multi-line
|
||||||
|
dump of every PE_IPCQ ring buffer's `my_head`, `my_tail`,
|
||||||
|
`peer_head_cache`, `peer_tail_cache`. When something hangs, this shows
|
||||||
|
which rank is stuck and on what.
|
||||||
|
|
||||||
|
### Deadlock detection
|
||||||
|
|
||||||
|
When the SimPy schedule empties because of unmatched send/recv pairs,
|
||||||
|
the engine raises `IpcqDeadlock` and embeds the pointer dump in the
|
||||||
|
message (ADR-0023 D14 F3). Wait-for-graph visualization is future
|
||||||
|
work.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Common mistakes
|
||||||
|
|
||||||
|
### 1. Using a direction that wasn't installed
|
||||||
|
|
||||||
|
`topology: ring_1d` only installs E and W. Trying:
|
||||||
|
|
||||||
|
```python
|
||||||
|
tl.send(dir="N", ...) # → IpcqInvalidDirection
|
||||||
|
```
|
||||||
|
|
||||||
|
Fix: switch to `topology: mesh_2d`, or add N/S in a `neighbors()` override.
|
||||||
|
|
||||||
|
### 2. `send` without a matching `recv`
|
||||||
|
|
||||||
|
```python
|
||||||
|
def kernel(..., tl):
|
||||||
|
for _ in range(100):
|
||||||
|
tl.send(dir="E", ...)
|
||||||
|
# The peer never recvs → ring buffer fills → backpressure → deadlock.
|
||||||
|
```
|
||||||
|
|
||||||
|
Fix: every `send` needs a matching `recv` on the receiver side.
|
||||||
|
Otherwise `IpcqDeadlock` is raised.
|
||||||
|
|
||||||
|
### 3. dtype/shape mismatch
|
||||||
|
|
||||||
|
By default mismatches are not validated. The author is responsible for
|
||||||
|
consistency. Set `strict_validation: true` on a PE_IPCQ node's attrs to
|
||||||
|
enable D14 F2 strict mode and catch them immediately.
|
||||||
|
|
||||||
|
### 4. Assuming round-robin recv fairness
|
||||||
|
|
||||||
|
`tl.recv()` (no direction) returns the first slot to arrive in
|
||||||
|
round-robin order, but **arrival order is not predictable**. If your
|
||||||
|
algorithm depends on a particular direction, name it explicitly:
|
||||||
|
`tl.recv(dir="N", ...)`.
|
||||||
|
|
||||||
|
### 5. Confusing `num_programs` with the CCL group size
|
||||||
|
|
||||||
|
`tl.num_programs(axis=0/1)` reports topology slot counts, not the
|
||||||
|
number of ranks participating in the collective. The host bench knows
|
||||||
|
`world_size` and must pass it through as a kernel argument.
|
||||||
|
|
||||||
|
### 6. Overwriting the send source before it's actually sent
|
||||||
|
|
||||||
|
PE_DMA snapshots the source data into the IpcqDmaToken at send time,
|
||||||
|
preserving in-flight semantics. Even so, the safest pattern is to call
|
||||||
|
`tl.send` first and only mutate the source buffer afterwards. If you
|
||||||
|
mutate the buffer before `tl.send` makes it into the PE_DMA queue, the
|
||||||
|
snapshot will pick up the wrong data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Next steps
|
||||||
|
|
||||||
|
- Try other topologies (`mesh_2d`, `tree_binary`).
|
||||||
|
- Faster algorithms (recursive halving / doubling).
|
||||||
|
- Compare `buffer_kind` (tcm/hbm/sram) and `backpressure` (poll/sleep)
|
||||||
|
modes for latency.
|
||||||
|
- Larger-scale validation through the unified `ccl_allreduce` bench
|
||||||
|
with different `ccl.yaml` overlays.
|
||||||
|
|
||||||
|
If you add a new algorithm or pattern, please send a PR.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective design.
|
||||||
|
- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1).
|
||||||
|
- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution.
|
||||||
|
- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor.
|
||||||
|
|
||||||
|
Existing algorithm examples:
|
||||||
|
|
||||||
|
- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — simplest send/recv
|
||||||
|
- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
|
||||||
|
- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
|
||||||
|
- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
|
||||||
@@ -0,0 +1,537 @@
|
|||||||
|
# CCL Algorithm Author Guide
|
||||||
|
|
||||||
|
이 문서는 kernbench에서 CCL (Collective Communication Library) 알고리즘을
|
||||||
|
직접 작성하는 사람을 위한 step-by-step 가이드이다. 시스템 내부 설계와
|
||||||
|
컴포넌트 구조는 [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md)에 있다.
|
||||||
|
|
||||||
|
본 가이드는 알고리즘 작성자가 **자신이 만져야 할 곳**과 **만지지 않아도 될 곳**을
|
||||||
|
명확히 분리하고, 가장 짧은 경로로 첫 알고리즘을 동작시키는 것을 목표로 한다.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. 5분 요약
|
||||||
|
|
||||||
|
| 만지는 것 | 위치 |
|
||||||
|
|----------|------|
|
||||||
|
| 알고리즘 모듈 (kernel + 선택적 neighbors) | `src/kernbench/ccl/algorithms/<algo>.py` |
|
||||||
|
| 알고리즘 등록 | `ccl.yaml` |
|
||||||
|
| 호스트 bench (PE 수, 메모리 init, launch, 검증) | `benches/<your_bench>.py` |
|
||||||
|
| (선택) 단위 테스트 | `tests/test_<algo>.py` |
|
||||||
|
|
||||||
|
| 만지지 않는 것 | 위치 |
|
||||||
|
|---------------|------|
|
||||||
|
| TLContext API | `src/kernbench/triton_emu/tl_context.py` (ADR-0022 spec) |
|
||||||
|
| 프레임워크 (topology generators, helpers, mock testing) | `src/kernbench/ccl/` |
|
||||||
|
| PE_IPCQ / PE_DMA 컴포넌트 | `src/kernbench/components/builtin/` |
|
||||||
|
| backend 구현 (install_ipcq) | `src/kernbench/runtime_api/distributed.py` 및 `kernbench/ccl/install.py` |
|
||||||
|
|
||||||
|
흐름:
|
||||||
|
1. 알고리즘 모듈에 `kernel` 작성
|
||||||
|
2. `ccl.yaml`에 entry 등록
|
||||||
|
3. 호스트 bench에서 `install_ipcq` + `launch`
|
||||||
|
4. (선택) mock runtime으로 단위 테스트 (수 ms)
|
||||||
|
5. `kernbench run --bench <name> --verify-data`로 SimPy 검증
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Hello World — 가장 단순한 send/recv
|
||||||
|
|
||||||
|
각 PE가 자기 데이터를 E 방향 이웃에 한 번 보내고, W 방향에서 한 번 받는
|
||||||
|
가장 단순한 알고리즘이다. 실제 동작 코드는
|
||||||
|
[`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py)
|
||||||
|
에 있다.
|
||||||
|
|
||||||
|
### Step 1: kernel 작성
|
||||||
|
|
||||||
|
새 파일 `src/kernbench/ccl/algorithms/hello_send.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Hello world: 자기 데이터를 다음 rank에 보내고 이전 rank에서 받기."""
|
||||||
|
def kernel(t_ptr, n_elem, tl):
|
||||||
|
# 글로벌 rank는 program_id(0/1)에서 계산 (ADR-0022)
|
||||||
|
local_pe = tl.program_id(axis=0)
|
||||||
|
cube_id = tl.program_id(axis=1)
|
||||||
|
pes_per_cube = tl.num_programs(axis=0)
|
||||||
|
rank = cube_id * pes_per_cube + local_pe
|
||||||
|
|
||||||
|
nbytes = n_elem * 2 # f16
|
||||||
|
pe_addr = t_ptr + rank * nbytes
|
||||||
|
|
||||||
|
# 자기 슬라이스를 로드해서 E로 보낸다.
|
||||||
|
src = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
|
||||||
|
tl.send(dir="E", src=src)
|
||||||
|
|
||||||
|
# W 방향에서 받아서 그대로 자기 슬라이스에 store한다.
|
||||||
|
recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||||
|
tl.store(pe_addr, recv)
|
||||||
|
```
|
||||||
|
|
||||||
|
핵심 포인트:
|
||||||
|
|
||||||
|
- **글로벌 rank는 `program_id(axis=0)` + `program_id(axis=1)`에서 계산.** TL에는
|
||||||
|
`tl.rank` / `tl.world_size` 같은 약속되지 않은 확장이 없다. 호스트가
|
||||||
|
`world_size` 같은 알고리즘 파라미터가 필요하면 `torch.launch`의 일반 인자로
|
||||||
|
전달한다.
|
||||||
|
- **`tl.send`는 `TensorHandle`을 받는다.** 핸들의 `addr`/`space`/`shape`/`dtype`/`nbytes`를
|
||||||
|
PE_IPCQ가 읽어 PE_DMA에 IpcqDmaToken을 발행한다.
|
||||||
|
- **`tl.recv`는 `shape`와 `dtype`이 필수.** 반환된 TensorHandle은 IPCQ ring slot을
|
||||||
|
가리키며, `tl.store(pe_addr, recv)`처럼 dst 핸들로 그대로 사용할 수 있다.
|
||||||
|
Phase 2 dma_write replay가 (slot, hbm) 복사를 수행하므로 numpy `.data`를
|
||||||
|
직접 만질 필요가 없다.
|
||||||
|
|
||||||
|
### Step 2: ccl.yaml 등록
|
||||||
|
|
||||||
|
`ccl.yaml`의 `algorithms` 섹션에 entry를 추가한다. (defaults.algorithm은 호스트
|
||||||
|
bench가 `install_ipcq(algorithm=...)`로 명시 전달해도 되므로 꼭 바꿀 필요는 없다.)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
algorithms:
|
||||||
|
hello_send:
|
||||||
|
module: kernbench.ccl.algorithms.hello_send
|
||||||
|
topology: ring_1d
|
||||||
|
buffer_kind: tcm
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: 호스트 bench 작성
|
||||||
|
|
||||||
|
새 파일 `benches/ccl_hello.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Hello-world ring rotation bench (각 PE가 W 이웃의 데이터를 1번 받음)."""
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.ccl.algorithms import hello_send
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
|
ALGORITHM = "hello_send"
|
||||||
|
N_ELEM = 8
|
||||||
|
WORLD_SIZE = 8
|
||||||
|
|
||||||
|
|
||||||
|
def run(torch):
|
||||||
|
plan = torch.install_ipcq(algorithm=ALGORITHM)
|
||||||
|
|
||||||
|
a = torch.zeros(
|
||||||
|
(1, WORLD_SIZE * N_ELEM), dtype="f16",
|
||||||
|
dp=DPPolicy(
|
||||||
|
cube="replicate", pe="column_wise",
|
||||||
|
num_cubes=1,
|
||||||
|
),
|
||||||
|
name="hello_in",
|
||||||
|
)
|
||||||
|
|
||||||
|
store = torch.engine.memory_store
|
||||||
|
base = a._handle.va_base or a._handle.shards[0].pa
|
||||||
|
nbytes = N_ELEM * 2
|
||||||
|
for r in range(WORLD_SIZE):
|
||||||
|
store.write("hbm", base + r * nbytes,
|
||||||
|
np.full((N_ELEM,), float(r + 1), dtype=np.float16))
|
||||||
|
|
||||||
|
torch.launch(ALGORITHM, hello_send.kernel, a, N_ELEM)
|
||||||
|
|
||||||
|
# rank r은 rank (r-1)%ws의 데이터를 가져야 한다.
|
||||||
|
for r, (sip, cube, pe) in enumerate(plan["rank_to_pe"]):
|
||||||
|
result = store.read("hbm", base + r * nbytes, shape=(N_ELEM,), dtype="f16")
|
||||||
|
prev = float(((r - 1) % WORLD_SIZE) + 1)
|
||||||
|
ok = np.allclose(result, prev)
|
||||||
|
print(f" [{'OK ' if ok else 'FAIL'}] rank {r} got {float(result.mean()):.1f}, "
|
||||||
|
f"expected {prev:.1f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: 단위 테스트 (선택, 강력 추천)
|
||||||
|
|
||||||
|
`tests/test_hello_send.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import numpy as np
|
||||||
|
from kernbench.ccl.algorithms.hello_send import kernel
|
||||||
|
from kernbench.ccl.testing import run_kernel_in_mock
|
||||||
|
|
||||||
|
|
||||||
|
def test_hello_send_4_ranks():
|
||||||
|
n_elem = 8
|
||||||
|
inputs = [np.full((n_elem,), float(r + 1), dtype=np.float16) for r in range(4)]
|
||||||
|
|
||||||
|
outputs = run_kernel_in_mock(
|
||||||
|
kernel_fn=kernel,
|
||||||
|
world_size=4,
|
||||||
|
topology="ring_1d",
|
||||||
|
inputs=inputs,
|
||||||
|
kernel_args=(n_elem,),
|
||||||
|
)
|
||||||
|
|
||||||
|
# rank r은 rank (r-1) % 4의 데이터를 받아야 함
|
||||||
|
for r in range(4):
|
||||||
|
assert np.array_equal(outputs[r], inputs[(r - 1) % 4])
|
||||||
|
```
|
||||||
|
|
||||||
|
`run_kernel_in_mock`는 SimPy 없이 순수 Python으로 모든 rank를 동시 실행하므로
|
||||||
|
**ms 단위로 끝난다**. 알고리즘 logic 정합성만 검증.
|
||||||
|
|
||||||
|
### Step 5: 시뮬 검증
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kernbench run --topology topology.yaml --bench ccl_hello --verify-data
|
||||||
|
```
|
||||||
|
|
||||||
|
Phase 1에서 SimPy 시뮬레이션 + MemoryStore 데이터 이동, Phase 2에서 op_log
|
||||||
|
정합성 replay. 호스트 bench의 `print` 검증이 모든 rank에 대해 OK여야 한다.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Ring All-Reduce — 두 번째 알고리즘
|
||||||
|
|
||||||
|
조금 더 복잡한 예제. Ring all-reduce는 N-1 라운드 동안 각 PE가 자기 데이터를
|
||||||
|
E로 보내고 W에서 받아 누적한다. 최종적으로 모든 PE가 글로벌 sum을 갖는다.
|
||||||
|
|
||||||
|
실제 동작 코드는 [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py)
|
||||||
|
참조. 핵심 흐름:
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Ring all-reduce."""
|
||||||
|
|
||||||
|
|
||||||
|
def kernel(t_ptr, n_elem, world_size, tl):
|
||||||
|
# rank
|
||||||
|
local_pe = tl.program_id(axis=0)
|
||||||
|
cube_id = tl.program_id(axis=1)
|
||||||
|
pes_per_cube = tl.num_programs(axis=0)
|
||||||
|
rank = cube_id * pes_per_cube + local_pe
|
||||||
|
nbytes = n_elem * 2
|
||||||
|
pe_addr = t_ptr + rank * nbytes
|
||||||
|
|
||||||
|
# HBM의 자기 슬라이스를 가리키는 TensorHandle. greenlet 모드에선 .data가
|
||||||
|
# 채워지지만 커널은 .data를 직접 만질 필요가 없다.
|
||||||
|
acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
|
||||||
|
current = acc # 첫 라운드 send 출처
|
||||||
|
|
||||||
|
for _step in range(world_size - 1):
|
||||||
|
tl.send(dir="E", src=current)
|
||||||
|
recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||||
|
# TensorHandle 연산자 오버로드 → MathCmd → PE_MATH 디스패치.
|
||||||
|
# Phase 1은 타이밍만, Phase 2 DataExecutor가 실제 numpy 누적을 수행한다.
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv # 다음 라운드는 직전에 받은 슬롯을 다시 forward
|
||||||
|
|
||||||
|
# 최종 누적값을 자기 슬라이스에 store. 출처는 acc(=PE-local scratch addr)
|
||||||
|
# 이고 dst는 HBM. op_log dma_write가 (scratch, hbm) 복사 정보를 기록하므로
|
||||||
|
# Phase 2가 검증 시점에 HBM[pe_addr]에 정답을 채워준다.
|
||||||
|
tl.store(pe_addr, acc)
|
||||||
|
```
|
||||||
|
|
||||||
|
네 가지 포인트:
|
||||||
|
|
||||||
|
1. **누적은 TensorHandle 연산자**: `acc + recv`는 `MathCmd`를 emit하고
|
||||||
|
PE_MATH로 디스패치된다 — 실제 하드웨어 경로를 거치므로 latency 모델이
|
||||||
|
정확하다. ADR-0020 D3대로 Phase 1은 타이밍만 시뮬레이션하고, Phase 2
|
||||||
|
`DataExecutor`가 op_log를 재실행하면서 numpy 누적을 수행한다.
|
||||||
|
2. **`current = recv`로 forward**: 매 라운드의 send 출처를 직전에 받은 슬롯
|
||||||
|
핸들로 갱신해야 같은 데이터가 ring을 순회하면서 누적이 한 번씩 일어난다.
|
||||||
|
`current = acc`로 두면 누적값이 다시 송출되어 결과가 부풀려진다.
|
||||||
|
3. **`tl.store(pe_addr, acc)` 한 번이면 끝**: 중간에 store→reload 패턴은
|
||||||
|
금지다. acc는 PE-local scratch에 살고, op_log가 (src=scratch, dst=hbm)
|
||||||
|
메타데이터를 기록한다. Phase 2가 math를 먼저 실행해 scratch를 채운 뒤
|
||||||
|
dma_write 스냅샷으로 HBM에 복사한다.
|
||||||
|
4. **`world_size`는 호스트가 명시 전달**: TL은 topology slot 수만 안다 (예:
|
||||||
|
`num_programs(axis=0)`은 cube당 PE 수). 실제 참여하는 CCL group 크기는 bench가
|
||||||
|
알고 호스트→kernel 인자로 넘긴다.
|
||||||
|
|
||||||
|
`ccl.yaml` 등록 + 호스트 bench는 [`benches/ccl_allreduce_tcm.py`](../benches/ccl_allreduce_tcm.py)
|
||||||
|
참조. mock 단위 테스트는 [`tests/test_ccl_mock_runtime.py`](../tests/test_ccl_mock_runtime.py)
|
||||||
|
를 그대로 따라하면 된다 (`kernel_args=(n_elem, world_size)` 인자 형태).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. neighbors() override — Custom topology
|
||||||
|
|
||||||
|
대부분의 알고리즘은 builtin topology(`ring_1d`, `mesh_2d`, `tree_binary`,
|
||||||
|
`ring_1d_unidir`, `none`)로 충분하다. builtin을 변형하거나 새로 만들고 싶으면
|
||||||
|
알고리즘 모듈에 `neighbors()`를 정의한다.
|
||||||
|
|
||||||
|
### 시그니처
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank: int, world_size: int, neighbor_map: dict[str, int]) -> dict[str, int] | None:
|
||||||
|
"""builtin topology가 만든 neighbor_map을 override.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
neighbor_map: ccl.yaml의 topology 필드가 만든 builtin 매핑.
|
||||||
|
예: ring_1d → {"E": (rank+1)%ws, "W": (rank-1)%ws}
|
||||||
|
mutable dict — 직접 수정 가능.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict: neighbor_map을 override한 결과 (또는 수정한 그 dict)
|
||||||
|
None: override 안 함, neighbor_map 그대로 사용
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern A: builtin을 base로 일부만 수정
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
# 짝수 rank만 W 방향 사용 (홀수 rank는 W 제거)
|
||||||
|
if rank % 2 == 1:
|
||||||
|
neighbor_map.pop("W", None)
|
||||||
|
return neighbor_map
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern B: 완전히 새로 작성 (skip-connection ring)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
# neighbor_map은 무시하고 새로 작성
|
||||||
|
return {"E": (rank + 2) % world_size}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pattern C: builtin 사용, override 없음
|
||||||
|
|
||||||
|
`neighbors()` 함수를 정의하지 않거나 None을 반환:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def neighbors(rank, world_size, neighbor_map):
|
||||||
|
return None # 명시적으로 builtin 사용
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. PE 커널 API 레퍼런스 (ADR-0023 D4)
|
||||||
|
|
||||||
|
### IPCQ API
|
||||||
|
|
||||||
|
| API | 설명 | Blocking? |
|
||||||
|
|-----|------|-----------|
|
||||||
|
| `tl.send(dir, src=TensorHandle)` | direction으로 데이터 send | Yes (peer slot full 시 wait) |
|
||||||
|
| `tl.send(dir, src_addr=..., nbytes=..., shape=..., dtype=..., space=...)` | 동일, keyword 형태 | Yes |
|
||||||
|
| `tl.recv(dir, shape=..., dtype=...)` | 특정 방향에서 blocking recv | Yes |
|
||||||
|
| `tl.recv(shape=..., dtype=...)` | 4방향 round-robin recv (방향 미지정) | Yes |
|
||||||
|
| `tl.recv_async(dir, shape=..., dtype=...) → RecvFuture` | non-blocking recv | No |
|
||||||
|
| `tl.wait(future)` | non-blocking future 완료 대기 → TensorHandle | Yes |
|
||||||
|
|
||||||
|
### 기존 TL API (ADR-0020/0022, 그대로 사용 가능)
|
||||||
|
|
||||||
|
| API | 설명 |
|
||||||
|
|-----|------|
|
||||||
|
| `tl.load(addr, shape, dtype) → TensorHandle` | DMA read; greenlet 모드에서 `.data`에 ndarray |
|
||||||
|
| `tl.store(addr, handle)` | DMA write — handle.data가 있으면 MemoryStore에 propagate |
|
||||||
|
| `tl.composite(op, ...)` | GEMM/Math compute 비동기 submit |
|
||||||
|
| `tl.program_id(axis=0)` | cube 내 local PE id |
|
||||||
|
| `tl.program_id(axis=1)` | cube id (ADR-0022) |
|
||||||
|
| `tl.num_programs(axis=0/1)` | topology 슬롯 수 (참여 ranks 수가 아님) |
|
||||||
|
|
||||||
|
### `recv` 두 가지 모드
|
||||||
|
|
||||||
|
기본은 `return_slot` (zero-copy): IPCQ slot 주소가 그대로 handle.addr에 들어온다.
|
||||||
|
slot 데이터를 별도 위치로 복사하고 싶으면 `dst_addr` + `dst_space`를 명시:
|
||||||
|
|
||||||
|
```python
|
||||||
|
recv = tl.recv(
|
||||||
|
dir="W", shape=(8,), dtype="f16",
|
||||||
|
dst_addr=my_scratch_addr,
|
||||||
|
dst_space="hbm",
|
||||||
|
)
|
||||||
|
# 이제 recv.addr == my_scratch_addr (copy_to_dst 모드)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Helpers (`kernbench.ccl.helpers`)
|
||||||
|
|
||||||
|
알고리즘 코드를 짧게 유지하기 위한 헬퍼들:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from kernbench.ccl.helpers import chunked, ring_step, tree_step
|
||||||
|
```
|
||||||
|
|
||||||
|
### `chunked(base_addr, n_chunks, n_elem, dtype="f16") → list[Chunk]`
|
||||||
|
|
||||||
|
총 `n_elem` 개의 element를 `n_chunks` 등분한 view 리스트를 반환. 각 `Chunk`는
|
||||||
|
`addr`, `n_elem`, `nbytes` 필드를 가진다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
chunks = chunked(t_ptr, n_chunks=4, n_elem=64, dtype="f16")
|
||||||
|
# chunks[0..3] 각각 16 element view, addr이 연속
|
||||||
|
```
|
||||||
|
|
||||||
|
### `ring_step(rank, step, world_size) → (send_idx, recv_idx)`
|
||||||
|
|
||||||
|
Ring algorithm의 step별 chunk 인덱스 (reduce-scatter / all-gather):
|
||||||
|
|
||||||
|
```python
|
||||||
|
for step in range(world_size - 1):
|
||||||
|
send_idx, recv_idx = ring_step(rank, step, world_size)
|
||||||
|
tl.send(dir="E", src_addr=chunks[send_idx].addr,
|
||||||
|
nbytes=chunks[send_idx].nbytes,
|
||||||
|
shape=(chunks[send_idx].n_elem,), dtype="f16")
|
||||||
|
recv = tl.recv(dir="W", shape=(chunks[recv_idx].n_elem,), dtype="f16")
|
||||||
|
# accumulate ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### `tree_step(rank, world_size) → {"parent": int|None, "children": list[int]}`
|
||||||
|
|
||||||
|
Binary tree의 parent/children rank:
|
||||||
|
|
||||||
|
```python
|
||||||
|
info = tree_step(rank, world_size)
|
||||||
|
if info["parent"] is None:
|
||||||
|
print(f"rank {rank} is the root")
|
||||||
|
for child in info["children"]:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. 단위 테스트 — Mock Runtime
|
||||||
|
|
||||||
|
`kernbench.ccl.testing.run_kernel_in_mock`은 SimPy를 거치지 않고 알고리즘을
|
||||||
|
빠르게 검증할 수 있다.
|
||||||
|
|
||||||
|
### 기본 사용법
|
||||||
|
|
||||||
|
```python
|
||||||
|
from kernbench.ccl.testing import run_kernel_in_mock
|
||||||
|
from kernbench.ccl.algorithms.my_algo import kernel
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def test_my_algo():
|
||||||
|
n_elem = 16
|
||||||
|
inputs = [np.arange(n_elem, dtype=np.float16) + r for r in range(4)]
|
||||||
|
expected = sum(inputs)
|
||||||
|
|
||||||
|
outputs = run_kernel_in_mock(
|
||||||
|
kernel_fn=kernel,
|
||||||
|
world_size=4,
|
||||||
|
topology="ring_1d",
|
||||||
|
inputs=inputs,
|
||||||
|
kernel_args=(n_elem, 4), # kernel의 (t_ptr 이후) 추가 positional 인자
|
||||||
|
)
|
||||||
|
|
||||||
|
for r in range(4):
|
||||||
|
assert np.allclose(outputs[r], expected, rtol=1e-3)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 동작
|
||||||
|
|
||||||
|
- 4개 rank의 kernel을 greenlet으로 동시 실행
|
||||||
|
- `tl.send/recv`를 in-memory FIFO로 즉시 처리 (DMA, latency 무시)
|
||||||
|
- 각 rank가 마지막에 store한 데이터를 ndarray로 반환
|
||||||
|
|
||||||
|
### 한계
|
||||||
|
|
||||||
|
- latency / 성능 측정 불가 (시뮬레이션이 아님)
|
||||||
|
- PE_DMA, fabric, BW 모델 안 함
|
||||||
|
- 정합성 검증만 가능
|
||||||
|
- 한 cube 안에서 동작하는 가정 — `program_id(axis=1)`은 항상 0
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. 디버깅
|
||||||
|
|
||||||
|
### CCL trace
|
||||||
|
|
||||||
|
```bash
|
||||||
|
KERNBENCH_CCL_TRACE=1 kernbench run --topology topology.yaml \
|
||||||
|
--bench ccl_allreduce_tcm --verify-data
|
||||||
|
```
|
||||||
|
|
||||||
|
각 rank의 send/recv 시점이 stdout에 출력된다:
|
||||||
|
|
||||||
|
```
|
||||||
|
[ccl t=346.4 send] sip0.cube0.pe1 dir=E nbytes=64 seq=0
|
||||||
|
[ccl t=360.4 recv] sip0.cube0.pe2 dir=W nbytes=64
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pointer dump
|
||||||
|
|
||||||
|
`kernbench.ccl.diagnostics.pointer_dump(engine)`는 모든 PE_IPCQ의 ring buffer
|
||||||
|
상태(`my_head`, `my_tail`, `peer_head_cache`, `peer_tail_cache`)를 multi-line
|
||||||
|
문자열로 반환한다. hang이 발생하면 어느 rank가 어떤 상태에서 막혔는지 한눈에
|
||||||
|
보인다.
|
||||||
|
|
||||||
|
### Deadlock detection
|
||||||
|
|
||||||
|
매칭되지 않는 send/recv 등으로 SimPy 스케줄이 비면 engine이 `IpcqDeadlock`을
|
||||||
|
던지며 pointer dump를 메시지에 포함시킨다 (ADR-0023 D14 F3). 별도 wait-for graph
|
||||||
|
시각화는 미래 작업.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. 흔한 실수
|
||||||
|
|
||||||
|
### 1. install 안 된 direction 사용
|
||||||
|
|
||||||
|
ccl.yaml의 `topology: ring_1d`는 E/W만 install한다. N/S 사용 시:
|
||||||
|
|
||||||
|
```python
|
||||||
|
tl.send(dir="N", ...) # → IpcqInvalidDirection 예외
|
||||||
|
```
|
||||||
|
|
||||||
|
해결: `topology: mesh_2d`로 바꾸거나, `neighbors()` override로 N/S 추가.
|
||||||
|
|
||||||
|
### 2. send만 호출하고 recv 없음
|
||||||
|
|
||||||
|
```python
|
||||||
|
def kernel(..., tl):
|
||||||
|
for _ in range(100):
|
||||||
|
tl.send(dir="E", ...)
|
||||||
|
# peer 측 recv 없음 → ring buffer 가득 차면 backpressure → deadlock
|
||||||
|
```
|
||||||
|
|
||||||
|
해결: 모든 send에 짝이 되는 recv가 있어야 한다. 안 그러면 `IpcqDeadlock`이
|
||||||
|
발생한다.
|
||||||
|
|
||||||
|
### 3. dtype/shape 불일치
|
||||||
|
|
||||||
|
기본 모드에서는 dtype/shape mismatch를 검증하지 않는다. 작성자가 직접 보장하거나,
|
||||||
|
PE_IPCQ 노드 attrs에 `strict_validation: true`를 설정해 D14 F2 strict 모드로
|
||||||
|
mismatch를 즉시 잡을 수 있다.
|
||||||
|
|
||||||
|
### 4. round-robin recv의 fairness 가정
|
||||||
|
|
||||||
|
`tl.recv()` (방향 미지정)는 round-robin으로 가져오지만, 도착한 첫 슬롯을 반환한다.
|
||||||
|
**도착 순서를 알 수 없으므로** 알고리즘이 도착 방향에 의존하면 안 된다.
|
||||||
|
필요하면 `tl.recv(dir="N", ...)`처럼 명시.
|
||||||
|
|
||||||
|
### 5. CCL 그룹 크기 가정
|
||||||
|
|
||||||
|
`tl.num_programs(axis=0/1)`은 토폴로지 슬롯 개수이지 CCL group 크기가 아니다.
|
||||||
|
참여하는 rank 수(`world_size`)는 호스트 bench가 알고 있고, kernel 인자로 명시
|
||||||
|
전달해야 한다.
|
||||||
|
|
||||||
|
### 6. 호스트가 send-source 메모리를 도착 전에 덮어씀
|
||||||
|
|
||||||
|
PE_DMA가 송신 시점에 src 데이터를 토큰에 스냅샷해서 in-flight 데이터의 의미가
|
||||||
|
보존된다. 따라서 하나의 PE 안에서 같은 주소를 여러 step에 걸쳐 갱신할 때는
|
||||||
|
direct send 후 다른 step에서 같은 주소를 store해도 안전하다 (token snapshot 덕분).
|
||||||
|
하지만 `tl.send`가 PE_DMA 큐에 enqueue되기 전에 주소를 덮어쓰면 잘못된 데이터가
|
||||||
|
스냅샷된다 — `tl.send`를 먼저, 메모리 변경을 나중에 하는 게 권장.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. 다음 단계
|
||||||
|
|
||||||
|
- `mesh_2d` / `tree_binary` 같은 다른 topology 활용
|
||||||
|
- recursive halving/doubling 등 더 빠른 알고리즘
|
||||||
|
- `buffer_kind` (tcm/hbm/sram) / `backpressure` (poll/sleep) 모드별 latency 비교
|
||||||
|
- `ccl_ring_allreduce_multicube.py`, `ccl_ring_allreduce_multisip.py`처럼 큰
|
||||||
|
scale의 ring 검증
|
||||||
|
|
||||||
|
새 알고리즘이나 패턴을 추가했다면 PR로 기여해주세요.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 참고
|
||||||
|
|
||||||
|
- [ADR-0023](adr/ADR-0023-ipcq-pe-collective.md): IPCQ + PE-level collective 설계
|
||||||
|
- [ADR-0022](adr/ADR-0022-program-id-2d-grid.md): 2D grid program_id (axis=0/1)
|
||||||
|
- [ADR-0020](adr/ADR-0020-data-execution-two-pass.md): 2-pass data execution
|
||||||
|
- [ADR-0021](adr/ADR-0021-pe-pipeline-refactor.md): PE pipeline refactor
|
||||||
|
|
||||||
|
기존 알고리즘 예제:
|
||||||
|
|
||||||
|
- [`src/kernbench/ccl/algorithms/hello_send.py`](../src/kernbench/ccl/algorithms/hello_send.py) — 가장 단순한 send/recv
|
||||||
|
- [`src/kernbench/ccl/algorithms/ring_allreduce.py`](../src/kernbench/ccl/algorithms/ring_allreduce.py) — ring all-reduce
|
||||||
|
- [`src/kernbench/ccl/algorithms/mesh_allreduce.py`](../src/kernbench/ccl/algorithms/mesh_allreduce.py) — 2D mesh all-reduce
|
||||||
|
- [`src/kernbench/ccl/algorithms/tree_allreduce.py`](../src/kernbench/ccl/algorithms/tree_allreduce.py) — binary tree all-reduce
|
||||||
+4
-2
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
name = "kernbench"
|
name = "kernbench"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12"]
|
dependencies = ["pytest", "simpy", "pyyaml", "fastapi>=0.110", "uvicorn[standard]>=0.29", "websockets>=12", "numpy>=1.24", "greenlet>=3.0"]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
kernbench = "kernbench.cli.main:main"
|
kernbench = "kernbench.cli.main:main"
|
||||||
@@ -18,6 +18,7 @@ include = ["kernbench*", "benches*"]
|
|||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
dev = [
|
dev = [
|
||||||
"pytest>=7",
|
"pytest>=7",
|
||||||
|
"pytest-xdist>=3.0",
|
||||||
"ruff>=0.4.0",
|
"ruff>=0.4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -31,4 +32,5 @@ select = ["E", "F", "I", "B", "UP"]
|
|||||||
ignore = ["F401"]
|
ignore = ["F401"]
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
addopts = ["--disable-warnings"]
|
addopts = ["--disable-warnings", "-n", "auto", "-m", "not slow"]
|
||||||
|
markers = ["slow: 256-rank full-system tests (~3min each, run with: pytest -m '')"]
|
||||||
|
|||||||
@@ -0,0 +1,9 @@
|
|||||||
|
"""CCL (Collective Communication Library) framework for kernbench (ADR-0023).
|
||||||
|
|
||||||
|
This package provides:
|
||||||
|
- topologies: builtin neighbor topology generators (ring/mesh/tree)
|
||||||
|
- helpers: utilities for algorithm authors (chunked, ring_step, ...)
|
||||||
|
- testing: mock CCL runtime for fast unit tests of algorithm kernels
|
||||||
|
|
||||||
|
See docs/adr/ADR-0023-ipcq-pe-collective.md and docs/ccl-author-guide.md.
|
||||||
|
"""
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
"""Intercube all-reduce kernel (pe0-only, same-lane across cubes).
|
||||||
|
|
||||||
|
Reduces across the 4×4 cube mesh within each SIP, then exchanges
|
||||||
|
between SIPs using the configured SIP topology, and broadcasts back.
|
||||||
|
|
||||||
|
Supported SIP topologies (selected via ``sip_topo_kind``):
|
||||||
|
0 — ring_1d: global_E/global_W ring, n_sips-1 rounds
|
||||||
|
1 — torus_2d: row ring (global_E/W) + col ring (global_S/N)
|
||||||
|
2 — mesh_2d: row chain reduce+broadcast + col chain reduce+broadcast
|
||||||
|
|
||||||
|
IPCQ wiring is handled by ``configure_sfr_intercube_multisip``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Integer codes passed to the kernel as ``sip_topo_kind`` to pick the
# inter-SIP exchange routine.
SIP_TOPO_RING, SIP_TOPO_TORUS, SIP_TOPO_MESH = 0, 1, 2

# Topology name -> ``sip_topo_kind`` code.
# NOTE(review): "mesh_2d" resolves to the torus code and only
# "mesh_2d_no_wrap" resolves to the mesh (open chain) code — presumably the
# builtin mesh_2d topology has wrap-around links; confirm.
TOPO_NAME_TO_KIND = {
    "ring_1d": SIP_TOPO_RING,
    "torus_2d": SIP_TOPO_TORUS,
    "mesh_2d": SIP_TOPO_TORUS,
    "mesh_2d_no_wrap": SIP_TOPO_MESH,
}
|
||||||
|
|
||||||
|
|
||||||
|
def kernel_args(
    world_size: int, n_elem: int, *, cube_w: int = 4, cube_h: int = 4
) -> tuple:
    """Build the leading kernel arguments for the intercube all-reduce.

    Args:
        world_size: number of participating ranks; emitted last, in the
            position the kernel signature names ``n_sips``.
        n_elem: f16 elements per cube tile.
        cube_w: cube mesh width (columns). Defaults to 4, the value that was
            previously hard-coded.
        cube_h: cube mesh height (rows). Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        ``(n_elem, cube_w, cube_h, world_size)``.
    """
    return (n_elem, cube_w, cube_h, world_size)
|
||||||
|
|
||||||
|
|
||||||
|
def _inter_sip_ring(acc, n_sips, n_elem, tl):
|
||||||
|
current = acc
|
||||||
|
for _ in range(n_sips - 1):
|
||||||
|
tl.send(dir="global_E", src=current)
|
||||||
|
recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
|
||||||
|
# Row ring (global_E / global_W)
|
||||||
|
current = acc
|
||||||
|
for _ in range(sip_topo_w - 1):
|
||||||
|
tl.send(dir="global_E", src=current)
|
||||||
|
recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv
|
||||||
|
# Col ring (global_S / global_N)
|
||||||
|
current = acc
|
||||||
|
for _ in range(sip_topo_h - 1):
|
||||||
|
tl.send(dir="global_S", src=current)
|
||||||
|
recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
|
||||||
|
sip_row = sip_rank // sip_topo_w
|
||||||
|
sip_col = sip_rank % sip_topo_w
|
||||||
|
|
||||||
|
# Row reduce W → E
|
||||||
|
if sip_col == 0:
|
||||||
|
tl.send(dir="global_E", src=acc)
|
||||||
|
elif sip_col < sip_topo_w - 1:
|
||||||
|
recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
tl.send(dir="global_E", src=acc)
|
||||||
|
else:
|
||||||
|
recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Row broadcast E → W
|
||||||
|
if sip_col == sip_topo_w - 1:
|
||||||
|
tl.send(dir="global_W", src=acc)
|
||||||
|
elif sip_col > 0:
|
||||||
|
acc = tl.recv(dir="global_E", shape=(n_elem,), dtype="f16")
|
||||||
|
tl.send(dir="global_W", src=acc)
|
||||||
|
else:
|
||||||
|
acc = tl.recv(dir="global_E", shape=(n_elem,), dtype="f16")
|
||||||
|
|
||||||
|
# Col reduce N → S
|
||||||
|
if sip_row == 0:
|
||||||
|
tl.send(dir="global_S", src=acc)
|
||||||
|
elif sip_row < sip_topo_h - 1:
|
||||||
|
recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
tl.send(dir="global_S", src=acc)
|
||||||
|
else:
|
||||||
|
recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Col broadcast S → N
|
||||||
|
if sip_row == sip_topo_h - 1:
|
||||||
|
tl.send(dir="global_N", src=acc)
|
||||||
|
elif sip_row > 0:
|
||||||
|
acc = tl.recv(dir="global_S", shape=(n_elem,), dtype="f16")
|
||||||
|
tl.send(dir="global_N", src=acc)
|
||||||
|
else:
|
||||||
|
acc = tl.recv(dir="global_S", shape=(n_elem,), dtype="f16")
|
||||||
|
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def allreduce_intercube_multidevice(
    t_ptr, n_elem, cube_w, cube_h, n_sips, sip_rank,
    sip_topo_kind, sip_topo_w, sip_topo_h, tl,
):
    """Intercube all-reduce (pe0-only) with configurable SIP topology.

    Phases, in order:

    1. Row reduce W -> E over the cube mesh.
    2. Column reduce N -> S on the rightmost column.
    3. Inter-SIP exchange on the root cube (bottom-right), only when
       ``n_sips > 1``. An unrecognised ``sip_topo_kind`` performs no
       exchange.
    4. Column broadcast S -> N on the rightmost column.
    5. Row broadcast E -> W, then store the result to this cube's tile.

    A cube-mesh axis of length 1 is skipped in every phase. Previously a
    1-wide or 1-tall mesh made the lone cube send toward a neighbour that
    does not exist.

    Args:
        t_ptr: VA base of the row-wise-sharded tensor on this SIP.
        n_elem: f16 elements per cube tile.
        cube_w: cube mesh width (columns).
        cube_h: cube mesh height (rows).
        n_sips: number of SIPs.
        sip_rank: this SIP's rank (0-based).
        sip_topo_kind: ``SIP_TOPO_RING`` (0), ``SIP_TOPO_TORUS`` (1) or
            ``SIP_TOPO_MESH`` (2).
        sip_topo_w: SIP mesh width (for 2D topologies, 0 for ring).
        sip_topo_h: SIP mesh height (for 2D topologies, 0 for ring).
        tl: TLContext (auto-injected).
    """

    def _chain_reduce(value, pos, length, fwd, back):
        """Fold the partial sum from ``back`` and pass it on toward ``fwd``."""
        if length <= 1:
            return value  # single position: nothing to reduce with
        if pos != 0:
            value = value + tl.recv(dir=back, shape=(n_elem,), dtype="f16")
        if pos < length - 1:
            tl.send(dir=fwd, src=value)
        return value

    def _chain_broadcast(value, pos, length, fwd, back):
        """Take the total from ``fwd`` (unless last) and relay it to ``back``."""
        if length <= 1:
            return value  # single position: already holds the total
        if pos != length - 1:
            value = tl.recv(dir=fwd, shape=(n_elem,), dtype="f16")
        if pos > 0:
            tl.send(dir=back, src=value)
        return value

    cube_id = tl.program_id(axis=1)
    row, col = divmod(cube_id, cube_w)
    nbytes = n_elem * 2  # f16 = 2 bytes per element

    pe_addr = t_ptr + cube_id * nbytes
    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")

    on_last_col = col == cube_w - 1

    # ── Phase 1: row reduce W → E ──
    acc = _chain_reduce(acc, col, cube_w, "E", "W")

    # ── Phase 2: col reduce N → S on rightmost column ──
    if on_last_col:
        acc = _chain_reduce(acc, row, cube_h, "S", "N")

    # ── Phase 3: inter-SIP exchange on root cube ──
    root_cube = (cube_h - 1) * cube_w + (cube_w - 1)
    if cube_id == root_cube and n_sips > 1:
        if sip_topo_kind == SIP_TOPO_RING:
            acc = _inter_sip_ring(acc, n_sips, n_elem, tl)
        elif sip_topo_kind == SIP_TOPO_TORUS:
            acc = _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl)
        elif sip_topo_kind == SIP_TOPO_MESH:
            acc = _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl)

    # ── Phase 4: col broadcast S → N on rightmost column ──
    if on_last_col:
        acc = _chain_broadcast(acc, row, cube_h, "S", "N")

    # ── Phase 5: row broadcast E → W ──
    acc = _chain_broadcast(acc, col, cube_w, "E", "W")

    tl.store(pe_addr, acc)


# Name under which the kernel is exported from this module.
kernel = allreduce_intercube_multidevice
|
||||||
@@ -0,0 +1,127 @@
|
|||||||
|
"""CCL diagnostics: trace + pointer dump + deadlock (ADR-0023 D14).
|
||||||
|
|
||||||
|
Trace
|
||||||
|
-----
|
||||||
|
Set ``KERNBENCH_CCL_TRACE=1`` (or any truthy value) to enable per-event
|
||||||
|
logging of CCL send/recv to stdout. Off by default.
|
||||||
|
|
||||||
|
Pointer dump
|
||||||
|
------------
|
||||||
|
``pointer_dump(engine)`` returns a multi-line string showing every PE_IPCQ's
|
||||||
|
ring buffer state (my_head, my_tail, peer_head_cache, peer_tail_cache).
|
||||||
|
Useful for diagnosing hangs.
|
||||||
|
|
||||||
|
Deadlock
|
||||||
|
--------
|
||||||
|
``IpcqDeadlock`` is raised by the engine when SimPy's schedule empties
|
||||||
|
while a request is still pending — typical of unmatched send/recv pairs.
|
||||||
|
The exception message includes the pointer dump.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class IpcqDeadlock(RuntimeError):
    """Signal that the simulation stalled with a CCL request outstanding.

    Raised when the simulation cannot make further progress while a CCL
    request is still pending (D14 F3).
    """
|
||||||
|
|
||||||
|
|
||||||
|
# ── Trace toggle ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
_TRACE_ENABLED: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
def reload_trace_setting() -> None:
    """Refresh the module-level trace flag from ``KERNBENCH_CCL_TRACE``.

    The flag is set when the variable, stripped and lower-cased, is one of
    ``1``, ``true``, ``yes`` or ``on``. Any other value, or an unset
    variable, clears it.
    """
    global _TRACE_ENABLED
    raw = os.environ.get("KERNBENCH_CCL_TRACE", "")
    normalized = raw.strip().lower()
    _TRACE_ENABLED = normalized in {"1", "true", "yes", "on"}
|
||||||
|
|
||||||
|
|
||||||
|
def trace_enabled() -> bool:
    """Return the current value of the module-level CCL trace flag."""
    return _TRACE_ENABLED
|
||||||
|
|
||||||
|
|
||||||
|
# Initialise once at import time
|
||||||
|
reload_trace_setting()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Trace event functions ────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def log_send(
    t_ns: float,
    sender: str,
    direction: str,
    nbytes: int,
    sender_seq: int,
) -> None:
    """Print one trace line for a CCL send event when tracing is enabled.

    Does nothing while the module-level trace flag is off. The line is
    flushed immediately so it is visible even if the process later hangs.

    Args:
        t_ns: event timestamp, printed with one decimal place.
        sender: label identifying the sending endpoint.
        direction: direction label of the send.
        nbytes: payload size reported in the trace line.
        sender_seq: sequence number reported in the trace line.
    """
    if _TRACE_ENABLED:
        line = f"[ccl t={t_ns:.1f} send] {sender} dir={direction} nbytes={nbytes} seq={sender_seq}"
        print(line, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def log_recv(
    t_ns: float,
    receiver: str,
    direction: str,
    nbytes: int,
) -> None:
    """Print one trace line for a CCL receive event when tracing is enabled.

    Does nothing while the module-level trace flag is off. The line is
    flushed immediately.

    Args:
        t_ns: event timestamp, printed with one decimal place.
        receiver: label identifying the receiving endpoint.
        direction: direction label of the receive.
        nbytes: payload size reported in the trace line.
    """
    if _TRACE_ENABLED:
        line = f"[ccl t={t_ns:.1f} recv] {receiver} dir={direction} nbytes={nbytes}"
        print(line, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def log_credit_return(
    t_ns: float,
    sender: str,
    direction: str,
    consumer_seq: int,
) -> None:
    """Print one trace line for a credit-return event when tracing is enabled.

    Does nothing while the module-level trace flag is off. The line is
    flushed immediately.

    Args:
        t_ns: event timestamp, printed with one decimal place.
        sender: endpoint label printed in the trace line.
        direction: direction label of the credit return.
        consumer_seq: sequence number reported in the trace line.
    """
    if _TRACE_ENABLED:
        line = f"[ccl t={t_ns:.1f} credit] {sender} dir={direction} seq={consumer_seq}"
        print(line, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Pointer dump ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def pointer_dump(engine: Any) -> str:
    """Render the pointer state of every PE_IPCQ component as text.

    Components are read from ``engine._components`` (an empty mapping is
    assumed when the attribute is absent). Only node ids ending in
    ``.pe_ipcq`` that expose a non-empty ``queue_pairs`` mapping are
    listed. Nodes and directions are emitted in sorted order.

    Args:
        engine: object holding the ``_components`` mapping.

    Returns:
        One header line per listed node followed by one line per direction,
        joined with newlines; an empty string when nothing qualifies.
    """
    registry = getattr(engine, "_components", {})
    out: list[str] = []
    for node_id in sorted(registry):
        if not node_id.endswith(".pe_ipcq"):
            continue
        pairs = getattr(registry[node_id], "queue_pairs", {})
        if not pairs:
            continue
        out.append(node_id)
        for direction in sorted(pairs):
            state = pairs[direction]
            peer = state["peer"]
            peer_label = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}"
            out.append(
                f" {direction}: peer={peer_label} "
                f"my_head={state['my_head']} my_tail={state['my_tail']} "
                f"peer_head_cache={state['peer_head_cache']} "
                f"peer_tail_cache={state['peer_tail_cache']}"
            )
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def print_pointer_dump(engine: Any) -> None:
    """Write ``pointer_dump(engine)`` to stdout and flush immediately."""
    text = pointer_dump(engine)
    print(text, flush=True)
|
||||||
@@ -0,0 +1,118 @@
|
|||||||
|
"""Helpers for CCL algorithm authors (ADR-0023 D15).
|
||||||
|
|
||||||
|
These are pure utility functions usable from any kernel module:
|
||||||
|
|
||||||
|
from kernbench.ccl.helpers import chunked, ring_step, tree_step
|
||||||
|
|
||||||
|
They keep algorithm code short and free of off-by-one bugs.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
_DTYPE_BYTES = {
|
||||||
|
"f16": 2, "fp16": 2, "float16": 2, "bf16": 2,
|
||||||
|
"f32": 4, "fp32": 4, "float32": 4,
|
||||||
|
"i8": 1, "int8": 1,
|
||||||
|
"i16": 2, "int16": 2,
|
||||||
|
"i32": 4, "int32": 4,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _itemsize(dtype: str) -> int:
    """Return the size in bytes of one element of ``dtype``.

    Raises:
        ValueError: if ``dtype`` is not a key of ``_DTYPE_BYTES``.
    """
    size = _DTYPE_BYTES.get(dtype)
    if size is None:
        raise ValueError(f"Unsupported dtype: {dtype}")
    return size
|
||||||
|
|
||||||
|
|
||||||
|
# ── chunked ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Chunk:
    """Immutable descriptor of one slice of a buffer used by collectives."""

    # Address at which this chunk starts.
    addr: int
    # Number of elements in this chunk.
    n_elem: int
    # Size of this chunk in bytes.
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
def chunked(
    base_addr: int,
    n_chunks: int,
    n_elem: int,
    dtype: str = "f16",
) -> list[Chunk]:
    """Slice a 1D buffer into ``n_chunks`` equal Chunks.

    Args:
        base_addr: starting address of the buffer.
        n_chunks: number of equal chunks to produce (must be positive).
        n_elem: total number of elements (must be divisible by n_chunks).
        dtype: element type for byte-size calculation.

    Returns:
        List of ``Chunk`` objects whose addresses are consecutive: chunk
        ``i`` starts at ``base_addr + i * nbytes``.

    Raises:
        ValueError: if ``n_chunks`` is not positive, if ``n_elem`` is not
            divisible by ``n_chunks``, or if ``dtype`` is unsupported.
    """
    # Reject zero/negative counts up front: zero would otherwise surface as
    # a ZeroDivisionError and a negative count would silently yield [].
    if n_chunks <= 0:
        raise ValueError(
            f"chunked: n_chunks must be positive, got {n_chunks}"
        )
    if n_elem % n_chunks != 0:
        raise ValueError(
            f"chunked: n_elem ({n_elem}) not divisible by n_chunks ({n_chunks})"
        )
    per_chunk_elem = n_elem // n_chunks
    per_chunk_bytes = per_chunk_elem * _itemsize(dtype)
    return [
        Chunk(
            addr=base_addr + i * per_chunk_bytes,
            n_elem=per_chunk_elem,
            nbytes=per_chunk_bytes,
        )
        for i in range(n_chunks)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# ── ring_step ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def ring_step(rank: int, step: int, world_size: int) -> tuple[int, int]:
    """Return ``(send_chunk_idx, recv_chunk_idx)`` for one ring step.

    Reduce-scatter / all-gather ring schedule: at step ``s`` rank ``r``
    sends chunk ``(r - s)`` and receives chunk ``(r - s - 1)``, both taken
    modulo ``world_size``.

    Example use in a ring all-reduce kernel:

        for step in range(world_size - 1):
            send_idx, recv_idx = ring_step(rank, step, world_size)
            tl.send(dir="E", src=chunks[send_idx])
            chunks[recv_idx] += tl.recv(dir="W").data
    """
    offset = rank - step
    return offset % world_size, (offset - 1) % world_size
|
||||||
|
|
||||||
|
|
||||||
|
# ── tree_step ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def tree_step(rank: int, world_size: int) -> dict[str, Any]:
    """Return the parent and children of ``rank`` in a binary tree rooted at 0.

    The children of rank ``r`` are ``2r + 1`` and ``2r + 2`` when they are
    below ``world_size``; the parent of ``r > 0`` is ``(r - 1) // 2``.

    Returns:
        ``{"parent": int|None, "children": list[int]}``
    """
    candidates = (2 * rank + 1, 2 * rank + 2)
    children: list[int] = [child for child in candidates if child < world_size]
    parent = None if rank <= 0 else (rank - 1) // 2
    return {"parent": parent, "children": children}
|
||||||
@@ -0,0 +1,285 @@
|
|||||||
|
"""IPCQ install plan for AhbmCCLBackend (ADR-0023 D10/D11/D12).
|
||||||
|
|
||||||
|
Given a ccl.yaml config, the topology, and the engine, this module:
|
||||||
|
|
||||||
|
1. Loads ccl.yaml and resolves the chosen algorithm.
|
||||||
|
2. Maps each rank to a (sip, cube, pe) PE address using a linear scheme.
|
||||||
|
3. Allocates per-rank IPCQ ring buffer base addresses (synthetic but
|
||||||
|
unique-per-PE; see notes below).
|
||||||
|
4. Builds neighbor tables via the algorithm's ``topology`` field plus the
|
||||||
|
optional ``neighbors()`` override hook from the algorithm module.
|
||||||
|
5. Wires bidirectional credit-return SimPy Stores between every (PE, peer)
|
||||||
|
pair.
|
||||||
|
6. Installs each PE_IPCQ component's neighbor table directly via its
|
||||||
|
``_install_neighbors`` sideband call (equivalent to fan-out IpcqInitMsg
|
||||||
|
without going through fabric).
|
||||||
|
|
||||||
|
Address scheme
|
||||||
|
--------------
|
||||||
|
For the first implementation we use a synthetic address scheme that
|
||||||
|
guarantees uniqueness per (sip, cube, pe, direction) without going
|
||||||
|
through ``PEMemAllocator``. The address is encoded as:
|
||||||
|
|
||||||
|
base = IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
|
||||||
|
rx_base[direction_idx] = base + direction_idx * (n_slots * slot_size)
|
||||||
|
|
||||||
|
The ``buffer_kind`` (tcm/hbm/sram) selects the *MemoryStore space* into
|
||||||
|
which data is written. Within a space, addresses are unique per PE so
|
||||||
|
the existing MemoryStore (``{space: {addr: ndarray}}``) handles them
|
||||||
|
naturally.
|
||||||
|
|
||||||
|
This bypasses the topology's address resolver / PhysAddr encoding and
|
||||||
|
treats IPCQ buffers as a separate, parallel address namespace. Real PA
|
||||||
|
encoding can be plugged in later without changing the rest of the design.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from kernbench.ccl.topologies import resolve_topology
|
||||||
|
from kernbench.common.ipcq_types import (
|
||||||
|
IpcqEndpoint,
|
||||||
|
IpcqInitEntry,
|
||||||
|
)
|
||||||
|
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||||
|
|
||||||
|
|
||||||
|
# IPCQ synthetic address space top bit
|
||||||
|
_IPCQ_BASE = 1 << 60
|
||||||
|
|
||||||
|
|
||||||
|
def _ipcq_base_for_pe(sip: int, cube: int, pe: int) -> int:
    """Return the synthetic IPCQ base address for one PE.

    The address is ``_IPCQ_BASE`` OR-ed with ``sip`` shifted left by 40,
    ``cube`` by 32 and ``pe`` by 24 bits.
    """
    sip_bits = sip << 40
    cube_bits = cube << 32
    pe_bits = pe << 24
    return _IPCQ_BASE | sip_bits | cube_bits | pe_bits
|
||||||
|
|
||||||
|
|
||||||
|
# ── ccl.yaml loading ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def load_ccl_config(path: str | Path | None = None) -> dict:
    """Load and validate ccl.yaml.

    When ``path`` is None, ``ccl.yaml`` is looked up first in the current
    working directory and then three directory levels above this module.

    Args:
        path: explicit location of the config file, or None to search.

    Returns:
        The parsed config mapping, guaranteed to contain the ``defaults``
        and ``algorithms`` keys.

    Raises:
        FileNotFoundError: if no path is given and no candidate exists.
        ValueError: if the file does not hold a top-level mapping or lacks
            the ``defaults`` or ``algorithms`` section.
    """
    if path is None:
        candidates = (
            Path.cwd() / "ccl.yaml",
            Path(__file__).resolve().parents[3] / "ccl.yaml",
        )
        path = next((p for p in candidates if p.exists()), None)
    if path is None:
        raise FileNotFoundError(
            "ccl.yaml not found. Place it at project root or cwd."
        )
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # An empty file parses to None and a scalar/list document is not a
    # mapping; report both as a config error instead of a TypeError from
    # the membership tests below.
    if not isinstance(cfg, dict):
        raise ValueError(
            f"ccl.yaml must contain a top-level mapping, got {type(cfg).__name__}"
        )
    if "defaults" not in cfg:
        raise ValueError("ccl.yaml missing 'defaults' section")
    if "algorithms" not in cfg:
        raise ValueError("ccl.yaml missing 'algorithms' section")
    return cfg
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_algorithm_config(cfg: dict, name: str | None = None) -> dict:
    """Merge defaults with the chosen algorithm's overrides.

    Args:
        cfg: parsed ccl.yaml mapping with ``defaults`` and ``algorithms``.
        name: algorithm to select; falls back to ``defaults.algorithm``.

    Returns:
        A new flat dict: the defaults, updated with the algorithm's own
        entries, plus ``"algorithm"`` set to the selected name. ``cfg`` is
        not modified.

    Raises:
        ValueError: if no algorithm name can be determined or the name is
            not listed under ``algorithms``.
    """
    # A YAML section with no body parses to None; treat it as empty rather
    # than failing with a TypeError in dict()/update().
    defaults = dict(cfg.get("defaults") or {})
    algo_name = name or defaults.get("algorithm")
    if algo_name is None:
        raise ValueError("ccl.yaml: defaults.algorithm not set")
    algos = cfg.get("algorithms") or {}
    if algo_name not in algos:
        raise ValueError(
            f"ccl.yaml: algorithm '{algo_name}' not in algorithms section"
        )
    merged = dict(defaults)
    merged.update(algos[algo_name] or {})
    merged["algorithm"] = algo_name
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
# ── rank → PE mapping ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def linear_rank_to_pe(rank: int, spec: dict) -> tuple[int, int, int]:
    """Map a rank to ``(sip, cube, pe)`` using linear topology order.

    Ranks are laid out PE-major within a cube, then cube-major within a
    SIP, using the counts found in ``spec``.

    Args:
        rank: zero-based rank to map.
        spec: topology spec providing ``system.sips.count``,
            ``sip.cube_mesh.w/h`` and ``cube.pe_layout``.

    Returns:
        The ``(sip, cube, pe)`` indices for ``rank``.

    Raises:
        ValueError: if ``rank`` is negative or not below the total PE count.
    """
    sips = spec["system"]["sips"]["count"]
    cube_mesh = spec["sip"]["cube_mesh"]
    cubes_per_sip = cube_mesh["w"] * cube_mesh["h"]
    pe_layout = spec["cube"]["pe_layout"]
    pes_per_cube = pe_layout["pe_per_corner"] * len(pe_layout["corners"])

    pes_per_sip = cubes_per_sip * pes_per_cube
    total = sips * pes_per_sip
    # Floor division would turn a negative rank into negative coordinates
    # instead of an error, so reject it explicitly.
    if rank < 0:
        raise ValueError(f"rank {rank} must be non-negative")
    if rank >= total:
        raise ValueError(
            f"rank {rank} exceeds total PE count {total}"
        )
    sip, rem = divmod(rank, pes_per_sip)
    cube, pe = divmod(rem, pes_per_cube)
    return sip, cube, pe
|
||||||
|
|
||||||
|
|
||||||
|
# ── Install plan ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def install_ipcq(
    engine: Any,
    spec: dict,
    cfg: dict,
    algo_module: Any | None = None,
    rank_to_pe: list[tuple[int, int, int]] | None = None,
) -> dict[str, Any]:
    """Build neighbor tables and install them in every participating PE_IPCQ.

    Args:
        engine: GraphEngine exposing ``_components`` (node id -> component)
            and ``_env`` (used to create any missing credit inbox).
        spec: topology spec dict.
        cfg: merged algorithm config (from ``resolve_algorithm_config``).
            ``buffer_kind``, ``n_slots``, ``slot_size``, ``backpressure``
            and ``topology`` are required; ``world_size`` and
            ``ipcq_credit_size_bytes`` (default 16) are optional.
        algo_module: optional algorithm Python module (for neighbors override).
        rank_to_pe: optional explicit rank -> (sip, cube, pe) mapping. If
            None, the default linear mapping is used.

    Returns:
        A diagnostics dict with the install plan: ``world_size``,
        ``rank_to_pe``, ``buffer_kind`` and ``neighbor_table``.

    Raises:
        ValueError: if ``rank_to_pe`` is given and its length differs from
            the world size.
        KeyError: if a required ``cfg`` key is missing or a rank's PE_IPCQ
            component is not registered in the engine.
    """
    if "world_size" in cfg:
        world_size = int(cfg["world_size"])
    else:
        # Topology-derived fallback (mirrors AhbmCCLBackend / RuntimeContext).
        sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
        cm = spec.get("sip", {}).get("cube_mesh", {})
        cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1))
        pl = spec.get("cube", {}).get("pe_layout", {})
        corners = pl.get("corners", [])
        pe_per_corner = int(pl.get("pe_per_corner", 1))
        pes_per_cube = pe_per_corner * max(len(corners), 1)
        world_size = sips * cubes_per_sip * pes_per_cube
    buffer_kind = cfg["buffer_kind"]
    n_slots = int(cfg["n_slots"])
    slot_size = int(cfg["slot_size"])
    backpressure = cfg["backpressure"]
    credit_size_bytes = int(cfg.get("ipcq_credit_size_bytes", 16))

    # Step 1: rank → (sip, cube, pe)
    if rank_to_pe is not None:
        if len(rank_to_pe) != world_size:
            raise ValueError(
                f"rank_to_pe has {len(rank_to_pe)} entries but world_size={world_size}"
            )
        rank_pe: list[tuple[int, int, int]] = list(rank_to_pe)
    else:
        rank_pe = [linear_rank_to_pe(r, spec) for r in range(world_size)]

    # Step 2: resolve topology fn (with optional override)
    topo_fn = resolve_topology(cfg["topology"], algo_module=algo_module)

    # Build per-rank neighbor map
    neighbor_table: dict[int, dict[str, int]] = {
        r: topo_fn(r, world_size) for r in range(world_size)
    }

    # Step 3: pull the live engine reference for each PE_IPCQ
    components = engine._components

    def pe_ipcq_id(s: int, c: int, p: int) -> str:
        return f"sip{s}.cube{c}.pe{p}.pe_ipcq"

    # Step 4: per-PE rx_base address and per-PE credit_inbox.
    # Every direction used by any rank gets a fixed index so each
    # (PE, direction) pair owns its own n_slots * slot_size byte window.
    direction_keys = sorted({d for nt in neighbor_table.values() for d in nt})
    direction_idx = {d: i for i, d in enumerate(direction_keys)}
    bytes_per_direction = n_slots * slot_size

    def rx_base(s: int, c: int, p: int, d: str) -> int:
        return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction

    # Wire bidirectional credit stores: backend creates the SimPy Stores
    # by reading each rank's PE_IPCQ.credit_inbox property.
    rank_to_credit_inbox: dict[int, simpy.Store] = {}
    for r, (s, c, p) in enumerate(rank_pe):
        comp = components[pe_ipcq_id(s, c, p)]
        # Trigger lazy creation of credit_inbox if not yet started.
        # PE_IPCQ.start() creates it; we ensure it exists.
        if comp._credit_inbox is None:
            comp._credit_inbox = simpy.Store(engine._env)
        rank_to_credit_inbox[r] = comp.credit_inbox

    # Step 5: build IpcqInitMsg per rank and call _install_neighbors directly
    plan: dict[str, Any] = {
        "world_size": world_size,
        "rank_to_pe": rank_pe,
        "buffer_kind": buffer_kind,
        "neighbor_table": neighbor_table,
    }

    _OPPOSITE_DIR = {
        "E": "W", "W": "E", "N": "S", "S": "N",
        "global_E": "global_W", "global_W": "global_E",
        "global_N": "global_S", "global_S": "global_N",
    }

    def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
        """Find peer's direction that reciprocates my_dir→peer_rank.

        Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
        pointing back to us (ADR-0025 D1). This matters in 2-rank
        bidirectional rings where both E and W on one side point to the
        same peer — without the preference, dict-order first-match would
        route data into the wrong rx slot. Falls back to any direction
        pointing back for topologies without an opposite convention
        (e.g. tree_binary's parent/child).
        """
        nt = neighbor_table[peer_rank]
        opp = _OPPOSITE_DIR.get(my_dir)
        if opp is not None and nt.get(opp) == my_rank:
            return opp
        for d, target in nt.items():
            if target == my_rank:
                return d
        return None

    for r, (s, c, p) in enumerate(rank_pe):
        my_pe_ipcq = components[pe_ipcq_id(s, c, p)]
        nbrs = neighbor_table[r]
        entries: list[IpcqInitEntry] = []
        for d, peer_rank in nbrs.items():
            if peer_rank is None:
                continue
            peer_s, peer_c, peer_p = rank_pe[peer_rank]
            peer_dir = reverse_direction(r, peer_rank, d)
            if peer_dir is None:
                # Peer doesn't have a reverse entry — skip (asymmetric topology)
                continue
            peer_endpoint = IpcqEndpoint(
                sip=peer_s, cube=peer_c, pe=peer_p,
                buffer_kind=buffer_kind,
                rx_base_pa=rx_base(peer_s, peer_c, peer_p, peer_dir),
                rx_base_va=0,
                n_slots=n_slots, slot_size=slot_size,
            )
            entries.append(IpcqInitEntry(
                direction=d,
                peer=peer_endpoint,
                my_rx_base_pa=rx_base(s, c, p, d),
                my_rx_base_va=0,
                n_slots=n_slots, slot_size=slot_size,
                peer_credit_store=rank_to_credit_inbox[peer_rank],
            ))
        msg = IpcqInitMsg(
            correlation_id="ccl_init", request_id=f"init_r{r}",
            target_sips=(s,), target_cubes=(c,), target_pe=p,
            entries=tuple(entries),
            backpressure_mode=backpressure,
            buffer_kind=buffer_kind,
            credit_size_bytes=credit_size_bytes,
        )
        # Sideband install: hand the table to the component directly
        # instead of routing the message through the fabric.
        my_pe_ipcq._install_neighbors(msg)

    return plan
|
||||||
@@ -0,0 +1,104 @@
|
|||||||
|
"""SFR configuration for intercube + inter-SIP IPCQ wiring.
|
||||||
|
|
||||||
|
Provides ``configure_sfr_intercube_multisip`` which programs PE_IPCQ
|
||||||
|
neighbor tables for:
|
||||||
|
|
||||||
|
1. Intercube within each SIP — pe0 of every cube connects to pe0 of
|
||||||
|
its N/S/E/W mesh neighbors (no wrap-around).
|
||||||
|
2. Inter-SIP on ALL cubes — pe0 of cube_c on sip_A connects to pe0 of
|
||||||
|
cube_c on each peer SIP, using ``global_E``/``global_W`` (ring) or
|
||||||
|
``global_N``/``global_S``/``global_E``/``global_W`` (mesh/torus)
|
||||||
|
direction labels. Wiring all cubes allows the kernel to
|
||||||
|
dynamically elect the root cube at runtime.
|
||||||
|
|
||||||
|
SIP-level topology is read from ``topology.yaml`` →
|
||||||
|
``system.sips.topology`` (e.g. ``ring_1d``, ``mesh_2d``).
|
||||||
|
Intercube mesh dimensions come from ``sip.cube_mesh.w/h``.
|
||||||
|
|
||||||
|
Internally delegates to ``install_ipcq`` with a computed ``rank_to_pe``
|
||||||
|
(pe0-only) and a closure-captured ``neighbors()`` function.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import types
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.ccl.install import install_ipcq
|
||||||
|
from kernbench.ccl.topologies import _BUILTIN as _TOPO_BUILTINS
|
||||||
|
|
||||||
|
|
||||||
|
def configure_sfr_intercube_multisip(
    engine: Any,
    spec: dict,
    cfg: dict,
) -> dict[str, Any]:
    """Wire IPCQ for intercube (pe0, mesh) + inter-SIP (pe0, all cubes).

    One participant is created per (sip, cube) pair, always on pe0. Within
    a SIP each cube is linked to its N/S/E/W mesh neighbors without
    wrap-around. When there is more than one SIP, every cube is also
    linked to the same cube index on each peer SIP, under ``global_``
    prefixed direction labels.

    Args:
        engine: GraphEngine with ``_components``.
        spec: topology spec dict (from topology.yaml).
        cfg: merged algorithm config (from ``resolve_algorithm_config``);
            it is copied, not modified.

    Returns:
        The install plan dict from ``install_ipcq``.

    Raises:
        ValueError: if ``system.sips.topology`` is not a builtin topology.
    """
    mesh = spec["sip"]["cube_mesh"]
    mesh_w = int(mesh["w"])
    mesh_h = int(mesh["h"])
    n_cubes = mesh_w * mesh_h

    sips_section = spec.get("system", {}).get("sips", {})
    n_sips = int(sips_section.get("count", 1))
    sip_topology = str(sips_section.get("topology", "ring_1d"))

    if sip_topology not in _TOPO_BUILTINS:
        raise ValueError(
            f"Unknown sip topology '{sip_topology}'. "
            f"Available: {list(_TOPO_BUILTINS)}"
        )
    sip_topo_fn = _TOPO_BUILTINS[sip_topology]

    def _neighbors(pe_idx: int, ws: int, _base: dict) -> dict[str, int]:
        # Participant index is sip-major: pe_idx = sip * n_cubes + cube.
        sip, cube = divmod(pe_idx, n_cubes)
        row, col = divmod(cube, mesh_w)
        sip_offset = sip * n_cubes

        links: dict[str, int] = {}

        # Intercube within SIP (mesh, no wrap-around)
        if col < mesh_w - 1:
            links["E"] = sip_offset + cube + 1
        if col > 0:
            links["W"] = sip_offset + cube - 1
        if row < mesh_h - 1:
            links["S"] = sip_offset + cube + mesh_w
        if row > 0:
            links["N"] = sip_offset + cube - mesh_w

        # Inter-SIP on ALL cubes: same cube index on each peer SIP
        if n_sips > 1:
            for d, peer_sip in sip_topo_fn(sip, n_sips).items():
                links[f"global_{d}"] = peer_sip * n_cubes + cube

        return links

    participants: list[tuple[int, int, int]] = [
        (sip, cube, 0)
        for sip in range(n_sips)
        for cube in range(n_cubes)
    ]

    # The neighbor map comes entirely from _neighbors, so the builtin
    # topology is forced to "none" and only the override hook contributes.
    install_cfg = {**cfg, "world_size": n_sips * n_cubes, "topology": "none"}
    hook_module = types.SimpleNamespace(neighbors=_neighbors)

    return install_ipcq(
        engine, spec, install_cfg,
        algo_module=hook_module,
        rank_to_pe=participants,
    )
|
||||||
@@ -0,0 +1,163 @@
|
|||||||
|
"""Builtin neighbor topology generators for CCL backend (ADR-0023 D11).
|
||||||
|
|
||||||
|
Each generator takes ``(rank, world_size)`` and returns a
|
||||||
|
``dict[direction, peer_rank]`` for that rank. ``direction`` is one of
|
||||||
|
``"N" | "S" | "E" | "W"`` for ring/mesh, or
|
||||||
|
``"parent" | "child_left" | "child_right"`` for tree topologies.
|
||||||
|
|
||||||
|
Algorithm modules may override the generated map by defining a
|
||||||
|
``neighbors(rank, world_size, neighbor_map) -> dict | None`` function in
|
||||||
|
the same module (see D11 / D15). ``resolve_topology`` wires these together.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
NeighborMap = dict[str, int]
|
||||||
|
TopologyFn = Callable[[int, int], NeighborMap]
|
||||||
|
|
||||||
|
|
||||||
|
# ── Builtin generators ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def ring_1d(rank: int, world_size: int) -> NeighborMap:
    """1D bidirectional ring: ``E`` is the next rank, ``W`` the previous.

    Both neighbors wrap around modulo ``world_size``.
    """
    east = (rank + 1) % world_size
    west = (rank - 1) % world_size
    return {"E": east, "W": west}
|
||||||
|
|
||||||
|
|
||||||
|
def ring_1d_unidir(rank: int, world_size: int) -> NeighborMap:
    """1D unidirectional ring: only ``E``, the next rank modulo ``world_size``."""
    successor = (rank + 1) % world_size
    return {"E": successor}
|
||||||
|
|
||||||
|
|
||||||
|
def mesh_2d(rank: int, world_size: int) -> NeighborMap:
    """Square 2D mesh (N/S/E/W) with wrap-around on all four edges.

    Layout: ``rank = row * side + col`` with ``side = sqrt(world_size)``.

    Raises:
        ValueError: if ``world_size`` is not a perfect square.
    """
    side = int(round(world_size ** 0.5))
    if side * side != world_size:
        raise ValueError(
            f"mesh_2d requires square world_size, got {world_size}"
        )
    row, col = divmod(rank, side)
    # Wrapped row/column indices of the four neighbors.
    up = (row - 1) % side
    down = (row + 1) % side
    left = (col - 1) % side
    right = (col + 1) % side
    return {
        "N": up * side + col,
        "S": down * side + col,
        "W": row * side + left,
        "E": row * side + right,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def tree_binary(rank: int, world_size: int) -> NeighborMap:
    """Binary tree rooted at rank 0.

    Rank ``r`` has children ``2r + 1`` and ``2r + 2`` when they are below
    ``world_size``, and for ``r > 0`` the parent ``(r - 1) // 2``.

    Returns:
        A map holding only the links that exist, under the keys
        ``"parent"``, ``"child_left"`` and ``"child_right"``.
    """
    links: NeighborMap = {}
    if rank > 0:
        links["parent"] = (rank - 1) // 2
    slots = (("child_left", 2 * rank + 1), ("child_right", 2 * rank + 2))
    for key, child in slots:
        if child < world_size:
            links[key] = child
    return links
|
||||||
|
|
||||||
|
|
||||||
|
def torus_2d(rank: int, world_size: int) -> NeighborMap:
    """Square 2D torus (N/S/E/W) with wrap-around on all edges.

    Delegates to ``mesh_2d``, which already wraps; the separate name makes
    the wrap-around explicit when used as a SIP-level topology.
    """
    neighbors = mesh_2d(rank, world_size)
    return neighbors
|
||||||
|
|
||||||
|
|
||||||
|
def mesh_2d_no_wrap(rank: int, world_size: int) -> NeighborMap:
    """Square 2D mesh (N/S/E/W) WITHOUT wrap-around.

    Layout: ``rank = row * side + col`` with ``side = sqrt(world_size)``.
    Ranks on an edge simply omit the direction that would leave the grid.

    Raises:
        ValueError: if ``world_size`` is not a perfect square.
    """
    side = int(round(world_size ** 0.5))
    if side * side != world_size:
        raise ValueError(
            f"mesh_2d_no_wrap requires square world_size, got {world_size}"
        )
    row, col = divmod(rank, side)
    last = side - 1
    # (direction, link exists?, peer rank); a vertical step changes the
    # rank by ``side`` and a horizontal step by 1.
    candidates = (
        ("N", row > 0, rank - side),
        ("S", row < last, rank + side),
        ("W", col > 0, rank - 1),
        ("E", col < last, rank + 1),
    )
    return {direction: peer for direction, exists, peer in candidates if exists}
|
||||||
|
|
||||||
|
|
||||||
|
def none(rank: int, world_size: int) -> NeighborMap:
    """Return an empty map; the algorithm's neighbors() hook supplies all links."""
    return dict()
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of builtin topology generators, keyed by the name used to
# select them.
_BUILTIN: dict[str, TopologyFn] = {
    # rings
    "ring_1d": ring_1d,
    "ring_1d_unidir": ring_1d_unidir,
    # 2D grids
    "mesh_2d": mesh_2d,
    "torus_2d": torus_2d,
    "mesh_2d_no_wrap": mesh_2d_no_wrap,
    # trees
    "tree_binary": tree_binary,
    # no builtin links
    "none": none,
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Resolution ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_topology(
    name: str, algo_module: Any | None = None,
) -> TopologyFn:
    """Return a callable ``(rank, world_size) -> NeighborMap``.

    Args:
        name: builtin topology name; must be a key of ``_BUILTIN``.
        algo_module: optional algorithm module. If it exposes a callable
            ``neighbors(rank, world_size, neighbor_map)``, that hook runs
            after the builtin generator. A ``None`` result keeps the
            builtin map; any other result replaces it.

    Returns:
        The builtin generator itself when there is no usable hook,
        otherwise a wrapper that applies the hook to the builtin's output.

    Raises:
        ValueError: if ``name`` is not a known builtin.
    """
    if name not in _BUILTIN:
        raise ValueError(
            f"Unknown topology '{name}'. "
            f"Available builtins: {list(_BUILTIN)}"
        )
    builtin_fn = _BUILTIN[name]

    hook = None
    if algo_module:
        hook = getattr(algo_module, "neighbors", None)
    if not callable(hook):
        return builtin_fn

    def _wrapped(rank: int, world_size: int) -> NeighborMap:
        base = builtin_fn(rank, world_size)
        overridden = hook(rank, world_size, base)
        return base if overridden is None else overridden

    return _wrapped
|
||||||
@@ -21,6 +21,10 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
runp.add_argument(
|
runp.add_argument(
|
||||||
"--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
|
"--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
|
||||||
)
|
)
|
||||||
|
runp.add_argument(
|
||||||
|
"--verify-data", action="store_true", default=False,
|
||||||
|
help="Enable Phase 2 data verification (ADR-0020)",
|
||||||
|
)
|
||||||
runp.set_defaults(_handler=cmd_run)
|
runp.set_defaults(_handler=cmd_run)
|
||||||
|
|
||||||
probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
|
probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
|
||||||
@@ -36,9 +40,11 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
return p
|
return p
|
||||||
|
|
||||||
|
|
||||||
def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
|
def engine_factory(
|
||||||
|
topology: object, device: DeviceSelector, *, enable_data: bool = False,
|
||||||
|
) -> SimEngine:
|
||||||
topo_obj = getattr(topology, "topology_obj", topology)
|
topo_obj = getattr(topology, "topology_obj", topology)
|
||||||
return GraphEngine(topo_obj)
|
return GraphEngine(topo_obj, enable_data=enable_data)
|
||||||
|
|
||||||
|
|
||||||
def cmd_web(args) -> int:
|
def cmd_web(args) -> int:
|
||||||
@@ -53,8 +59,12 @@ def cmd_run(args) -> int:
|
|||||||
topo = resolve_topology(args.topology)
|
topo = resolve_topology(args.topology)
|
||||||
bench = resolve_bench(args.bench)
|
bench = resolve_bench(args.bench)
|
||||||
device = resolve_device(args.device)
|
device = resolve_device(args.device)
|
||||||
|
verify_data = getattr(args, "verify_data", False)
|
||||||
|
|
||||||
result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=engine_factory)
|
def _factory(topology, device):
|
||||||
|
return engine_factory(topology, device, enable_data=verify_data)
|
||||||
|
|
||||||
|
result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=_factory)
|
||||||
|
|
||||||
topo_obj = getattr(topo, "topology_obj", topo)
|
topo_obj = getattr(topo, "topology_obj", topo)
|
||||||
spec = getattr(topo_obj, "spec", None)
|
spec = getattr(topo_obj, "spec", None)
|
||||||
@@ -62,6 +72,18 @@ def cmd_run(args) -> int:
|
|||||||
print(format_report(result.traces, title=args.bench, spec=spec))
|
print(format_report(result.traces, title=args.bench, spec=spec))
|
||||||
print(result.summary_text())
|
print(result.summary_text())
|
||||||
|
|
||||||
|
# Phase 2 diagnostic summary (ADR-0020). The actual Phase 2 replay
|
||||||
|
# already runs inside engine.wait() → _flush_data_phase(). We only
|
||||||
|
# print the summary here; no redundant re-execution.
|
||||||
|
if verify_data and result.engine is not None:
|
||||||
|
op_log = result.engine.op_log
|
||||||
|
if op_log:
|
||||||
|
n_gemm = sum(1 for r in op_log if r.op_kind == "gemm")
|
||||||
|
n_math = sum(1 for r in op_log if r.op_kind == "math")
|
||||||
|
print(f"[data] Phase 2 complete: {len(op_log)} ops ({n_gemm} gemm, {n_math} math)")
|
||||||
|
else:
|
||||||
|
print("[data] No op_log recorded — skipping Phase 2")
|
||||||
|
|
||||||
return 0 if result.completion.ok else 1
|
return 0 if result.completion.ok else 1
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,249 @@
|
|||||||
|
"""IPCQ schemas and exceptions (ADR-0023 D2.5, D12, D14 F1).
|
||||||
|
|
||||||
|
This module contains the data structures and exceptions used by the
|
||||||
|
PE-level IPCQ collective communication infrastructure. The host-facing
|
||||||
|
sideband fan-out message ``IpcqInitMsg`` lives in
|
||||||
|
``kernbench.runtime_api.kernel`` (alongside other fabric messages),
|
||||||
|
while all internal token / metadata / command schemas are kept here.
|
||||||
|
|
||||||
|
Layering:
|
||||||
|
PE_CPU --IpcqRequest(IpcqSendCmd|IpcqRecvCmd)--> PE_IPCQ
|
||||||
|
PE_IPCQ --IpcqDmaToken--> PE_DMA (vc_comm)
|
||||||
|
PE_DMA --IpcqMetaArrival--> PE_IPCQ (atomic, D9)
|
||||||
|
PE_IPCQ --IpcqCreditMetadata--> peer PE_IPCQ (fast path, D9)
|
||||||
|
|
||||||
|
See ADR-0023 for the full design.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import TYPE_CHECKING, Any, Union
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
|
||||||
|
# ── D14 F1: invalid direction exception ──────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class IpcqInvalidDirection(ValueError):
    """Raised when a kernel calls ``tl.send``/``tl.recv`` with a direction
    that has no neighbor installed for this PE.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    also catch an invalid direction.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# ── D2.5: IpcqEndpoint ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IpcqEndpoint:
    """Everything the sending side needs to compute the peer's rx_buffer
    address (D2.5).

    Sender PE_IPCQ uses this to compute the destination PA for its DMA
    write into the peer's rx ring buffer slot:

        slot_idx = sender.my_head % peer.n_slots
        dst_pa   = peer.rx_base_pa + slot_idx * peer.slot_size

    The instance is immutable (``frozen=True``).
    """

    sip: int  # destination SIP
    cube: int  # destination cube
    pe: int  # destination PE (cube-local index)
    buffer_kind: str  # "tcm" | "hbm" | "sram"
    rx_base_pa: int  # peer rx_buffer base PA (PhysAddr.encode())
    rx_base_va: int  # peer rx_buffer base VA (optional, MMU)
    n_slots: int  # peer ring depth (wrap-around modulo)
    slot_size: int  # peer slot size (offset multiplier)
|
||||||
|
|
||||||
|
|
||||||
|
# ── D12: IpcqInitEntry (used by IpcqInitMsg in kernel.py) ────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IpcqInitEntry:
    """One direction's neighbor entry that the backend installs into a
    PE_IPCQ via ``IpcqInitMsg``
    (``kernbench.runtime_api.kernel.IpcqInitMsg``, D12).

    The instance is immutable (``frozen=True``).
    """

    direction: str  # "N" | "S" | "E" | "W"
    peer: IpcqEndpoint  # see D2.5
    my_rx_base_pa: int  # this PE's own rx_buffer base
    my_rx_base_va: int  # this PE's own rx_buffer base VA (optional)
    n_slots: int  # this PE's ring depth
    slot_size: int  # this PE's slot size
    # Credit fast path channel (D9).
    # Contract: must be a simpy.Store instance dedicated to receiving
    # IpcqCreditMetadata objects only. Backend wires it once at init time
    # and the receiving PE_IPCQ owns its consumer side; the sender (peer's
    # PE_IPCQ) puts IpcqCreditMetadata directly into this store via
    # _delayed_credit_send. Do not put any other object type.
    peer_credit_store: "simpy.Store"
|
||||||
|
|
||||||
|
|
||||||
|
# ── D12: IpcqSendCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IpcqSendCmd:
    """``tl.send`` command issued by the kernel to PE_IPCQ.

    The instance is immutable (``frozen=True``).
    """

    direction: str  # "N" | "S" | "E" | "W"
    src_addr: int  # source data address (TCM/HBM/SRAM)
    src_space: str  # "tcm" | "hbm" | "sram"
    nbytes: int
    shape: tuple[int, ...]  # data shape (op_log + MemoryStore use)
    dtype: str
    handle_id: str  # completion tracking
    # In-flight data snapshot captured at tl.send() time from the
    # TensorHandle.data field. Carries the actual numpy array that was
    # visible at recv-time (when handle.data was populated), avoiding a
    # Phase 1 race where a later IPCQ inbound overwrites the sender's
    # slot between recv and send. If None, PE_DMA outbound falls back to
    # reading MemoryStore[src_addr] (correct for sources that are never
    # overwritten, such as HBM tiles).
    data: Any = None
    data_op: bool = True  # ADR-0020 op_log recording flag
|
||||||
|
|
||||||
|
|
||||||
|
# ── D12: IpcqRecvCmd (PE_CPU → PE_IPCQ) ──────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IpcqRecvCmd:
    """``tl.recv`` command issued by the kernel to PE_IPCQ.

    Two modes (``recv_mode``):
        "return_slot" — return slot address as-is (default, zero-copy).
                        Kernel uses the slot memory directly.
        "copy_to_dst" — copy slot data to ``dst_addr``, then return.

    The instance is immutable (``frozen=True``).
    """

    direction: str | None  # None → round-robin (weak fairness, D4)
    shape: tuple[int, ...]
    dtype: str
    handle_id: str
    recv_mode: str = "return_slot"
    dst_addr: int = 0  # used only when recv_mode == "copy_to_dst"
    dst_space: str = ""  # used only when recv_mode == "copy_to_dst"
    blocking: bool = True
    data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
# ── D12: IpcqDmaToken (PE_IPCQ → PE_DMA, vc_comm) ───────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IpcqDmaToken:
    """Token sent from PE_IPCQ to PE_DMA (vc_comm channel) carrying both
    the data move request and the piggyback metadata (ADR-0023 D9).

    Receiving PE_DMA processes this atomically (I6 MUST):
        1. MemoryStore.write(dst_endpoint.buffer_kind, dst_addr, data)
        2. Forward IpcqMetaArrival(token=self) to peer PE_IPCQ
    No yield is allowed between the two steps.

    The ``data`` field is a snapshot taken by the sender's PE_DMA at the
    moment the send is issued. This preserves "in-flight data" semantics:
    if the sender mutates its source memory after issuing the send but
    before arrival, the receiver still gets the snapshot. The snapshot is
    None for control-only tokens (e.g. credit-only updates).

    Unlike the command dataclasses this one is mutable, so ``data`` can be
    filled in after construction.
    """

    # ── Data movement (single-hop DMA write) ──
    src_addr: int
    src_space: str
    dst_addr: int  # already-computed peer rx slot PA
    dst_endpoint: IpcqEndpoint  # routing target (sip/cube/pe) + buffer_kind
    nbytes: int
    handle_id: str  # completion notify back to sender PE_IPCQ
    # Optional shape/dtype carried for op_log + MemoryStore convenience.
    shape: tuple[int, ...] = ()
    dtype: str = "f16"
    # In-flight data snapshot (sender PE_DMA captures this at send time).
    data: Any = None

    # ── Piggyback metadata (D9) ──
    sender_seq: int = 0  # monotonic; receiver updates peer_head_cache
    src_sip: int = 0
    src_cube: int = 0
    src_pe: int = 0
    src_direction: str = "E"  # sender-side direction; receiver maps to its own

    data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
# ── D12: IpcqMetaArrival (PE_DMA → PE_IPCQ, intra-PE wire) ──────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IpcqMetaArrival:
    """Posted by receiving PE_DMA into the destination PE's PE_IPCQ inbox
    in the same SimPy step as the MemoryStore.write (D9, I6 MUST).

    The receiver PE_IPCQ uses ``token.sender_seq`` to update its
    peer_head_cache for the corresponding direction.
    """

    token: IpcqDmaToken  # the DMA token whose payload has just been written
|
||||||
|
|
||||||
|
|
||||||
|
# ── D12: IpcqCreditMetadata (PE_IPCQ → peer PE_IPCQ, fast path) ─────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IpcqCreditMetadata:
    """Credit return — recv-side → send-side fast path (D9).

    Sent by ``PeIpcqComponent._delayed_credit_send`` after a
    bottleneck-BW based latency, putting the metadata directly into
    the peer's pre-wired credit store (no fabric routing).

    ``dst_rx_base_pa`` is the receiver's ``my_rx_base_pa`` for the direction
    whose slot was consumed. The original sender matches this against
    ``qp.peer.rx_base_pa`` to find the correct direction (ADR-0025 D3) —
    unambiguous even when multiple directions share the same peer (e.g.
    2-rank bidirectional ring).

    The instance is immutable (``frozen=True``).
    """

    consumer_seq: int  # my_tail at recv side (new tail value)
    dst_rx_base_pa: int  # receiver-side my_rx_base_pa (ADR-0025 D3)
    src_sip: int  # which peer is sending the credit (diag)
    src_cube: int
    src_pe: int
    src_direction: str  # sender-side direction (peer maps to its own)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Request wrapper (PE_CPU → PE_IPCQ) ───────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IpcqRequest:
    """Wrapper carrying an IpcqSendCmd or IpcqRecvCmd plus a SimPy completion
    event. Posted by PE_CPU into PE_IPCQ's inbox; PE_IPCQ calls
    ``done.succeed()`` when the request is fully processed.

    For recv requests, the result (slot address, direction, dtype, shape)
    is written into ``result_data`` so the caller can read it after wait.
    """

    command: "IpcqSendCmd | IpcqRecvCmd"
    done: "simpy.Event"
    # default_factory gives every request its own dict (no sharing between
    # instances).
    result_data: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
# ── RecvFuture (kernel ↔ runner handshake for tl.recv_async / tl.wait) ─
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RecvFuture:
    """Opaque future returned by ``tl.recv_async``.

    The KernelRunner attaches a SimPy event and the IpcqRequest in the
    background; ``tl.wait(future)`` switches back to the runner which
    yields on the event and resolves the result into a TensorHandle.
    """

    cmd: "IpcqRecvCmd"
    request: Any = None  # IpcqRequest (set by runner)
    event: Any = None  # simpy.Event (set by runner)
    resolved: bool = False
    result: Any = None  # cached TensorHandle after wait()
|
||||||
@@ -33,6 +33,7 @@ class TensorHandle:
|
|||||||
dtype: str
|
dtype: str
|
||||||
nbytes: int # total byte size
|
nbytes: int # total byte size
|
||||||
data: object = None # reserved for validate mode
|
data: object = None # reserved for validate mode
|
||||||
|
space: str = "tcm" # MemoryStore space ("tcm" | "hbm" | "sram")
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -55,6 +56,7 @@ class DmaReadCmd:
|
|||||||
handle: TensorHandle
|
handle: TensorHandle
|
||||||
src_addr: int
|
src_addr: int
|
||||||
nbytes: int
|
nbytes: int
|
||||||
|
data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -64,6 +66,7 @@ class DmaWriteCmd:
|
|||||||
handle: TensorHandle
|
handle: TensorHandle
|
||||||
dst_addr: int
|
dst_addr: int
|
||||||
nbytes: int
|
nbytes: int
|
||||||
|
data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -79,6 +82,7 @@ class GemmCmd:
|
|||||||
m: int
|
m: int
|
||||||
k: int
|
k: int
|
||||||
n: int
|
n: int
|
||||||
|
data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -94,6 +98,7 @@ class MathCmd:
|
|||||||
inputs: tuple[TensorHandle, ...]
|
inputs: tuple[TensorHandle, ...]
|
||||||
out: TensorHandle
|
out: TensorHandle
|
||||||
axis: int | None = None # for reductions
|
axis: int | None = None # for reductions
|
||||||
|
data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@@ -111,6 +116,7 @@ class CompositeCmd:
|
|||||||
out_addr: int
|
out_addr: int
|
||||||
out_nbytes: int
|
out_nbytes: int
|
||||||
math_op: str | None = None # for op="math": which math operation
|
math_op: str | None = None # for op="math": which math operation
|
||||||
|
data_op: bool = True
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ class ComponentBase(ABC):
|
|||||||
self.ctx = ctx
|
self.ctx = ctx
|
||||||
self.in_ports: dict[str, simpy.Store] = {}
|
self.in_ports: dict[str, simpy.Store] = {}
|
||||||
self.out_ports: dict[str, simpy.Store] = {}
|
self.out_ports: dict[str, simpy.Store] = {}
|
||||||
|
self._op_logger: Any | None = None # OpLogger, set by GraphEngine if enabled
|
||||||
|
|
||||||
def start(self, env: simpy.Environment) -> None:
|
def start(self, env: simpy.Environment) -> None:
|
||||||
"""Called once after all ports are wired.
|
"""Called once after all ports are wired.
|
||||||
@@ -64,9 +65,21 @@ class ComponentBase(ABC):
|
|||||||
txn: Any = yield self._inbox.get()
|
txn: Any = yield self._inbox.get()
|
||||||
env.process(self._forward_txn(env, txn))
|
env.process(self._forward_txn(env, txn))
|
||||||
|
|
||||||
|
def _on_process_start(self, env: simpy.Environment, msg: Any) -> None:
    """Op log hook: record service start for data_op messages (ADR-0020 D2).

    Does nothing when no op logger is attached or when ``msg`` has no
    truthy ``data_op`` attribute. Otherwise forwards the current
    simulation time, this component's node id and the message to
    ``record_start``.
    """
    # Compare against None rather than relying on truthiness: a logger
    # object that defines __len__/__bool__ is falsy while it is still
    # empty and would otherwise never receive its first record.
    if self._op_logger is not None and getattr(msg, "data_op", False):
        self._op_logger.record_start(env.now, self.node.id, msg)
|
||||||
|
|
||||||
|
def _on_process_end(self, env: simpy.Environment, msg: Any) -> None:
    """Op log hook: record service end for data_op messages (ADR-0020 D2).

    Does nothing when no op logger is attached or when ``msg`` has no
    truthy ``data_op`` attribute. Otherwise forwards the current
    simulation time, this component's node id and the message to
    ``record_end``.
    """
    # Compare against None rather than relying on truthiness: a logger
    # object that defines __len__/__bool__ may be falsy (e.g. while it
    # holds no records) and would then silently drop the end record.
    if self._op_logger is not None and getattr(msg, "data_op", False):
        self._op_logger.record_end(env.now, self.node.id, msg)
|
||||||
|
|
||||||
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
"""Apply run() latency, then forward to next hop or drain at terminal."""
|
"""Apply run() latency, then forward to next hop or drain at terminal."""
|
||||||
|
self._on_process_start(env, txn)
|
||||||
yield from self.run(env, txn.nbytes)
|
yield from self.run(env, txn.nbytes)
|
||||||
|
self._on_process_end(env, txn)
|
||||||
next_hop = txn.next_hop # duck-typed: Transaction.next_hop
|
next_hop = txn.next_hop # duck-typed: Transaction.next_hop
|
||||||
if next_hop:
|
if next_hop:
|
||||||
yield self.out_ports[next_hop].put(txn.advance())
|
yield self.out_ports[next_hop].put(txn.advance())
|
||||||
@@ -120,10 +133,16 @@ class PeEngineBase(ComponentBase):
|
|||||||
while True:
|
while True:
|
||||||
msg: Any = yield self._inbox.get()
|
msg: Any = yield self._inbox.get()
|
||||||
if isinstance(msg, PeInternalTxn):
|
if isinstance(msg, PeInternalTxn):
|
||||||
env.process(self.handle_command(env, msg))
|
env.process(self._handle_with_hooks(env, msg))
|
||||||
else:
|
else:
|
||||||
env.process(self._forward_txn(env, msg))
|
env.process(self._forward_txn(env, msg))
|
||||||
|
|
||||||
|
def _handle_with_hooks(self, env: simpy.Environment, pe_txn: Any) -> Generator:
    """Wrap handle_command with op log hooks on the inner command.

    The start hook fires before ``handle_command`` is driven and the end
    hook after it has run to completion. Both hooks receive
    ``pe_txn.command`` (the wrapped command), not the transaction itself.
    Everything ``handle_command`` yields is passed through unchanged.
    """
    self._on_process_start(env, pe_txn.command)
    yield from self.handle_command(env, pe_txn)
    self._on_process_end(env, pe_txn.command)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
|
def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
|
||||||
"""Process a PE-internal command (PeInternalTxn).
|
"""Process a PE-internal command (PeInternalTxn).
|
||||||
|
|||||||
@@ -42,6 +42,30 @@ class PeCpuComponent(ComponentBase):
|
|||||||
self._cube_idx = int(parts[1].replace("cube", ""))
|
self._cube_idx = int(parts[1].replace("cube", ""))
|
||||||
except (IndexError, ValueError):
|
except (IndexError, ValueError):
|
||||||
self._cube_idx = 0
|
self._cube_idx = 0
|
||||||
|
# num_cubes from spec (for tl.program_id(axis=1) — ADR-0022)
|
||||||
|
spec = ctx.spec if ctx else {}
|
||||||
|
cube_mesh = spec.get("sip", {}).get("cube_mesh", {})
|
||||||
|
if cube_mesh:
|
||||||
|
self._num_cubes = int(cube_mesh.get("w", 1)) * int(cube_mesh.get("h", 1))
|
||||||
|
else:
|
||||||
|
self._num_cubes = (
|
||||||
|
spec.get("system", {}).get("sips", {}).get("cubes_per_sip", 1)
|
||||||
|
)
|
||||||
|
# PE-local scratch for kernel math output handles (ADR-0020 D3
|
||||||
|
# extension; reserved portion of TCM addressed via a synthetic
|
||||||
|
# MemoryStore key, not the real PA encoder).
|
||||||
|
pe_template = spec.get("cube", {}).get("pe_template", {})
|
||||||
|
tcm_attrs = pe_template.get("components", {}).get("pe_tcm", {}).get("attrs", {})
|
||||||
|
scratch_mb = float(tcm_attrs.get("kernel_scratch_mb", 1))
|
||||||
|
self._tl_scratch_size = int(scratch_mb * (1 << 20))
|
||||||
|
# PE-unique base address — high bit pattern to avoid collision with
|
||||||
|
# IPCQ ring buffers (which use bit 60).
|
||||||
|
self._tl_scratch_base = (
|
||||||
|
(1 << 61)
|
||||||
|
| (self._sip_idx << 40)
|
||||||
|
| (self._cube_idx << 32)
|
||||||
|
| (self._pe_idx << 24)
|
||||||
|
)
|
||||||
|
|
||||||
def _find_shard(self, shards: tuple) -> Any:
|
def _find_shard(self, shards: tuple) -> Any:
|
||||||
"""Find shard matching this PE's (sip, cube, pe). Fallback to positional index."""
|
"""Find shard matching this PE's (sip, cube, pe). Fallback to positional index."""
|
||||||
@@ -65,24 +89,45 @@ class PeCpuComponent(ComponentBase):
|
|||||||
yield from self._forward_txn(env, txn)
|
yield from self._forward_txn(env, txn)
|
||||||
|
|
||||||
def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
|
def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
"""Compile kernel function and replay command trace."""
|
"""Execute kernel: greenlet mode (ADR-0020) or legacy Phase 0 + replay."""
|
||||||
from kernbench.common.pe_commands import (
|
|
||||||
CompositeCmd,
|
|
||||||
PeCpuOverheadCmd,
|
|
||||||
PeInternalTxn,
|
|
||||||
WaitCmd,
|
|
||||||
)
|
|
||||||
from kernbench.triton_emu.registry import get_kernel
|
from kernbench.triton_emu.registry import get_kernel
|
||||||
from kernbench.triton_emu.tl_context import TLContext, run_kernel
|
|
||||||
|
|
||||||
request = txn.request
|
request = txn.request
|
||||||
|
|
||||||
# Phase 1: Compile — apply PE_CPU setup overhead, then run kernel
|
|
||||||
yield from self.run(env, 0)
|
yield from self.run(env, 0)
|
||||||
|
|
||||||
kernel_fn = get_kernel(request.kernel_ref.name)
|
kernel_fn = get_kernel(request.kernel_ref.name)
|
||||||
|
num_programs = self._derive_num_programs(request)
|
||||||
|
kernel_args = self._unpack_kernel_args(request)
|
||||||
|
|
||||||
# Derive num_programs from the number of PE shards in this cube
|
pe_exec_start = env.now
|
||||||
|
scheduler_id = f"{self._pe_prefix}.pe_scheduler"
|
||||||
|
|
||||||
|
# Choose execution mode: greenlet (ADR-0020) or legacy command-list
|
||||||
|
store = getattr(self.ctx, "memory_store", None) if self.ctx else None
|
||||||
|
|
||||||
|
if store is not None:
|
||||||
|
composite_results = yield from self._execute_greenlet(
|
||||||
|
env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
composite_results = yield from self._execute_legacy(
|
||||||
|
env, kernel_fn, kernel_args, num_programs, scheduler_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Record PE-internal execution time
|
||||||
|
txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
|
||||||
|
total_dma_ns = 0.0
|
||||||
|
total_compute_ns = 0.0
|
||||||
|
for rd in composite_results:
|
||||||
|
total_dma_ns += rd.get("dma_ns", 0.0)
|
||||||
|
total_compute_ns += rd.get("compute_ns", 0.0)
|
||||||
|
txn.result_data["dma_ns"] = total_dma_ns
|
||||||
|
txn.result_data["compute_ns"] = total_compute_ns
|
||||||
|
|
||||||
|
# Send ResponseMsg on reverse path
|
||||||
|
yield from self._send_response(env, txn, request)
|
||||||
|
|
||||||
|
def _derive_num_programs(self, request: Any) -> int:
|
||||||
num_programs = 1
|
num_programs = 1
|
||||||
for arg in request.args:
|
for arg in request.args:
|
||||||
if arg.arg_kind == "tensor":
|
if arg.arg_kind == "tensor":
|
||||||
@@ -92,11 +137,9 @@ class PeCpuComponent(ComponentBase):
|
|||||||
)
|
)
|
||||||
if cube_pe_count > num_programs:
|
if cube_pe_count > num_programs:
|
||||||
num_programs = cube_pe_count
|
num_programs = cube_pe_count
|
||||||
|
return num_programs
|
||||||
|
|
||||||
tl = TLContext(pe_id=self._pe_idx, num_programs=num_programs, dispatch_cycles=0)
|
def _unpack_kernel_args(self, request: Any) -> list:
|
||||||
|
|
||||||
# Unpack KernelLaunchMsg.args into positional args for kernel function
|
|
||||||
# TensorArg → va_base (already local, set by runtime) or PA fallback
|
|
||||||
kernel_args: list = []
|
kernel_args: list = []
|
||||||
for arg in request.args:
|
for arg in request.args:
|
||||||
if arg.arg_kind == "tensor":
|
if arg.arg_kind == "tensor":
|
||||||
@@ -107,15 +150,48 @@ class PeCpuComponent(ComponentBase):
|
|||||||
kernel_args.append(shard.pa)
|
kernel_args.append(shard.pa)
|
||||||
elif arg.arg_kind == "scalar":
|
elif arg.arg_kind == "scalar":
|
||||||
kernel_args.append(arg.value)
|
kernel_args.append(arg.value)
|
||||||
|
return kernel_args
|
||||||
|
|
||||||
|
def _execute_greenlet(
    self, env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
) -> Generator:
    """Greenlet-based execution (ADR-0020 D3): kernel ↔ SimPy interleaved.

    Builds a ``KernelRunner`` from this PE's identity (prefix and
    sip/cube/pe indices), the cube count, the scheduler id, the output
    ports, the given memory ``store`` and the PE scratch window, then
    delegates to ``runner.run``.

    Returns:
        The runner's ``_composite_results`` attribute, or an empty list
        when the runner does not expose one.
    """
    # Imported lazily inside the method, as in the original code.
    from kernbench.triton_emu.kernel_runner import KernelRunner

    runner = KernelRunner(
        pe_prefix=self._pe_prefix,
        pe_idx=self._pe_idx,
        sip_idx=self._sip_idx,
        cube_idx=self._cube_idx,
        num_cubes=self._num_cubes,
        scheduler_id=scheduler_id,
        out_ports=self.out_ports,
        store=store,
        scratch_base=self._tl_scratch_base,
        scratch_size=self._tl_scratch_size,
    )
    yield from runner.run(env, kernel_fn, kernel_args, num_programs)
    return getattr(runner, "_composite_results", [])
|
||||||
|
|
||||||
|
def _execute_legacy(
|
||||||
|
self, env, kernel_fn, kernel_args, num_programs, scheduler_id,
|
||||||
|
) -> Generator:
|
||||||
|
"""Legacy Phase 0 + replay: generate command list, then dispatch."""
|
||||||
|
from kernbench.common.pe_commands import (
|
||||||
|
CompositeCmd, PeCpuOverheadCmd, PeInternalTxn, WaitCmd,
|
||||||
|
)
|
||||||
|
from kernbench.triton_emu.tl_context import TLContext, run_kernel
|
||||||
|
|
||||||
|
tl = TLContext(
|
||||||
|
pe_id=self._pe_idx, num_programs=num_programs,
|
||||||
|
cube_id=self._cube_idx, num_cubes=self._num_cubes,
|
||||||
|
dispatch_cycles=0,
|
||||||
|
)
|
||||||
run_kernel(kernel_fn, tl, *kernel_args)
|
run_kernel(kernel_fn, tl, *kernel_args)
|
||||||
commands = tl.commands
|
commands = tl.commands
|
||||||
|
|
||||||
# Phase 2: Replay — dispatch commands to PE_SCHEDULER
|
pending: dict[str, simpy.Event] = {}
|
||||||
pe_exec_start = env.now
|
composite_results: list[dict] = []
|
||||||
scheduler_id = f"{self._pe_prefix}.pe_scheduler"
|
|
||||||
pending: dict[str, simpy.Event] = {} # completion_id → done event
|
|
||||||
composite_results: list[dict] = [] # collect result_data from CompositeCmd txns
|
|
||||||
|
|
||||||
for cmd in commands:
|
for cmd in commands:
|
||||||
if isinstance(cmd, PeCpuOverheadCmd):
|
if isinstance(cmd, PeCpuOverheadCmd):
|
||||||
@@ -126,47 +202,30 @@ class PeCpuComponent(ComponentBase):
|
|||||||
if evt:
|
if evt:
|
||||||
yield evt
|
yield evt
|
||||||
else:
|
else:
|
||||||
# Wait all pending completions
|
|
||||||
for evt in pending.values():
|
for evt in pending.values():
|
||||||
yield evt
|
yield evt
|
||||||
pending.clear()
|
pending.clear()
|
||||||
elif isinstance(cmd, CompositeCmd):
|
elif isinstance(cmd, CompositeCmd):
|
||||||
# Non-blocking: dispatch to scheduler, track completion
|
|
||||||
done_evt = env.event()
|
done_evt = env.event()
|
||||||
pe_txn = PeInternalTxn(
|
pe_txn = PeInternalTxn(
|
||||||
command=cmd, done=done_evt,
|
command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
|
||||||
pe_prefix=self._pe_prefix,
|
|
||||||
)
|
)
|
||||||
composite_results.append(pe_txn.result_data)
|
composite_results.append(pe_txn.result_data)
|
||||||
yield self.out_ports[scheduler_id].put(pe_txn)
|
yield self.out_ports[scheduler_id].put(pe_txn)
|
||||||
pending[cmd.completion.id] = done_evt
|
pending[cmd.completion.id] = done_evt
|
||||||
else:
|
else:
|
||||||
# Blocking: dispatch and wait for completion
|
|
||||||
done_evt = env.event()
|
done_evt = env.event()
|
||||||
pe_txn = PeInternalTxn(
|
pe_txn = PeInternalTxn(
|
||||||
command=cmd, done=done_evt,
|
command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
|
||||||
pe_prefix=self._pe_prefix,
|
|
||||||
)
|
)
|
||||||
yield self.out_ports[scheduler_id].put(pe_txn)
|
yield self.out_ports[scheduler_id].put(pe_txn)
|
||||||
yield done_evt
|
yield done_evt
|
||||||
|
|
||||||
# Wait for any remaining pending completions
|
|
||||||
for evt in pending.values():
|
for evt in pending.values():
|
||||||
yield evt
|
yield evt
|
||||||
|
return composite_results
|
||||||
|
|
||||||
# Record PE-internal execution time
|
def _send_response(self, env, txn, request) -> Generator:
|
||||||
txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
|
|
||||||
|
|
||||||
# Aggregate dma_ns / compute_ns from CompositeCmd results
|
|
||||||
total_dma_ns = 0.0
|
|
||||||
total_compute_ns = 0.0
|
|
||||||
for rd in composite_results:
|
|
||||||
total_dma_ns += rd.get("dma_ns", 0.0)
|
|
||||||
total_compute_ns += rd.get("compute_ns", 0.0)
|
|
||||||
txn.result_data["dma_ns"] = total_dma_ns
|
|
||||||
txn.result_data["compute_ns"] = total_compute_ns
|
|
||||||
|
|
||||||
# Send ResponseMsg on reverse path (PE_CPU → NOC → M_CPU)
|
|
||||||
reverse_path = list(reversed(txn.path))
|
reverse_path = list(reversed(txn.path))
|
||||||
if len(reverse_path) >= 2:
|
if len(reverse_path) >= 2:
|
||||||
from kernbench.runtime_api.kernel import ResponseMsg
|
from kernbench.runtime_api.kernel import ResponseMsg
|
||||||
|
|||||||
@@ -105,6 +105,203 @@ class PeDmaComponent(PeEngineBase):
|
|||||||
yield sub_done
|
yield sub_done
|
||||||
pe_txn.done.succeed()
|
pe_txn.done.succeed()
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Handle TileToken (pipeline), PeInternalTxn (legacy), IpcqDmaToken,
    and Transaction (fabric).

    Runs forever: takes one message at a time from the inbox and spawns a
    separate SimPy process for it, chosen by message type, so the loop
    itself never waits on a handler.
    """
    from kernbench.common.ipcq_types import IpcqDmaToken
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.builtin.pe_types import TileToken

    while True:
        msg: Any = yield self._inbox.get()
        if isinstance(msg, IpcqDmaToken):
            # Outbound: IPCQ token from local PE_IPCQ → forward via fabric
            env.process(self._handle_ipcq_outbound(env, msg))
        elif isinstance(msg, TileToken):
            env.process(self._pipeline_process(env, msg))
        elif isinstance(msg, PeInternalTxn):
            env.process(self._handle_with_hooks(env, msg))
        else:
            # Transaction (or unknown). May carry IpcqDmaToken inbound.
            req = getattr(msg, "request", None)
            if isinstance(req, IpcqDmaToken):
                env.process(self._handle_ipcq_inbound(env, msg))
            else:
                env.process(self._forward_txn(env, msg))
|
||||||
|
|
||||||
|
# ── IPCQ outbound (PE_IPCQ → PE_DMA → fabric) ───────────────────
|
||||||
|
|
||||||
|
def _handle_ipcq_outbound(self, env: simpy.Environment, token: Any) -> Generator:
|
||||||
|
"""Forward IpcqDmaToken from local PE_IPCQ through the fabric to peer
|
||||||
|
PE_DMA. ADR-0023 D8 (vc_comm channel)."""
|
||||||
|
if self.ctx is None:
|
||||||
|
return # nothing to do
|
||||||
|
peer = token.dst_endpoint
|
||||||
|
peer_pe_dma = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}.pe_dma"
|
||||||
|
|
||||||
|
# Snapshot the source data at send time (D9 in-flight semantics).
|
||||||
|
# Without this, the receiver could read stale or future data if the
|
||||||
|
# sender mutates src_addr between send issue and DMA arrival.
|
||||||
|
store = getattr(self.ctx, "memory_store", None)
|
||||||
|
if store is not None and token.data is None:
|
||||||
|
try:
|
||||||
|
snap = store.read(
|
||||||
|
token.src_space, token.src_addr,
|
||||||
|
shape=token.shape, dtype=token.dtype,
|
||||||
|
)
|
||||||
|
# Copy so later mutations to src_addr don't affect the snapshot.
|
||||||
|
token.data = snap.copy() if hasattr(snap, "copy") else snap
|
||||||
|
except Exception:
|
||||||
|
token.data = None
|
||||||
|
|
||||||
|
# Note: ipcq_copy is recorded at INBOUND time (in _handle_ipcq_inbound),
|
||||||
|
# not here. Outbound time is too early — it precedes fabric propagation,
|
||||||
|
# so in Phase 2 a later round's copy can sort before the receiver's
|
||||||
|
# math for an earlier round, causing slot data corruption.
|
||||||
|
# The secondary sort in DataExecutor (memory ops before math at the
|
||||||
|
# same t_start) ensures the inbound copy runs before the local math
|
||||||
|
# that reads the slot.
|
||||||
|
|
||||||
|
try:
|
||||||
|
path = self.ctx.router.find_path(self._pe_prefix, peer_pe_dma)
|
||||||
|
except Exception:
|
||||||
|
return
|
||||||
|
drain_ns = self.ctx.compute_drain_ns(path, token.nbytes)
|
||||||
|
|
||||||
|
sub_done = env.event()
|
||||||
|
sub_txn = Transaction(
|
||||||
|
request=token, path=path, step=0,
|
||||||
|
nbytes=token.nbytes, done=sub_done, drain_ns=drain_ns,
|
||||||
|
)
|
||||||
|
if len(path) > 1:
|
||||||
|
next_hop = path[1]
|
||||||
|
if next_hop in self.out_ports:
|
||||||
|
yield self.out_ports[next_hop].put(sub_txn.advance())
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
# Note: don't wait on sub_done here — fire-and-forget for vc_comm.
|
||||||
|
# IPCQ slot bookkeeping (peer_head) was already updated by PE_IPCQ;
|
||||||
|
# backpressure is via credit return, not via this DMA's completion.
|
||||||
|
|
||||||
|
# ── IPCQ inbound (fabric → PE_DMA → MemoryStore + PE_IPCQ) ──────
|
||||||
|
|
||||||
|
def _handle_ipcq_inbound(self, env: simpy.Environment, txn: Any) -> Generator:
    """At the destination PE_DMA: write the data, then forward the metadata.

    I6 (MUST): no SimPy yield between MemoryStore.write and the
    IpcqMetaArrival put into PE_IPCQ.

    Failures of the memory write and of the op-log record are swallowed.
    ``txn.done`` is succeeded at the end unless it has already triggered.
    """
    from kernbench.common.ipcq_types import IpcqMetaArrival

    token = txn.request

    # ── ATOMIC: do not introduce a yield before the put below ──
    # 1. Move data via MemoryStore (single-hop DMA write). Prefer the
    #    in-flight snapshot stashed by the sender PE_DMA; fall back to a
    #    fresh read of src_addr when no snapshot is present
    #    (e.g. control-only token).
    memory = None
    if self.ctx:
        memory = getattr(self.ctx, "memory_store", None)
    if memory is not None:
        try:
            payload = token.data
            if payload is None:
                payload = memory.read(
                    token.src_space, token.src_addr,
                    shape=token.shape, dtype=token.dtype,
                )
            memory.write(token.dst_endpoint.buffer_kind, token.dst_addr, payload)
        except Exception:
            pass

    # Record the IPCQ copy at INBOUND time with the embedded data snapshot.
    # The snapshot (token.data) was captured by the sender's outbound PE_DMA
    # at send time. Phase 2 writes the snapshot directly to dst; it does NOT
    # re-read MemoryStore[src_addr], which may have been mutated by another
    # PE's Phase 2 ops by then. DataExecutor's secondary sort (memory before
    # math at the same t_start) ensures the write completes before the local
    # math that reads the slot.
    op_logger = self._op_logger
    if op_logger is not None:
        try:
            op_logger.record_copy(
                t_start=float(env.now), t_end=float(env.now),
                component_id=self.node.id,
                src_space=token.src_space, src_addr=token.src_addr,
                dst_space=token.dst_endpoint.buffer_kind,
                dst_addr=token.dst_addr,
                shape=token.shape, dtype=token.dtype, nbytes=token.nbytes,
                snapshot=token.data,
            )
        except Exception:
            pass

    # 2. Forward IpcqMetaArrival to the local PE_IPCQ (if it is wired).
    local_ipcq = f"{self._pe_prefix}.pe_ipcq"
    if local_ipcq in self.out_ports:
        yield self.out_ports[local_ipcq].put(IpcqMetaArrival(token=token))
    # ── end of atomic section ──

    completion = txn.done
    if not completion.triggered:
        completion.succeed()
|
||||||
|
|
||||||
|
def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
|
||||||
|
"""Pipeline mode: DMA read/write via fabric, then self-route."""
|
||||||
|
self._on_process_start(env, token)
|
||||||
|
yield from self._do_pipeline_dma(env, token)
|
||||||
|
self._on_process_end(env, token)
|
||||||
|
|
||||||
|
# Self-routing (handle same-component consecutive stages)
|
||||||
|
next_stage = token.advance()
|
||||||
|
while next_stage is not None and next_stage.component == self.node.id:
|
||||||
|
self._on_process_start(env, token)
|
||||||
|
yield from self._do_pipeline_dma(env, token)
|
||||||
|
self._on_process_end(env, token)
|
||||||
|
next_stage = token.advance()
|
||||||
|
|
||||||
|
if next_stage is not None:
|
||||||
|
yield self.out_ports[next_stage.component].put(token)
|
||||||
|
else:
|
||||||
|
token.pipeline_ctx.complete_tile()
|
||||||
|
|
||||||
|
def _do_pipeline_dma(self, env, token):
    """Core DMA logic for pipeline mode.

    A ``DMA_WRITE`` stage uses ``params["dst_addr"]`` and the write channel;
    any other stage uses ``params["src_addr"]`` and the read channel.
    Nothing happens unless ``params["nbytes"]`` is positive and a context is
    attached. Otherwise the channel is held while the request is sent along
    the routed path and until its completion event fires.
    """
    from kernbench.components.builtin.pe_types import StageType
    from kernbench.policy.address.phyaddr import PhysAddr
    from kernbench.runtime_api.kernel import PeDmaMsg

    params = token.params
    is_write = token.current_stage.stage_type == StageType.DMA_WRITE
    addr_key = "dst_addr" if is_write else "src_addr"
    addr = params.get(addr_key, 0)
    nbytes = params.get("nbytes", 0)

    if not (nbytes > 0 and self.ctx):
        return

    channel = self._dma_write if is_write else self._dma_read
    assert channel is not None

    # Decode the address, resolve its owning node and route to it.
    target_node = self.ctx.resolver.resolve(PhysAddr.decode(addr))
    route = self.ctx.router.find_path(self._pe_prefix, target_node)
    drain_ns = self.ctx.compute_drain_ns(route, nbytes)

    with channel.request() as grant:
        yield grant
        completion = env.event()
        message = PeDmaMsg(
            correlation_id="pipeline",
            request_id=f"tile_{token.tile_id}",
            src_sip=0, src_cube=0, src_pe=0,
            dst_pa=addr, nbytes=nbytes,
            is_write=is_write,
        )
        fabric_txn = Transaction(
            request=message, path=route, step=0,
            nbytes=nbytes, done=completion, drain_ns=drain_ns,
        )
        if len(route) > 1:
            yield self.out_ports[route[1]].put(fabric_txn.advance())

        # NOTE(review): reconstructed as waiting inside the channel grant,
        # i.e. the channel stays occupied until completion; confirm
        # against the original indentation.
        yield completion
|
||||||
|
|
||||||
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
"""Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
|
"""Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
|
||||||
# Response transactions bypass DMA channel (no outbound resource needed)
|
# Response transactions bypass DMA channel (no outbound resource needed)
|
||||||
|
|||||||
@@ -0,0 +1,77 @@
|
|||||||
|
"""PE_FETCH_STORE: TCM ↔ Register File transfer unit (ADR-0021 D5).
|
||||||
|
|
||||||
|
Handles both fetch (TCM → register) and store (register → TCM).
|
||||||
|
BW serialization is delegated to PE_TCM via port communication.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeFetchStoreComponent(PeEngineBase):
    """PE_FETCH_STORE: TCM ↔ Register File transfer unit (ADR-0021 D5).

    Accepts TileTokens (pipeline mode) and PeInternalTxn (legacy mode).
    For a pipeline token it sends a TcmRequest to PE_TCM and waits for that
    request's completion event before advancing the token.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Initialise the base engine and derive the sibling PE_TCM node id."""
        super().__init__(node, ctx)
        self._tcm_id = f"{self._pe_prefix}.pe_tcm"

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the node's ``overhead_ns`` attribute (default 0.0).

        ``nbytes`` is not used by this method.
        """
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Handle both PeInternalTxn (legacy) and TileToken (pipeline).

        Any other message goes to ``_forward_txn``. Each message is handled
        in its own SimPy process.
        """
        from kernbench.common.pe_commands import PeInternalTxn
        from kernbench.components.builtin.pe_types import TileToken

        while True:
            incoming: Any = yield self._inbox.get()
            if isinstance(incoming, TileToken):
                handler = self._pipeline_process
            elif isinstance(incoming, PeInternalTxn):
                handler = self.handle_command
            else:
                handler = self._forward_txn
            env.process(handler(env, incoming))

    def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
        """Process a pipeline TileToken: fetch or store via TCM."""
        from kernbench.components.builtin.pe_tcm import TcmRequest

        self._on_process_start(env, token)

        params = token.params
        direction = params.get("direction", "read")
        nbytes = params.get("nbytes", 0)

        # Only talk to PE_TCM when there is data to move and the port exists.
        if nbytes > 0 and self._tcm_id in self.out_ports:
            finished = env.event()
            tcm_request = TcmRequest(direction=direction, nbytes=nbytes, done=finished)
            yield self.out_ports[self._tcm_id].put(tcm_request)
            yield finished

        self._on_process_end(env, token)

        # Self-routing: hand the token to the next stage, or finish the tile.
        upcoming = token.advance()
        if upcoming is None:
            token.pipeline_ctx.complete_tile()
        else:
            yield self.out_ports[upcoming.component].put(token)

    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Legacy PeInternalTxn handling: wait the overhead, then signal done."""
        yield from self.run(env, 0)
        pe_txn.done.succeed()
|
||||||
@@ -1,6 +1,18 @@
|
|||||||
|
"""PE_GEMM: matrix multiplication engine (ADR-0021 D6).
|
||||||
|
|
||||||
|
Handles both legacy PeInternalTxn (GemmCmd) and pipeline TileToken.
|
||||||
|
In pipeline mode, receives token after fetch stage, computes MAC, chains to next.
|
||||||
|
|
||||||
|
MAC latency model (from pe_accel):
|
||||||
|
cycles = ceil(Tm/mac_m) * ceil(Tk/mac_k) * ceil(Tn/mac_n)
|
||||||
|
latency_ns = cycles / clock_freq_ghz
|
||||||
|
|
||||||
|
Falls back to TFLOPS model when mac dimensions not configured.
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
|
from math import ceil
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
import simpy
|
import simpy
|
||||||
@@ -12,33 +24,29 @@ if TYPE_CHECKING:
|
|||||||
from kernbench.components.context import ComponentContext
|
from kernbench.components.context import ComponentContext
|
||||||
from kernbench.topology.types import Node
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
# dtype → bit width (for TFLOPS scaling)
|
|
||||||
_DTYPE_BITS: dict[str, int] = {
|
_DTYPE_BITS: dict[str, int] = {
|
||||||
"f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
|
"f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
|
||||||
"f32": 32, "fp32": 32, "float32": 32,
|
"f32": 32, "fp32": 32, "float32": 32,
|
||||||
"i8": 8, "int8": 8,
|
"i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
|
||||||
"i16": 16, "int16": 16,
|
|
||||||
"i32": 32, "int32": 32,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class PeGemmComponent(PeEngineBase):
|
class PeGemmComponent(PeEngineBase):
|
||||||
"""PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
|
"""PE_GEMM: MAC array (ADR-0021 D6).
|
||||||
|
|
||||||
Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
|
In pipeline mode: pure compute — register data already fetched.
|
||||||
exclusive with PE_MATH within the same PE.
|
In legacy mode: handles PeInternalTxn(GemmCmd) with shared accel_slot.
|
||||||
|
|
||||||
Compute latency model:
|
|
||||||
FLOPs = 2 * M * K * N
|
|
||||||
effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
|
|
||||||
compute_ns = FLOPs / (effective_tflops * 1e3)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
    """Read the latency-model parameters from the node attributes.

    ``peak_tflops_f16`` defaults to 0.0, the MAC dimensions ``mac_m`` /
    ``mac_k`` / ``mac_n`` default to 0, and ``clock_freq_ghz`` defaults
    to 1.0.
    """
    super().__init__(node, ctx)
    attrs = node.attrs

    self._accel: simpy.Resource | None = None
    self._peak_tflops_f16: float = float(attrs.get("peak_tflops_f16", 0.0))

    # Cycle-accurate MAC dimensions (from pe_accel)
    self._mac_m: int = int(attrs.get("mac_m", 0))
    self._mac_k: int = int(attrs.get("mac_k", 0))
    self._mac_n: int = int(attrs.get("mac_n", 0))
    self._clock_freq: float = float(attrs.get("clock_freq_ghz", 1.0))
|
||||||
|
|
||||||
def init_resources(self, env: simpy.Environment) -> None:
|
def init_resources(self, env: simpy.Environment) -> None:
|
||||||
resource_name = self.node.attrs.get("shared_resource")
|
resource_name = self.node.attrs.get("shared_resource")
|
||||||
@@ -47,8 +55,15 @@ class PeGemmComponent(PeEngineBase):
|
|||||||
env, f"{self._pe_prefix}.{resource_name}"
|
env, f"{self._pe_prefix}.{resource_name}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
|
def _compute_ns_mac(self, m: int, k: int, n: int) -> float:
|
||||||
"""Compute GEMM latency in nanoseconds."""
|
"""Cycle-accurate MAC latency (pe_accel model)."""
|
||||||
|
if self._mac_m > 0 and self._mac_k > 0 and self._mac_n > 0:
|
||||||
|
cycles = ceil(m / self._mac_m) * ceil(k / self._mac_k) * ceil(n / self._mac_n)
|
||||||
|
return cycles / self._clock_freq
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
def _compute_ns_tflops(self, m: int, k: int, n: int, dtype: str = "f16") -> float:
|
||||||
|
"""TFLOPS-based latency (legacy model)."""
|
||||||
if self._peak_tflops_f16 <= 0:
|
if self._peak_tflops_f16 <= 0:
|
||||||
return float(self.node.attrs.get("overhead_ns", 0.0))
|
return float(self.node.attrs.get("overhead_ns", 0.0))
|
||||||
dtype_bits = _DTYPE_BITS.get(dtype, 16)
|
dtype_bits = _DTYPE_BITS.get(dtype, 16)
|
||||||
@@ -56,11 +71,58 @@ class PeGemmComponent(PeEngineBase):
|
|||||||
flops = 2.0 * m * k * n
|
flops = 2.0 * m * k * n
|
||||||
return flops / (effective_tflops * 1e3)
|
return flops / (effective_tflops * 1e3)
|
||||||
|
|
||||||
|
def _compute_ns(self, m: int, k: int, n: int, dtype: str = "f16") -> float:
|
||||||
|
"""Choose best available latency model."""
|
||||||
|
mac_ns = self._compute_ns_mac(m, k, n)
|
||||||
|
if mac_ns > 0:
|
||||||
|
return mac_ns
|
||||||
|
return self._compute_ns_tflops(m, k, n, dtype)
|
||||||
|
|
||||||
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
    """Wait for the node's ``overhead_ns`` attribute (default 0.0).

    ``nbytes`` is not used by this method.
    """
    delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
    yield env.timeout(delay_ns)
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Dispatch inbox messages, each in its own SimPy process.

    ``TileToken`` -> ``_pipeline_process``; ``PeInternalTxn`` ->
    ``_handle_with_hooks``; anything else -> ``_forward_txn``.
    """
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.builtin.pe_types import TileToken

    while True:
        incoming: Any = yield self._inbox.get()
        if isinstance(incoming, TileToken):
            handler = self._pipeline_process
        elif isinstance(incoming, PeInternalTxn):
            handler = self._handle_with_hooks
        else:
            handler = self._forward_txn
        env.process(handler(env, incoming))
|
||||||
|
|
||||||
|
def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
|
||||||
|
"""Pipeline mode: pure MAC compute, then self-route."""
|
||||||
|
self._on_process_start(env, token)
|
||||||
|
|
||||||
|
m = token.params.get("m", 0)
|
||||||
|
k = token.params.get("k", 0)
|
||||||
|
n = token.params.get("n", 0)
|
||||||
|
|
||||||
|
if self._accel:
|
||||||
|
with self._accel.request() as req:
|
||||||
|
yield req
|
||||||
|
ns = self._compute_ns(m, k, n)
|
||||||
|
yield env.timeout(ns)
|
||||||
|
else:
|
||||||
|
ns = self._compute_ns(m, k, n)
|
||||||
|
yield env.timeout(ns)
|
||||||
|
|
||||||
|
self._on_process_end(env, token)
|
||||||
|
|
||||||
|
# Self-routing
|
||||||
|
next_stage = token.advance()
|
||||||
|
if next_stage is not None:
|
||||||
|
yield self.out_ports[next_stage.component].put(token)
|
||||||
|
else:
|
||||||
|
token.pipeline_ctx.complete_tile()
|
||||||
|
|
||||||
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
||||||
|
"""Legacy PeInternalTxn handling."""
|
||||||
from kernbench.common.pe_commands import GemmCmd
|
from kernbench.common.pe_commands import GemmCmd
|
||||||
|
|
||||||
cmd = pe_txn.command
|
cmd = pe_txn.command
|
||||||
@@ -81,7 +143,6 @@ class PeGemmComponent(PeEngineBase):
|
|||||||
pe_txn.done.succeed()
|
pe_txn.done.succeed()
|
||||||
|
|
||||||
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
"""Transaction forwarding with accel_slot acquisition."""
|
|
||||||
if self._accel:
|
if self._accel:
|
||||||
with self._accel.request() as req:
|
with self._accel.request() as req:
|
||||||
yield req
|
yield req
|
||||||
|
|||||||
@@ -0,0 +1,479 @@
|
|||||||
|
"""PE_IPCQ component (ADR-0023): per-PE IPCQ control plane.
|
||||||
|
|
||||||
|
Responsibilities:
|
||||||
|
- Hold per-direction queue pair state (my_head, my_tail,
|
||||||
|
peer_head_cache, peer_tail_cache, ring buffer addresses)
|
||||||
|
- Process IpcqInitMsg from backend to install neighbor table
|
||||||
|
- Handle IpcqRequest(IpcqSendCmd) from PE_CPU:
|
||||||
|
compute peer slot address, check backpressure, forward
|
||||||
|
IpcqDmaToken to PE_DMA (vc_comm)
|
||||||
|
- Handle IpcqRequest(IpcqRecvCmd) from PE_CPU:
|
||||||
|
wait for data arrival, return slot address (or copy to dst),
|
||||||
|
send fast-path credit return
|
||||||
|
- Handle IpcqMetaArrival from PE_DMA: update peer_head_cache, wake recv
|
||||||
|
- Handle IpcqCreditMetadata via own credit_inbox: update peer_tail_cache,
|
||||||
|
wake send
|
||||||
|
|
||||||
|
PE_IPCQ does NOT move data — it forwards IpcqDmaToken to PE_DMA which
|
||||||
|
performs the actual fabric DMA.
|
||||||
|
|
||||||
|
Credit return uses a fast path: PE_IPCQ creates a SimPy process with a
|
||||||
|
bottleneck-BW based latency, then puts IpcqCreditMetadata directly into
|
||||||
|
the peer's pre-wired credit_store.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.common.ipcq_types import (
|
||||||
|
IpcqCreditMetadata,
|
||||||
|
IpcqDmaToken,
|
||||||
|
IpcqInvalidDirection,
|
||||||
|
IpcqMetaArrival,
|
||||||
|
IpcqRecvCmd,
|
||||||
|
IpcqRequest,
|
||||||
|
IpcqSendCmd,
|
||||||
|
)
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.runtime_api.kernel import IpcqInitMsg
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
_DIR_ORDER: tuple[str, ...] = ("N", "S", "E", "W", "parent", "child_left", "child_right")
|
||||||
|
|
||||||
|
|
||||||
|
class PeIpcqComponent(ComponentBase):
|
||||||
|
"""PE_IPCQ: ring buffer pointer + neighbor management for CCL.
|
||||||
|
|
||||||
|
Owned by one PE; talks to PE_DMA via out_ports[<pe_dma_id>] and
|
||||||
|
receives credit return metadata via the public ``credit_inbox``
|
||||||
|
SimPy Store (wired by backend at IpcqInitMsg installation time).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
    """Set up empty IPCQ state; neighbors are installed later.

    The PE coordinates (sip, cube, pe) are parsed from the node id, e.g.
    ``"sip0.cube0.pe0.pe_ipcq"``. A coordinate that is missing or not an
    integer falls back to 0.
    """
    super().__init__(node, ctx)
    # Strict shape/dtype validation (D14 F2). Off by default.
    self._strict: bool = bool(node.attrs.get("strict_validation", False))
    # direction → list of received tokens (for strict-mode peek of next slot)
    self._arrived_tokens: dict[str, list] = {}

    # Everything before the last dot of the node id is the PE prefix.
    self._pe_prefix: str = node.id.rsplit(".", 1)[0]
    segments = self._pe_prefix.split(".")

    def _coordinate(position: int, tag: str) -> int:
        """Parse one ``<tag><int>`` segment of the prefix, defaulting to 0."""
        try:
            return int(segments[position].replace(tag, ""))
        except (IndexError, ValueError):
            return 0

    self._self_sip = _coordinate(0, "sip")
    self._self_cube = _coordinate(1, "cube")
    self._self_pe = _coordinate(2, "pe")

    self._dma_node_id = f"{self._pe_prefix}.pe_dma"
    # direction → state dict (see _install_neighbors for shape)
    self._queue_pairs: dict[str, dict[str, Any]] = {}
    self._installed = False
    self._buffer_kind: str = "tcm"
    self._backpressure_mode: str = "sleep"
    self._credit_size_bytes: int = 16
    # waiters for recv (per direction) and any-direction (for round-robin)
    self._recv_waiters: dict[str, list[simpy.Event]] = {}
    self._any_recv_waiters: list[simpy.Event] = []
    # waiters for send backpressure (per direction)
    self._send_waiters: dict[str, list[simpy.Event]] = {}
    # round-robin cursor over installed directions
    self._rr_dirs: list[str] = []
    self._rr_cursor: int = 0
    # credit_inbox is created in start() once env is available
    self._credit_inbox: simpy.Store | None = None
|
||||||
|
|
||||||
|
# ── Public ──
|
||||||
|
|
||||||
|
@property
def credit_inbox(self) -> simpy.Store:
    """Store on which this PE receives credit-return metadata.

    The backend wires it as ``peer_credit_store`` on every remote sender
    targeting this PE (D9 fast path). Accessing it before ``start()`` has
    created the store fails the assertion below.
    """
    inbox = self._credit_inbox
    assert inbox is not None, "PE_IPCQ not started yet"
    return inbox
|
||||||
|
|
||||||
|
@property
def queue_pairs(self) -> dict[str, dict[str, Any]]:
    """Test/debug accessor: the live direction → queue-pair state mapping.

    The internal dict is returned as-is, not a copy.
    """
    return self._queue_pairs
|
||||||
|
|
||||||
|
# ── Lifecycle ──
|
||||||
|
|
||||||
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
    """Yield a single zero-length timeout; ``nbytes`` is not used."""
    yield env.timeout(0)
|
||||||
|
|
||||||
|
def start(self, env: simpy.Environment) -> None:
    """Create the stores this component needs, then launch its processes.

    The credit inbox is created once. When no in_ports were wired (e.g. a
    unit test) an inbox store is created here. After the base-class start,
    the credit worker process is launched.
    """
    # Create credit_inbox even if there are no in_ports yet.
    if self._credit_inbox is None:
        self._credit_inbox = simpy.Store(env)

    # If no in_ports were wired (e.g. unit test), still spin up workers.
    unwired = not self.in_ports
    if unwired:
        self._inbox = simpy.Store(env)

    super().start(env)
    env.process(self._credit_worker(env))
|
||||||
|
|
||||||
|
# ── Worker (override of ComponentBase._worker) ──
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Main inbox loop (override of ComponentBase._worker).

    Handling, checked in this order:

    * a message whose ``request`` is an ``IpcqInitMsg``: install the
      neighbors and succeed the message's ``done`` event if it has one
      that is not yet triggered
    * a bare ``IpcqInitMsg``: install the neighbors
    * ``IpcqMetaArrival``: handled inline by ``_handle_meta_arrival``
    * ``IpcqRequest``: handled in a new process by ``_handle_request``
    * anything else: passed to ``_forward_txn`` in a new process
    """
    from kernbench.runtime_api.kernel import IpcqInitMsg

    while True:
        incoming: Any = yield self._inbox.get()

        # IpcqInitMsg may arrive wrapped in a transaction (with .request)
        # or directly.
        wrapped = getattr(incoming, "request", None)
        if isinstance(wrapped, IpcqInitMsg):
            self._install_neighbors(wrapped)
            completion = getattr(incoming, "done", None)
            if completion is not None and not completion.triggered:
                completion.succeed()
        elif isinstance(incoming, IpcqInitMsg):
            self._install_neighbors(incoming)
        elif isinstance(incoming, IpcqMetaArrival):
            self._handle_meta_arrival(incoming)
        elif isinstance(incoming, IpcqRequest):
            env.process(self._handle_request(env, incoming))
        else:
            # Unknown message: drop or forward via base class fallback.
            env.process(self._forward_txn(env, incoming))
|
||||||
|
|
||||||
|
# ── Init ──
|
||||||
|
|
||||||
|
def _install_neighbors(self, msg: IpcqInitMsg) -> None:
    """Install the neighbor table carried by an ``IpcqInitMsg``.

    Copies the global settings from the message, creates (or overwrites)
    one queue-pair state dict per entry with all four pointers reset to 0,
    makes sure waiter lists exist for each direction, and rebuilds the
    round-robin order.
    """
    self._installed = True
    self._buffer_kind = msg.buffer_kind
    self._backpressure_mode = msg.backpressure_mode
    self._credit_size_bytes = msg.credit_size_bytes

    for entry in msg.entries:
        direction = entry.direction
        self._queue_pairs[direction] = dict(
            peer=entry.peer,
            my_rx_base_pa=entry.my_rx_base_pa,
            my_rx_base_va=entry.my_rx_base_va,
            n_slots=entry.n_slots,
            slot_size=entry.slot_size,
            peer_credit_store=entry.peer_credit_store,
            my_head=0,
            my_tail=0,
            peer_head_cache=0,
            peer_tail_cache=0,
        )
        # Keep waiters that were registered before this (re)install.
        self._recv_waiters.setdefault(direction, [])
        self._send_waiters.setdefault(direction, [])

    # Reset round-robin order to a stable canonical sequence.
    self._rr_dirs = [name for name in _DIR_ORDER if name in self._queue_pairs]
    self._rr_cursor = 0
|
||||||
|
|
||||||
|
# ── Send ──
|
||||||
|
|
||||||
|
def _handle_request(self, env: simpy.Environment, req: IpcqRequest) -> Generator:
    """Run the send or recv handler that matches the request's command.

    A command that is neither ``IpcqSendCmd`` nor ``IpcqRecvCmd`` is
    ignored.
    """
    command = req.command
    if isinstance(command, IpcqSendCmd):
        handler = self._handle_send
    elif isinstance(command, IpcqRecvCmd):
        handler = self._handle_recv
    else:
        return
    yield from handler(env, req, command)
|
||||||
|
|
||||||
|
def _handle_send(
    self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqSendCmd,
) -> Generator:
    """Send one message into the peer's ring for ``cmd.direction``.

    Waits while the peer ring is full, reserves the next slot, builds an
    ``IpcqDmaToken`` addressed to that slot and puts it on the local
    PE_DMA port. ``req.done`` is succeeded once the token is handed over.

    The slot is reserved (``my_head`` advanced) before the first yield
    that follows the ring-full check. Each request runs in its own SimPy
    process, so advancing the head only after the put would let two
    concurrent sends on the same direction both pass the check and get
    the same slot address and ``sender_seq``.

    Raises:
        IpcqInvalidDirection: ``cmd.direction`` has no installed queue pair.
    """
    if cmd.direction not in self._queue_pairs:
        raise IpcqInvalidDirection(
            f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
        )
    qp = self._queue_pairs[cmd.direction]
    peer = qp["peer"]

    # Backpressure: wait while ring full
    while (qp["my_head"] - qp["peer_tail_cache"]) >= peer.n_slots:
        wait_event = env.event()
        self._send_waiters[cmd.direction].append(wait_event)
        yield wait_event

    # Reserve the slot atomically with the check above (no yield between).
    seq = qp["my_head"]
    qp["my_head"] = seq + 1

    # Compute peer slot address
    slot_idx = seq % peer.n_slots
    dst_pa = peer.rx_base_pa + slot_idx * peer.slot_size

    token = IpcqDmaToken(
        src_addr=cmd.src_addr,
        src_space=cmd.src_space,
        dst_addr=dst_pa,
        dst_endpoint=peer,
        nbytes=cmd.nbytes,
        handle_id=cmd.handle_id,
        shape=cmd.shape,
        dtype=cmd.dtype,
        # Carry the handle's recv-time data snapshot so the outbound
        # PE_DMA doesn't need to re-read from MemoryStore (which may
        # have been overwritten by a later inbound in the meantime).
        data=getattr(cmd, "data", None),
        sender_seq=seq,
        src_sip=self._self_sip,
        src_cube=self._self_cube,
        src_pe=self._self_pe,
        src_direction=cmd.direction,
    )

    # Forward to PE_DMA (vc_comm)
    yield self.out_ports[self._dma_node_id].put(token)

    # Diagnostics trace (D14)
    from kernbench.ccl import diagnostics
    if diagnostics.trace_enabled():
        diagnostics.log_send(
            t_ns=float(env.now), sender=self._pe_prefix,
            direction=cmd.direction, nbytes=cmd.nbytes,
            sender_seq=seq,
        )
    if not req.done.triggered:
        req.done.succeed()
|
||||||
|
|
||||||
|
# ── Recv ──
|
||||||
|
|
||||||
|
def _handle_recv(
    self, env: simpy.Environment, req: IpcqRequest, cmd: IpcqRecvCmd,
) -> Generator:
    """Receive one message, from ``cmd.direction`` or from any direction.

    Waits until the chosen direction has an unread slot, fills
    ``req.result_data`` with the location and layout of the data, advances
    ``my_tail``, launches the credit return and succeeds ``req.done``.

    In ``copy_to_dst`` mode (with a context attached) the result points at
    ``(dst_space, dst_addr)``. The slot → dst copy is performed here when no
    op logger is set, and recorded on the op logger otherwise. Errors from
    the copy and from the op-log record are swallowed.

    Raises:
        IpcqInvalidDirection: the requested direction is not installed.
        ValueError: strict validation is on and the arrived token's dtype,
            shape or nbytes differs from what the command expects.
    """
    requested = cmd.direction
    if requested is None:
        direction = yield from self._wait_any_direction(env)
    else:
        if requested not in self._queue_pairs:
            raise IpcqInvalidDirection(
                f"PE {self._pe_prefix}: direction {cmd.direction!r} not installed"
            )
        direction = requested
        pending_qp = self._queue_pairs[direction]
        # Block until the peer head has moved past our tail.
        while pending_qp["peer_head_cache"] <= pending_qp["my_tail"]:
            arrival = env.event()
            self._recv_waiters[direction].append(arrival)
            yield arrival

    qp = self._queue_pairs[direction]
    slot_idx = qp["my_tail"] % qp["n_slots"]
    slot_addr = qp["my_rx_base_pa"] + slot_idx * qp["slot_size"]

    # Strict validation (D14 F2): peek the next-arrived token's metadata
    # against the recv command's expected shape/dtype/nbytes. The token is
    # consumed from the arrival list whether or not strict mode is on.
    arrived = self._arrived_tokens.get(direction, [])
    if arrived:
        front = arrived.pop(0)
        if self._strict:
            expected_nbytes = self._nbytes_for(cmd.shape, cmd.dtype)
            if front.dtype != cmd.dtype:
                raise ValueError(
                    f"PE_IPCQ {self._pe_prefix} recv strict: dtype mismatch — "
                    f"sender={front.dtype} recv={cmd.dtype}"
                )
            if front.shape != cmd.shape:
                raise ValueError(
                    f"PE_IPCQ {self._pe_prefix} recv strict: shape mismatch — "
                    f"sender={front.shape} recv={cmd.shape}"
                )
            if front.nbytes != expected_nbytes:
                raise ValueError(
                    f"PE_IPCQ {self._pe_prefix} recv strict: nbytes mismatch — "
                    f"sender={front.nbytes} recv={expected_nbytes}"
                )

    result = req.result_data
    result["src_space"] = self._buffer_kind
    result["src_addr"] = slot_addr
    result["direction"] = direction
    result["dtype"] = cmd.dtype
    result["shape"] = cmd.shape
    result["nbytes"] = self._nbytes_for(cmd.shape, cmd.dtype)

    # copy_to_dst mode: rebind the result handle to (dst_space, dst_addr).
    # When op_log is disabled, the actual data move also happens now; when
    # op_log is enabled, Phase 2 replays the slot→dst copy from the op_log
    # entry recorded below, so the slot is not polluted in Phase 1.
    if cmd.recv_mode == "copy_to_dst" and self.ctx is not None:
        result["src_space"] = cmd.dst_space
        result["src_addr"] = cmd.dst_addr
        memory = getattr(self.ctx, "memory_store", None)
        if memory is not None and self._op_logger is None:
            try:
                payload = memory.read(
                    self._buffer_kind, slot_addr, shape=cmd.shape, dtype=cmd.dtype,
                )
                memory.write(cmd.dst_space, cmd.dst_addr, payload)
            except Exception:
                pass
        if self._op_logger is not None:
            # Record slot → dst copy for Phase 2 replay (ADR-0023 D9.5).
            try:
                self._op_logger.record_copy(
                    t_start=float(env.now), t_end=float(env.now),
                    component_id=self.node.id,
                    src_space=self._buffer_kind, src_addr=slot_addr,
                    dst_space=cmd.dst_space, dst_addr=cmd.dst_addr,
                    shape=cmd.shape, dtype=cmd.dtype,
                    nbytes=self._nbytes_for(cmd.shape, cmd.dtype),
                )
            except Exception:
                pass

    qp["my_tail"] += 1

    # Diagnostics trace (D14)
    from kernbench.ccl import diagnostics
    if diagnostics.trace_enabled():
        diagnostics.log_recv(
            t_ns=float(env.now), receiver=self._pe_prefix,
            direction=direction,
            nbytes=result.get("nbytes", 0),
        )

    # Fast path credit return: bottleneck BW based latency
    credit_process = self._delayed_credit_send(
        env, direction, qp["peer_credit_store"], qp["my_tail"],
    )
    env.process(credit_process)

    if not req.done.triggered:
        req.done.succeed()
|
||||||
|
|
||||||
|
def _wait_any_direction(self, env: simpy.Environment) -> Generator:
    """Pick the next direction with unread data, blocking until one has some.

    Directions in ``self._rr_dirs`` are scanned round-robin, starting at
    ``self._rr_cursor``. A direction counts as ready when its queue pair's
    ``peer_head_cache`` is ahead of ``my_tail``. If nothing is ready, a new
    event is parked on ``self._any_recv_waiters`` and the scan is repeated
    after that event fires.

    Args:
        env: Simulation environment used to create the wait event.

    Returns:
        The chosen direction name, delivered as the generator's return value.

    Raises:
        IpcqInvalidDirection: If no directions are installed on entry.
    """
    if not self._rr_dirs:
        raise IpcqInvalidDirection(
            f"PE {self._pe_prefix}: no neighbors installed"
        )
    while True:
        # The list length is re-read on every pass because the scan may
        # resume after an arbitrary wait.
        count = len(self._rr_dirs)
        first = self._rr_cursor
        for step in range(count):
            slot = (first + step) % count
            candidate = self._rr_dirs[slot]
            pair = self._queue_pairs[candidate]
            if pair["peer_head_cache"] <= pair["my_tail"]:
                continue
            # Start the next scan just past the winner so that no single
            # direction is always served first.
            self._rr_cursor = (slot + 1) % count
            return candidate
        # Nothing available — park until any arrival wakes us.
        parked = env.event()
        self._any_recv_waiters.append(parked)
        yield parked
|
||||||
|
|
||||||
|
# ── Metadata arrival from PE_DMA (D9) ──
|
||||||
|
|
||||||
|
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
    """Match an arrival to its direction by dst_addr range (ADR-0025 D2).

    Each direction owns the rx address range
    ``[my_rx_base_pa, my_rx_base_pa + n_slots * slot_size)``. The token's
    ``dst_addr`` is compared against every range; the first direction that
    contains it is updated:

    * ``peer_head_cache`` is raised to ``sender_seq + 1`` (never lowered),
    * the token is appended to ``self._arrived_tokens[direction]``,
    * every pending waiter for that direction and every any-direction
      waiter is fired.

    Address-based matching stays unambiguous when several directions share
    the same peer (2-rank bidirectional ring).

    Args:
        msg: Arrival notification; only ``msg.token.dst_addr`` and
            ``msg.token.sender_seq`` are read here.

    An arrival whose ``dst_addr`` lies in no installed range is dropped,
    and a warning is logged so the lost wake-up can be traced.
    """
    import logging

    token = msg.token
    dst_addr = token.dst_addr
    for d, qp in self._queue_pairs.items():
        base = qp["my_rx_base_pa"]
        size = qp["n_slots"] * qp["slot_size"]
        if not (base <= dst_addr < base + size):
            continue

        # max() keeps the cached head monotonic if tokens are seen out of
        # sequence order.
        qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
        # Track arrived token for strict-mode peek
        self._arrived_tokens.setdefault(d, []).append(token)

        # Swap each waiter list out before firing so that anything
        # registered while the events fire lands on the fresh list.
        waiters = self._recv_waiters.get(d, [])
        self._recv_waiters[d] = []
        for ev in waiters:
            if not ev.triggered:
                ev.succeed()

        any_waiters = self._any_recv_waiters
        self._any_recv_waiters = []
        for ev in any_waiters:
            if not ev.triggered:
                ev.succeed()
        return

    # Unknown dst_addr: still dropped, but no longer silently.
    logging.getLogger(__name__).warning(
        "PE_IPCQ %s: dropping meta arrival with unmatched dst_addr=%r",
        self._pe_prefix, dst_addr,
    )
|
||||||
|
|
||||||
|
# ── Credit return (fast path) ──
|
||||||
|
|
||||||
|
def _credit_worker(self, env: simpy.Environment) -> Generator:
    """Process IpcqCreditMetadata items taken from ``self._credit_inbox``.

    Runs forever. Each credit is matched to the queue pair whose
    ``qp["peer"].rx_base_pa`` equals ``credit.dst_rx_base_pa`` (ADR-0025
    D3), which stays unambiguous when several directions share one peer
    (2-rank bidirectional ring). For the matching direction:

    * ``peer_tail_cache`` is raised to ``credit.consumer_seq`` (never
      lowered),
    * every pending send waiter for that direction is fired.

    A credit that matches no queue pair is dropped, and a warning is
    logged so the stalled sender can be traced.

    Args:
        env: Simulation environment (not used directly in this loop).
    """
    import logging

    assert self._credit_inbox is not None
    while True:
        credit: IpcqCreditMetadata = yield self._credit_inbox.get()
        for d, qp in self._queue_pairs.items():
            if qp["peer"].rx_base_pa != credit.dst_rx_base_pa:
                continue
            # max() keeps the cached tail monotonic if a stale credit
            # arrives late.
            qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq)
            # Wake any blocked send on this direction; swap the list first
            # so that waiters added while firing land on the fresh list.
            waiters = self._send_waiters.get(d, [])
            self._send_waiters[d] = []
            for ev in waiters:
                if not ev.triggered:
                    ev.succeed()
            break
        else:
            logging.getLogger(__name__).warning(
                "PE_IPCQ %s: dropping credit with unmatched dst_rx_base_pa=%r",
                self._pe_prefix, credit.dst_rx_base_pa,
            )
|
||||||
|
|
||||||
|
def _delayed_credit_send(
    self,
    env: simpy.Environment,
    direction: str,
    peer_credit_store: simpy.Store,
    new_tail: int,
) -> Generator:
    """Deliver a credit to the peer after the fast-path latency (D9).

    Waits ``self._credit_latency_ns(direction)`` nanoseconds when that value
    is positive, then puts an ``IpcqCreditMetadata`` into
    ``peer_credit_store``.

    The credit carries ``dst_rx_base_pa`` = this PE's ``my_rx_base_pa`` for
    the consumed direction. The peer (the original sender) compares that
    value with its ``qp.peer.rx_base_pa`` to find the right queue pair
    (ADR-0025 D3).

    Args:
        env: Simulation environment used for the delay.
        direction: Direction whose slot was consumed.
        peer_credit_store: Store the credit is put into.
        new_tail: Consumer sequence number reported to the peer.
    """
    delay_ns = self._credit_latency_ns(direction)
    if delay_ns > 0:
        yield env.timeout(delay_ns)

    # Looked up after the delay, as in the original statement order.
    rx_base = self._queue_pairs[direction]["my_rx_base_pa"]
    credit = IpcqCreditMetadata(
        consumer_seq=new_tail,
        dst_rx_base_pa=rx_base,
        src_sip=self._self_sip,
        src_cube=self._self_cube,
        src_pe=self._self_pe,
        src_direction=direction,
    )
    yield peer_credit_store.put(credit)
|
||||||
|
|
||||||
|
def _credit_latency_ns(self, direction: str) -> float:
    """Return the credit fast-path latency, in nanoseconds, for a direction.

    The path from this PE to the direction's peer is looked up with
    ``self.ctx.router.find_path`` and handed, together with
    ``self._credit_size_bytes``, to ``self.ctx.compute_drain_ns`` (intended
    as credit_size / bottleneck_bw).

    Args:
        direction: Key into ``self._queue_pairs`` that identifies the peer.

    Returns:
        The computed latency, or ``0.0`` when ``self.ctx`` is ``None``
        (unit-test mode) or when the path lookup or drain computation
        raises.
    """
    import logging

    if self.ctx is None:
        return 0.0
    qp = self._queue_pairs[direction]
    peer = qp["peer"]
    peer_pe_prefix = f"sip{peer.sip}.cube{peer.cube}.pe{peer.pe}"
    try:
        path = self.ctx.router.find_path(self._pe_prefix, peer_pe_prefix)
        return self.ctx.compute_drain_ns(path, self._credit_size_bytes)
    except Exception:
        # Deliberate best-effort: a missing router or path must not stop
        # the simulation. The failure is logged so that a zero-latency
        # credit caused by a routing error can still be found.
        logging.getLogger(__name__).debug(
            "PE_IPCQ %s: credit latency to %s unavailable, using 0 ns",
            self._pe_prefix, peer_pe_prefix, exc_info=True,
        )
        return 0.0
|
||||||
|
|
||||||
|
# ── Helpers ──
|
||||||
|
|
||||||
|
@staticmethod
def _nbytes_for(shape: tuple[int, ...], dtype: str) -> int:
    """Return the byte size of a tensor with the given shape and dtype.

    Args:
        shape: Tensor dimensions. An empty (or otherwise falsy) shape
            yields 0.
        dtype: Short dtype name such as ``"f16"`` or ``"i32"``.

    Returns:
        ``prod(shape)`` multiplied by the element size in bytes. Dtype names
        missing from the table are sized as 16-bit elements.
    """
    from math import prod

    if not shape:
        return 0
    # 64-bit and unsigned names are listed so they are not mis-sized by
    # the 16-bit fallback.
    bits_per_element = {
        "f16": 16, "bf16": 16, "f32": 32, "f64": 64,
        "i8": 8, "i16": 16, "i32": 32, "i64": 64,
        "u8": 8, "u16": 16, "u32": 32, "u64": 64,
    }.get(dtype, 16)
    return prod(shape) * (bits_per_element // 8)
|
||||||
@@ -1,6 +1,16 @@
|
|||||||
|
"""PE_MATH: element-wise / reduction computation engine (ADR-0021 D6).
|
||||||
|
|
||||||
|
Handles both legacy PeInternalTxn (MathCmd) and pipeline TileToken.
|
||||||
|
In pipeline mode, receives token after fetch stage, computes SIMD, chains to next.
|
||||||
|
|
||||||
|
SIMD latency model (from pe_accel):
|
||||||
|
cycles = ceil(num_elements / vector_width)
|
||||||
|
latency_ns = cycles / clock_freq_ghz
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
|
from math import ceil
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
import simpy
|
import simpy
|
||||||
@@ -14,15 +24,17 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class PeMathComponent(PeEngineBase):
|
class PeMathComponent(PeEngineBase):
|
||||||
"""PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).
|
"""PE_MATH: SIMD/Vector unit (ADR-0021 D6).
|
||||||
|
|
||||||
Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
|
In pipeline mode: pure compute — register data already fetched.
|
||||||
exclusive with PE_GEMM within the same PE.
|
In legacy mode: handles PeInternalTxn(MathCmd) with shared accel_slot.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
|
||||||
super().__init__(node, ctx)
|
super().__init__(node, ctx)
|
||||||
self._accel: simpy.Resource | None = None
|
self._accel: simpy.Resource | None = None
|
||||||
|
self._vector_width: int = int(node.attrs.get("vector_width", 256))
|
||||||
|
self._clock_freq: float = float(node.attrs.get("clock_freq_ghz", 1.0))
|
||||||
|
|
||||||
def init_resources(self, env: simpy.Environment) -> None:
|
def init_resources(self, env: simpy.Environment) -> None:
|
||||||
resource_name = self.node.attrs.get("shared_resource")
|
resource_name = self.node.attrs.get("shared_resource")
|
||||||
@@ -31,21 +43,83 @@ class PeMathComponent(PeEngineBase):
|
|||||||
env, f"{self._pe_prefix}.{resource_name}"
|
env, f"{self._pe_prefix}.{resource_name}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _compute_ns(self, num_elements: int) -> float:
    """Return the SIMD latency in nanoseconds (pe_accel model).

    ``ceil(num_elements / vector_width)`` cycles are divided by the clock
    frequency. If the vector width, the clock frequency, or the element
    count is not positive, the node's ``overhead_ns`` attribute (default
    0.0) is returned instead.

    Args:
        num_elements: Number of elements to process.

    Returns:
        Latency in nanoseconds.
    """
    width = self._vector_width
    freq_ghz = self._clock_freq
    if width <= 0 or freq_ghz <= 0 or num_elements <= 0:
        # Cycle model not applicable: fall back to the fixed overhead.
        return float(self.node.attrs.get("overhead_ns", 0.0))
    return ceil(num_elements / width) / freq_ghz
|
||||||
|
|
||||||
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
|
||||||
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
||||||
yield env.timeout(overhead_ns)
|
yield env.timeout(overhead_ns)
|
||||||
|
|
||||||
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Take messages from the inbox and start one process per message.

    * ``TileToken`` goes to ``_pipeline_process`` (pipeline mode),
    * ``PeInternalTxn`` goes to ``_handle_with_hooks`` (legacy command),
    * anything else goes to ``_forward_txn``.

    Each handler is started with ``env.process``, so this loop does not
    wait for it to finish before taking the next message.
    """
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.builtin.pe_types import TileToken

    while True:
        msg: Any = yield self._inbox.get()
        if isinstance(msg, TileToken):
            handler = self._pipeline_process
        elif isinstance(msg, PeInternalTxn):
            handler = self._handle_with_hooks
        else:
            handler = self._forward_txn
        env.process(handler(env, msg))
|
||||||
|
|
||||||
|
def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
|
||||||
|
"""Pipeline mode: pure SIMD compute, then self-route."""
|
||||||
|
self._on_process_start(env, token)
|
||||||
|
|
||||||
|
num_elements = token.params.get("num_elements", 0)
|
||||||
|
|
||||||
if self._accel:
|
if self._accel:
|
||||||
with self._accel.request() as req:
|
with self._accel.request() as req:
|
||||||
yield req
|
yield req
|
||||||
yield from self.run(env, 0)
|
ns = self._compute_ns(num_elements)
|
||||||
|
yield env.timeout(ns)
|
||||||
else:
|
else:
|
||||||
yield from self.run(env, 0)
|
ns = self._compute_ns(num_elements)
|
||||||
|
yield env.timeout(ns)
|
||||||
|
|
||||||
|
self._on_process_end(env, token)
|
||||||
|
|
||||||
|
# Self-routing
|
||||||
|
next_stage = token.advance()
|
||||||
|
if next_stage is not None:
|
||||||
|
yield self.out_ports[next_stage.component].put(token)
|
||||||
|
else:
|
||||||
|
token.pipeline_ctx.complete_tile()
|
||||||
|
|
||||||
|
def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
    """Handle a standalone PeInternalTxn such as a MathCmd (CCL kernels).

    Latency = max(overhead_ns, _compute_ns(num_elements)):
    - overhead_ns: fixed per-invocation setup cost (from node attrs).
    - _compute_ns: SIMD cycle-based model (from vector_width + clock_freq).
    The larger of the two dominates (setup-bound vs compute-bound).

    ``num_elements`` is the product of ``cmd.out.shape`` when the command
    is a ``MathCmd`` with a non-empty output shape, and 0 otherwise. When a
    shared accel resource is configured it is held for the whole delay.
    ``pe_txn.done`` is succeeded at the end.
    """
    from kernbench.common.pe_commands import MathCmd
    import math as _math

    command = pe_txn.command
    element_count = 0
    if isinstance(command, MathCmd) and command.out.shape:
        element_count = _math.prod(command.out.shape)

    setup_ns = float(self.node.attrs.get("overhead_ns", 0.0))
    latency_ns = max(setup_ns, self._compute_ns(element_count))

    if not self._accel:
        yield env.timeout(latency_ns)
    else:
        # Hold the shared slot for the whole delay.
        with self._accel.request() as slot:
            yield slot
            yield env.timeout(latency_ns)
    pe_txn.done.succeed()
|
||||||
|
|
||||||
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
"""Transaction forwarding with accel_slot acquisition."""
|
|
||||||
if self._accel:
|
if self._accel:
|
||||||
with self._accel.request() as req:
|
with self._accel.request() as req:
|
||||||
yield req
|
yield req
|
||||||
|
|||||||
@@ -1,3 +1,13 @@
|
|||||||
|
"""PE_SCHEDULER: plan generation + tile dispatch (ADR-0021 D2).
|
||||||
|
|
||||||
|
Receives PeInternalTxn from PE_CPU, routes to engines:
|
||||||
|
- Simple commands (DmaReadCmd, GemmCmd, etc.) → direct dispatch to engine
|
||||||
|
- CompositeCmd → generate TilePlan, feed tiles via _feed_loop
|
||||||
|
|
||||||
|
Composite pipeline uses token self-routing (ADR-0021 D4):
|
||||||
|
Scheduler only does initial dispatch + completion tracking.
|
||||||
|
Tiles chain through components based on their plan's stage sequence.
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
@@ -14,29 +24,18 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class PeSchedulerComponent(ComponentBase):
|
class PeSchedulerComponent(ComponentBase):
|
||||||
"""PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
|
"""PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1, ADR-0021 D2).
|
||||||
|
|
||||||
Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
|
Simple commands are forwarded to the appropriate engine.
|
||||||
- DmaReadCmd / DmaWriteCmd → PE_DMA
|
CompositeCmd creates a TilePlan and feeds tiles into the pipeline.
|
||||||
- GemmCmd → PE_GEMM
|
|
||||||
- MathCmd → PE_MATH
|
|
||||||
- CompositeCmd → tiled pipeline (Stage 3: ADR-0014 D3.2)
|
|
||||||
|
|
||||||
Composite GEMM pipeline (32x64x32 tiles):
|
Single _feed_loop process per scheduler ensures FIFO command ordering.
|
||||||
DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
|
|
||||||
with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
|
|
||||||
|
|
||||||
Applies scheduler overhead_ns before dispatching each command.
|
|
||||||
Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Scheduler tile dimensions (ADR-0014 D3.2)
|
|
||||||
TILE_M = 32
|
TILE_M = 32
|
||||||
TILE_K = 64
|
TILE_K = 64
|
||||||
TILE_N = 32
|
TILE_N = 32
|
||||||
|
|
||||||
# Command → engine suffix dispatch table.
|
|
||||||
# New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
|
|
||||||
_CMD_DISPATCH: dict[type, str] = {}
|
_CMD_DISPATCH: dict[type, str] = {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -44,7 +43,6 @@ class PeSchedulerComponent(ComponentBase):
|
|||||||
if cls._CMD_DISPATCH:
|
if cls._CMD_DISPATCH:
|
||||||
return
|
return
|
||||||
from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
|
from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
|
||||||
|
|
||||||
cls._CMD_DISPATCH = {
|
cls._CMD_DISPATCH = {
|
||||||
DmaReadCmd: "pe_dma",
|
DmaReadCmd: "pe_dma",
|
||||||
DmaWriteCmd: "pe_dma",
|
DmaWriteCmd: "pe_dma",
|
||||||
@@ -56,6 +54,13 @@ class PeSchedulerComponent(ComponentBase):
|
|||||||
super().__init__(node, ctx)
|
super().__init__(node, ctx)
|
||||||
self._pe_prefix = node.id.rsplit(".", 1)[0]
|
self._pe_prefix = node.id.rsplit(".", 1)[0]
|
||||||
self._ensure_dispatch_table()
|
self._ensure_dispatch_table()
|
||||||
|
self._pending_feeds: simpy.Store | None = None
|
||||||
|
self._pipeline_counter = 0
|
||||||
|
|
||||||
|
def start(self, env: simpy.Environment) -> None:
    """Create the feed queue, start the base component, launch the feeder.

    Args:
        env: Simulation environment that owns the queue and the processes.
    """
    # The queue is created before the feeder process is launched, because
    # ``_feed_loop`` reads from it.
    feed_queue = simpy.Store(env)
    self._pending_feeds = feed_queue
    super().start(env)
    env.process(self._feed_loop(env))
|
||||||
|
|
||||||
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
|
||||||
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
||||||
@@ -72,174 +77,103 @@ class PeSchedulerComponent(ComponentBase):
|
|||||||
yield from self._forward_txn(env, msg)
|
yield from self._forward_txn(env, msg)
|
||||||
|
|
||||||
def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
||||||
"""Route a PeInternalTxn to the correct engine via dispatch table."""
|
from kernbench.common.pe_commands import CompositeCmd, PeCpuOverheadCmd
|
||||||
from kernbench.common.pe_commands import CompositeCmd
|
|
||||||
|
|
||||||
# Scheduler overhead
|
yield from self.run(env, 0) # scheduler overhead
|
||||||
yield from self.run(env, 0)
|
|
||||||
|
|
||||||
cmd = pe_txn.command
|
cmd = pe_txn.command
|
||||||
|
|
||||||
# Check dispatch table first
|
# Simple command dispatch
|
||||||
engine_suffix = self._CMD_DISPATCH.get(type(cmd))
|
engine_suffix = self._CMD_DISPATCH.get(type(cmd))
|
||||||
if engine_suffix is not None:
|
if engine_suffix is not None:
|
||||||
yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
|
yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
|
||||||
return
|
return
|
||||||
|
|
||||||
# CompositeCmd: tiled pipeline (not a simple forward)
|
# CompositeCmd: generate plan and feed
|
||||||
if isinstance(cmd, CompositeCmd):
|
if isinstance(cmd, CompositeCmd):
|
||||||
yield from self._dispatch_composite(env, pe_txn)
|
yield from self._dispatch_composite(env, pe_txn, cmd)
|
||||||
|
return
|
||||||
|
|
||||||
|
if isinstance(cmd, PeCpuOverheadCmd):
|
||||||
|
yield env.timeout(cmd.cycles)
|
||||||
|
pe_txn.done.succeed()
|
||||||
return
|
return
|
||||||
|
|
||||||
# Unknown command — signal done immediately
|
|
||||||
pe_txn.done.succeed()
|
pe_txn.done.succeed()
|
||||||
|
|
||||||
def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
|
def _dispatch_composite(
|
||||||
"""Composite tiled pipeline (ADR-0014 D3.2).
|
self, env: simpy.Environment, pe_txn: Any, cmd: Any,
|
||||||
|
) -> Generator:
|
||||||
|
"""Generate plan and enqueue to feeder. Non-blocking (ADR-0021 D4)."""
|
||||||
|
from kernbench.components.builtin.pe_types import PipelineContext
|
||||||
|
|
||||||
GEMM: 3-stage pipeline with b-tile streaming from HBM.
|
plan = self._generate_plan(cmd)
|
||||||
MATH: sequential compute + DMA_WRITE (no tiling).
|
|
||||||
|
self._pipeline_counter += 1
|
||||||
|
ctx = PipelineContext(
|
||||||
|
id=f"p{self._pipeline_counter}",
|
||||||
|
total_tiles=len(plan.tiles),
|
||||||
|
done_event=pe_txn.done,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Enqueue to feeder — scheduler worker returns immediately
|
||||||
|
assert self._pending_feeds is not None
|
||||||
|
yield self._pending_feeds.put((plan, ctx))
|
||||||
|
|
||||||
|
def _feed_loop(self, env: simpy.Environment) -> Generator:
    """Single feeder process: FIFO command ordering (ADR-0021 D2).

    Takes ``(plan, ctx)`` pairs from ``self._pending_feeds`` one at a time.
    For every tile in the plan it builds a ``TileToken`` positioned at stage
    0 and puts it on the out port of that first stage's component.

    No tile feed interleaving between commands.
    Queue full → only this process blocks.

    A plan with no tiles is completed at once. No token would ever call
    ``ctx.complete_tile()`` for it, so without this the pipeline's done
    event would never fire and the issuer would wait forever.
    """
    from kernbench.components.builtin.pe_types import TileToken

    assert self._pending_feeds is not None
    while True:
        plan, ctx = yield self._pending_feeds.get()

        if not plan.tiles:
            # NOTE(review): whether the plan generators can return zero
            # tiles is not visible here; this guards against the hang.
            done_event = ctx.done_event
            if done_event is not None and not done_event.triggered:
                done_event.succeed()
            continue

        for tile in plan.tiles:
            first_stage = tile.stages[0]
            token = TileToken(
                tile_id=tile.tile_id,
                pipeline_ctx=ctx,
                plan=tile,
                stage_idx=0,
                params=first_stage.params,
            )
            # May block when the target inbox is full; only this feeder
            # waits.
            yield self.out_ports[first_stage.component].put(token)
|
||||||
|
|
||||||
|
def _generate_plan(self, cmd: Any) -> Any:
|
||||||
|
"""Generate a PipelinePlan from CompositeCmd."""
|
||||||
|
from kernbench.components.builtin.tiling import (
|
||||||
|
generate_gemm_plan,
|
||||||
|
generate_math_plan,
|
||||||
|
)
|
||||||
|
|
||||||
|
pp = self._pe_prefix
|
||||||
|
bpe = 2 # default bytes per element (f16)
|
||||||
|
|
||||||
cmd = pe_txn.command
|
|
||||||
assert isinstance(cmd, CompositeCmd)
|
|
||||||
if cmd.op == "gemm" and cmd.b is not None:
|
if cmd.op == "gemm" and cmd.b is not None:
|
||||||
yield from self._pipeline_gemm(env, pe_txn, cmd)
|
a = cmd.a
|
||||||
|
b = cmd.b
|
||||||
|
M, K = a.shape[-2], a.shape[-1]
|
||||||
|
N = b.shape[-1]
|
||||||
|
return generate_gemm_plan(
|
||||||
|
M=M, K=K, N=N,
|
||||||
|
tile_m=self.TILE_M, tile_k=self.TILE_K, tile_n=self.TILE_N,
|
||||||
|
bytes_per_element=bpe,
|
||||||
|
A_addr=a.addr, B_addr=b.addr, C_addr=cmd.out_addr,
|
||||||
|
pe_prefix=pp,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
yield from self._pipeline_math(env, pe_txn, cmd)
|
# Math composite
|
||||||
|
a = cmd.a
|
||||||
def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
|
M = a.shape[-2] if len(a.shape) >= 2 else a.shape[0]
|
||||||
"""Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.
|
N = a.shape[-1] if len(a.shape) >= 2 else 1
|
||||||
|
return generate_math_plan(
|
||||||
Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
|
M=M, N=N,
|
||||||
Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)
|
tile_m=self.TILE_M, tile_n=self.TILE_N,
|
||||||
Overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
|
bytes_per_element=bpe,
|
||||||
"""
|
math_op=cmd.math_op or "identity",
|
||||||
from kernbench.common.pe_commands import (
|
src_addr=a.addr, dst_addr=cmd.out_addr,
|
||||||
DmaReadCmd,
|
pe_prefix=pp,
|
||||||
DmaWriteCmd,
|
|
||||||
GemmCmd,
|
|
||||||
PeInternalTxn as PeTxn,
|
|
||||||
TensorHandle,
|
|
||||||
)
|
|
||||||
|
|
||||||
pp = self._pe_prefix
|
|
||||||
a = cmd.a # already in TCM
|
|
||||||
b = cmd.b # HBM reference (via tl.ref)
|
|
||||||
|
|
||||||
M, K_a = a.shape[-2], a.shape[-1]
|
|
||||||
K_b, N = b.shape[-2], b.shape[-1]
|
|
||||||
dtype = a.dtype
|
|
||||||
dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2
|
|
||||||
|
|
||||||
# Tile counts
|
|
||||||
n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
|
|
||||||
n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
|
|
||||||
n_tiles = n_tiles_k * n_tiles_n
|
|
||||||
|
|
||||||
prev_compute_done = None
|
|
||||||
prev_write_done = None
|
|
||||||
total_dma_ns = 0.0
|
|
||||||
total_compute_ns = 0.0
|
|
||||||
|
|
||||||
for tile_idx in range(n_tiles):
|
|
||||||
tk = tile_idx // n_tiles_n
|
|
||||||
tn = tile_idx % n_tiles_n
|
|
||||||
|
|
||||||
k_start = tk * self.TILE_K
|
|
||||||
n_start = tn * self.TILE_N
|
|
||||||
tile_k = min(self.TILE_K, K_a - k_start)
|
|
||||||
tile_n = min(self.TILE_N, N - n_start)
|
|
||||||
tile_nbytes = tile_k * tile_n * dtype_bytes
|
|
||||||
|
|
||||||
# --- Stage 1: DMA_READ b_tile from HBM ---
|
|
||||||
read_done = env.event()
|
|
||||||
b_tile_addr = b.addr + (k_start * N + n_start) * dtype_bytes
|
|
||||||
b_tile_handle = TensorHandle(
|
|
||||||
id=f"b_tile_{tile_idx}", addr=b_tile_addr,
|
|
||||||
shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
|
|
||||||
)
|
)
|
||||||
read_cmd = DmaReadCmd(handle=b_tile_handle, src_addr=b_tile_addr, nbytes=tile_nbytes)
|
|
||||||
read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
|
|
||||||
t0 = env.now
|
|
||||||
yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)
|
|
||||||
|
|
||||||
# Wait for previous compute before starting this tile's compute
|
|
||||||
if prev_compute_done is not None:
|
|
||||||
yield prev_compute_done
|
|
||||||
|
|
||||||
# Wait for this tile's DMA_READ
|
|
||||||
yield read_done
|
|
||||||
total_dma_ns += env.now - t0
|
|
||||||
|
|
||||||
# --- Stage 2: COMPUTE (GEMM) ---
|
|
||||||
compute_done = env.event()
|
|
||||||
out_handle = TensorHandle(
|
|
||||||
id=f"out_tile_{tile_idx}", addr=0,
|
|
||||||
shape=(M, tile_n), dtype=dtype,
|
|
||||||
nbytes=M * tile_n * dtype_bytes,
|
|
||||||
)
|
|
||||||
compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
|
|
||||||
m=M, k=tile_k, n=tile_n)
|
|
||||||
compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
|
|
||||||
t0 = env.now
|
|
||||||
yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)
|
|
||||||
|
|
||||||
# Wait for previous write (DMA_WRITE serialization)
|
|
||||||
if prev_write_done is not None:
|
|
||||||
yield prev_write_done
|
|
||||||
|
|
||||||
# Wait for compute of THIS tile
|
|
||||||
yield compute_done
|
|
||||||
total_compute_ns += env.now - t0
|
|
||||||
prev_compute_done = compute_done
|
|
||||||
|
|
||||||
# --- Stage 3: DMA_WRITE out_tile to HBM ---
|
|
||||||
write_done = env.event()
|
|
||||||
out_tile_pa = cmd.out_addr + n_start * dtype_bytes
|
|
||||||
write_nbytes = M * tile_n * dtype_bytes
|
|
||||||
write_cmd = DmaWriteCmd(handle=out_handle, dst_addr=out_tile_pa, nbytes=write_nbytes)
|
|
||||||
write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
|
|
||||||
t0 = env.now
|
|
||||||
yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
|
|
||||||
prev_write_done = write_done
|
|
||||||
|
|
||||||
# Wait for final write
|
|
||||||
if prev_write_done is not None:
|
|
||||||
t0 = env.now
|
|
||||||
yield prev_write_done
|
|
||||||
total_dma_ns += env.now - t0
|
|
||||||
|
|
||||||
pe_txn.result_data["dma_ns"] = total_dma_ns
|
|
||||||
pe_txn.result_data["compute_ns"] = total_compute_ns
|
|
||||||
pe_txn.done.succeed()
|
|
||||||
|
|
||||||
def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
|
|
||||||
"""Non-GEMM composite: sequential compute + DMA_WRITE (no tiling)."""
|
|
||||||
from kernbench.common.pe_commands import (
|
|
||||||
DmaWriteCmd,
|
|
||||||
MathCmd,
|
|
||||||
PeInternalTxn as PeTxn,
|
|
||||||
)
|
|
||||||
|
|
||||||
pp = self._pe_prefix
|
|
||||||
|
|
||||||
# Step 1: Compute (MATH)
|
|
||||||
compute_done = env.event()
|
|
||||||
compute_cmd = MathCmd(
|
|
||||||
op=cmd.math_op or "identity",
|
|
||||||
inputs=(cmd.a,), out=cmd.a,
|
|
||||||
)
|
|
||||||
compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
|
|
||||||
yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
|
|
||||||
yield compute_done
|
|
||||||
|
|
||||||
# Step 2: DMA_WRITE result to HBM
|
|
||||||
write_done = env.event()
|
|
||||||
write_cmd = DmaWriteCmd(handle=cmd.a, dst_addr=cmd.out_addr, nbytes=cmd.out_nbytes)
|
|
||||||
write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
|
|
||||||
yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
|
|
||||||
yield write_done
|
|
||||||
|
|
||||||
pe_txn.done.succeed()
|
|
||||||
|
|||||||
@@ -1,7 +1,18 @@
|
|||||||
|
"""PE_TCM: tightly-coupled memory with BW-based access serialization (ADR-0021).
|
||||||
|
|
||||||
|
Models scratchpad memory inside the PE. Handles both legacy Transaction forwarding
|
||||||
|
and TcmRequest from PE_FETCH_STORE for BW-serialized read/write access.
|
||||||
|
|
||||||
|
Two channels (read/write) with independent serialization.
|
||||||
|
Ported from pe_accel TcmBlock timing model.
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from typing import TYPE_CHECKING
|
from dataclasses import dataclass
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
from kernbench.components.base import ComponentBase
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
@@ -10,16 +21,62 @@ if TYPE_CHECKING:
|
|||||||
from kernbench.topology.types import Node
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
class PeTcmComponent(ComponentBase):
|
@dataclass
|
||||||
"""PE_TCM: tightly-coupled memory / local SRAM staging buffer.
|
class TcmRequest:
|
||||||
|
"""Request to read from or write to TCM (used by PE_FETCH_STORE)."""
|
||||||
|
|
||||||
Terminal storage component for PE-internal dataflow (ADR-0014 D5).
|
direction: str # "read" or "write"
|
||||||
Phase 0: applies overhead_ns and drain_ns at terminal.
|
nbytes: int
|
||||||
|
done: simpy.Event
|
||||||
|
tag: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
class PeTcmComponent(ComponentBase):
|
||||||
|
"""PE_TCM: BW-serialized scratchpad memory (ADR-0021 D1).
|
||||||
|
|
||||||
|
Dual-channel: read and write can proceed in parallel,
|
||||||
|
but concurrent reads serialize, concurrent writes serialize.
|
||||||
|
BW from topology attrs or pe_template links.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
|
||||||
super().__init__(node, ctx)
|
super().__init__(node, ctx)
|
||||||
|
self._read_bw: float = float(node.attrs.get("read_bw_gbs", 512.0))
|
||||||
|
self._write_bw: float = float(node.attrs.get("write_bw_gbs", 512.0))
|
||||||
|
self._read_res: simpy.Resource | None = None
|
||||||
|
self._write_res: simpy.Resource | None = None
|
||||||
|
|
||||||
def run(self, env, nbytes: int) -> Generator:
|
def start(self, env: simpy.Environment) -> None:
|
||||||
|
self._read_res = simpy.Resource(env, capacity=1)
|
||||||
|
self._write_res = simpy.Resource(env, capacity=1)
|
||||||
|
super().start(env)
|
||||||
|
|
||||||
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
|
||||||
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
||||||
yield env.timeout(overhead_ns)
|
yield env.timeout(overhead_ns)
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Take messages from the inbox and start one process per message.

    A ``TcmRequest`` (from fetch_store) goes to ``_handle_tcm_request``;
    anything else (fabric Transaction) goes to ``_forward_txn``. Handlers
    are started with ``env.process``, so the loop does not wait for them.
    """
    while True:
        msg: Any = yield self._inbox.get()
        handler = (
            self._handle_tcm_request
            if isinstance(msg, TcmRequest)
            else self._forward_txn
        )
        env.process(handler(env, msg))
|
||||||
|
|
||||||
|
def _handle_tcm_request(self, env: simpy.Environment, req: TcmRequest) -> Generator:
    """Serve one TCM access: acquire the channel, apply the delay, signal done.

    ``req.direction == "write"`` selects the write channel and write
    bandwidth; any other value selects the read channel and read bandwidth.
    While the channel is held, the request waits ``req.nbytes / bw``
    (skipped when either value is not positive). ``req.done`` is then
    succeeded.

    Args:
        env: Simulation environment used for the delay.
        req: The access request; ``direction``, ``nbytes`` and ``done`` are
            read.
    """
    is_write = req.direction == "write"
    channel = self._write_res if is_write else self._read_res
    bandwidth = self._write_bw if is_write else self._read_bw

    assert channel is not None
    with channel.request() as grant:
        yield grant
        if bandwidth > 0 and req.nbytes > 0:
            yield env.timeout(req.nbytes / bandwidth)
    req.done.succeed()
|
||||||
|
|||||||
@@ -0,0 +1,115 @@
|
|||||||
|
"""PE pipeline types for ADR-0021: TileToken, TilePlan, Stage, PipelineContext.
|
||||||
|
|
||||||
|
These types are used by the PE_SCHEDULER and all PE engine components
|
||||||
|
for tile-based pipeline execution with self-routing.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from enum import Enum, auto
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
|
||||||
|
# ── Stage types ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class StageType(Enum):
    """Kinds of stage that can appear in a tile's execution plan.

    Values are assigned by ``auto()`` in declaration order.
    """

    # Data movement in
    DMA_READ = auto()
    FETCH = auto()
    # Compute
    GEMM = auto()
    MATH = auto()
    # Data movement out
    STORE = auto()
    DMA_WRITE = auto()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Stage:
    """A single step in a tile's execution plan."""

    # Kind of work done in this step.
    stage_type: StageType
    # Topology node ID of the component that runs the step
    # (e.g. "sip0.cube0.pe0.pe_dma").
    component: str
    # Parameters for this step; every instance gets its own dict.
    params: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Plan ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TilePlan:
    """Ordered stage sequence for one tile.

    The stages are stored as a tuple, so the sequence itself cannot be
    modified in place; the dataclass is not frozen.
    """

    # Identifier of the tile within its pipeline plan.
    tile_id: int
    # Stages in execution order.
    stages: tuple[Stage, ...]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PipelinePlan:
    """All tile plans produced for one CompositeCmd, plus tile counts for metrics."""

    # One entry per tile, in generation order.
    tiles: list[TilePlan]
    # Tile counts along each dimension (metadata for metrics); all default to 0.
    m_tiles: int = 0
    k_tiles: int = 0
    n_tiles: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── Pipeline Context ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PipelineContext:
    """Completion counter for one pipeline (exactly-once contract).

    The last stage of every tile is expected to call ``complete_tile()``
    exactly once. The call that brings ``completed_tiles`` to
    ``total_tiles`` triggers ``done_event.succeed()``.
    """

    # Identifier of the pipeline.
    id: str
    # Number of tiles that must complete.
    total_tiles: int
    # Number of complete_tile() calls seen so far.
    completed_tiles: int = 0
    # simpy.Event to succeed on completion; None disables signalling.
    done_event: Any = None

    def complete_tile(self) -> None:
        """Count one finished tile and signal ``done_event`` on the final one."""
        self.completed_tiles += 1
        if self.completed_tiles != self.total_tiles:
            return
        if self.done_event is not None:
            self.done_event.succeed()
|
||||||
|
|
||||||
|
|
||||||
|
# ── TileToken ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TileToken:
    """Self-routing tile token handed from one PE component to the next (ADR-0021 D9).

    Single-owner: only one component holds the token at any time.
    ``params`` mirrors ``plan.stages[stage_idx].params`` (the canonical
    source) and is refreshed by ``advance()``.
    """

    tile_id: int
    pipeline_ctx: PipelineContext
    plan: TilePlan
    # Index of the stage currently being executed.
    stage_idx: int
    # Cached params of the current stage; an empty dict until set or advanced.
    params: dict = field(default_factory=dict)
    # op_log recording target (ADR-0020).
    data_op: bool = True

    @property
    def current_stage(self) -> Stage:
        """Stage at ``stage_idx`` in the plan."""
        stages = self.plan.stages
        return stages[self.stage_idx]

    @property
    def has_next_stage(self) -> bool:
        """True when at least one stage follows the current one."""
        return len(self.plan.stages) - self.stage_idx > 1

    def advance(self) -> Stage | None:
        """Move to the following stage.

        ``stage_idx`` is always incremented. When a stage exists at the new
        index, ``params`` is replaced by that stage's params and the stage is
        returned; otherwise ``params`` is left untouched and None is returned.
        """
        self.stage_idx += 1
        stages = self.plan.stages
        if self.stage_idx >= len(stages):
            return None
        upcoming = stages[self.stage_idx]
        self.params = upcoming.params
        return upcoming
|
||||||
@@ -0,0 +1,176 @@
|
|||||||
|
"""Tile plan generators for PE pipeline (ADR-0021).
|
||||||
|
|
||||||
|
Generates TilePlan with stage sequences for GEMM and Math operations.
|
||||||
|
Ported from pe_accel tiling.py with stage-based plan structure.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from math import ceil
|
||||||
|
|
||||||
|
from kernbench.components.builtin.pe_types import (
|
||||||
|
PipelinePlan,
|
||||||
|
Stage,
|
||||||
|
StageType,
|
||||||
|
TilePlan,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_gemm_plan(
    M: int, K: int, N: int,
    tile_m: int, tile_k: int, tile_n: int,
    bytes_per_element: int,
    A_addr: int, B_addr: int, C_addr: int,
    pe_prefix: str,
) -> PipelinePlan:
    """Build the tile plans of a GEMM, iterating M, then N, then K.

    Every tile runs
        DMA_READ(A) -> DMA_READ(B) -> FETCH -> GEMM -> STORE
    and the last K-tile of each (m, n) pair additionally ends with DMA_WRITE.

    Addresses are computed as row-major offsets (row stride K for A, N for
    B and C). Byte counts and GEMM dimensions always use the full tile
    sizes; tiles at the matrix edge are not clipped. Each dimension yields
    at least one tile.

    Args:
        M, K, N: problem dimensions in elements.
        tile_m, tile_k, tile_n: tile dimensions in elements.
        bytes_per_element: element size in bytes.
        A_addr, B_addr, C_addr: base addresses of the three matrices.
        pe_prefix: e.g. "sip0.cube0.pe0" -- used to build component IDs.

    Returns:
        A PipelinePlan with one TilePlan per (m, n, k) and the tile counts.
    """
    m_count = max(1, ceil(M / tile_m))
    k_count = max(1, ceil(K / tile_k))
    n_count = max(1, ceil(N / tile_n))
    elem = bytes_per_element

    dma = f"{pe_prefix}.pe_dma"
    fetch_store = f"{pe_prefix}.pe_fetch_store"
    gemm = f"{pe_prefix}.pe_gemm"

    # Byte sizes depend only on the tile shape, so they are the same for every tile.
    a_bytes = tile_m * tile_k * elem
    b_bytes = tile_k * tile_n * elem
    out_bytes = tile_m * tile_n * elem

    plans: list[TilePlan] = []
    for mi in range(m_count):
        for ni in range(n_count):
            c_tile_addr = C_addr + (mi * tile_m * N + ni * tile_n) * elem
            for ki in range(k_count):
                is_last_k = ki == k_count - 1
                a_tile_addr = A_addr + (mi * tile_m * K + ki * tile_k) * elem
                b_tile_addr = B_addr + (ki * tile_k * N + ni * tile_n) * elem

                sequence: list[Stage] = [
                    # DMA READ: load the A tile from HBM into TCM
                    Stage(
                        stage_type=StageType.DMA_READ,
                        component=dma,
                        params={
                            "src_addr": a_tile_addr, "nbytes": a_bytes,
                            "operand": "A", "tile_m": tile_m, "tile_k": tile_k,
                        },
                    ),
                    # DMA READ: load the B tile from HBM into TCM
                    Stage(
                        stage_type=StageType.DMA_READ,
                        component=dma,
                        params={
                            "src_addr": b_tile_addr, "nbytes": b_bytes,
                            "operand": "B", "tile_k": tile_k, "tile_n": tile_n,
                        },
                    ),
                    # FETCH: TCM -> register file, both operands at once
                    Stage(
                        stage_type=StageType.FETCH,
                        component=fetch_store,
                        params={"direction": "read", "nbytes": a_bytes + b_bytes},
                    ),
                    # GEMM: MAC compute
                    Stage(
                        stage_type=StageType.GEMM,
                        component=gemm,
                        params={
                            "m": tile_m, "k": tile_k, "n": tile_n,
                            "is_last_k": is_last_k,
                        },
                    ),
                    # STORE: register file -> TCM
                    Stage(
                        stage_type=StageType.STORE,
                        component=fetch_store,
                        params={"direction": "write", "nbytes": out_bytes},
                    ),
                ]
                if is_last_k:
                    # DMA WRITE: TCM -> HBM, only once the K reduction is finished
                    sequence.append(Stage(
                        stage_type=StageType.DMA_WRITE,
                        component=dma,
                        params={"dst_addr": c_tile_addr, "nbytes": out_bytes},
                    ))

                # Tile IDs are consecutive, so the next ID equals the number emitted so far.
                plans.append(TilePlan(tile_id=len(plans), stages=tuple(sequence)))

    return PipelinePlan(
        tiles=plans, m_tiles=m_count, k_tiles=k_count, n_tiles=n_count,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def generate_math_plan(
    M: int, N: int,
    tile_m: int, tile_n: int,
    bytes_per_element: int,
    math_op: str,
    src_addr: int, dst_addr: int,
    pe_prefix: str,
) -> PipelinePlan:
    """Build the tile plans of an element-wise math operation.

    Every tile runs DMA_READ -> FETCH -> MATH -> STORE -> DMA_WRITE.
    Source and destination use the same row-major tile offset (row
    stride N). Byte and element counts always use the full tile size;
    edge tiles are not clipped. Each dimension yields at least one tile.

    Args:
        M, N: problem dimensions in elements.
        tile_m, tile_n: tile dimensions in elements.
        bytes_per_element: element size in bytes.
        math_op: operation name stored in the MATH stage params.
        src_addr, dst_addr: base addresses of input and output.
        pe_prefix: e.g. "sip0.cube0.pe0" -- used to build component IDs.

    Returns:
        A PipelinePlan with one TilePlan per (m, n); ``k_tiles`` stays 0.
    """
    m_count = max(1, ceil(M / tile_m))
    n_count = max(1, ceil(N / tile_n))
    elem = bytes_per_element

    dma = f"{pe_prefix}.pe_dma"
    fetch_store = f"{pe_prefix}.pe_fetch_store"
    math_unit = f"{pe_prefix}.pe_math"

    # Identical for every tile because edge tiles are not clipped.
    tile_elems = tile_m * tile_n
    tile_bytes = tile_m * tile_n * elem

    plans: list[TilePlan] = []
    for mi in range(m_count):
        for ni in range(n_count):
            tile_offset = (mi * tile_m * N + ni * tile_n) * elem
            sequence = (
                Stage(
                    stage_type=StageType.DMA_READ,
                    component=dma,
                    params={"src_addr": src_addr + tile_offset, "nbytes": tile_bytes},
                ),
                Stage(
                    stage_type=StageType.FETCH,
                    component=fetch_store,
                    params={"direction": "read", "nbytes": tile_bytes},
                ),
                Stage(
                    stage_type=StageType.MATH,
                    component=math_unit,
                    params={"op": math_op, "num_elements": tile_elems},
                ),
                Stage(
                    stage_type=StageType.STORE,
                    component=fetch_store,
                    params={"direction": "write", "nbytes": tile_bytes},
                ),
                Stage(
                    stage_type=StageType.DMA_WRITE,
                    component=dma,
                    params={"dst_addr": dst_addr + tile_offset, "nbytes": tile_bytes},
                ),
            )
            # Tile IDs are consecutive, so the next ID equals the number emitted so far.
            plans.append(TilePlan(tile_id=len(plans), stages=sequence))

    return PipelinePlan(tiles=plans, m_tiles=m_count, n_tiles=n_count)
|
||||||
@@ -24,6 +24,8 @@ class ComponentContext:
|
|||||||
ns_per_mm: float # wire propagation constant (from topology spec)
|
ns_per_mm: float # wire propagation constant (from topology spec)
|
||||||
edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
|
edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
|
||||||
spec: dict = field(default_factory=dict) # topology spec (cube layout, PE count, etc.)
|
spec: dict = field(default_factory=dict) # topology spec (cube layout, PE count, etc.)
|
||||||
|
memory_store: Any = None # MemoryStore for Phase 1 data-aware execution (ADR-0020)
|
||||||
|
op_logger: Any = None # OpLogger for Phase 1 op recording (ADR-0020)
|
||||||
|
|
||||||
def get_shared_resource(
|
def get_shared_resource(
|
||||||
self, env: simpy.Environment, key: str, capacity: int = 1,
|
self, env: simpy.Environment, key: str, capacity: int = 1,
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
# Legacy component backups — not actively used.
|
||||||
|
# Kept for reference during ADR-0021 migration.
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
"""Concrete component implementations.
|
||||||
|
|
||||||
|
Loaded from components.yaml via ComponentRegistry.load_components_yaml().
|
||||||
|
Manual imports are no longer needed — add new impls to components.yaml.
|
||||||
|
|
||||||
|
Classes are still importable from this package via lazy __getattr__.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentRegistry
|
||||||
|
|
||||||
|
ComponentRegistry.load_components_yaml()
|
||||||
|
|
||||||
|
# Lazy re-export: allow `from kernbench.components.builtin import FooComponent`
|
||||||
|
# without eagerly importing every module.
|
||||||
|
_CLASS_MAP: dict[str, str] = {} # ClassName → "module.path:ClassName"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_class_map() -> None:
    """Fill ``_CLASS_MAP`` from the registry's lazy entries; no-op once it has content.

    Each registry value has the form "module.path:ClassName"; the map is
    keyed by the part after the last colon and stores the full value.
    """
    if _CLASS_MAP:
        return
    for dotted in ComponentRegistry._lazy.values():
        _, cls_name = dotted.rsplit(":", 1)
        _CLASS_MAP[cls_name] = dotted
|
||||||
|
|
||||||
|
|
||||||
|
def __getattr__(name: str):
    """Lazily resolve a component class by name (module-level ``__getattr__``).

    Looks ``name`` up in ``_CLASS_MAP`` (built on first use), imports the
    module that defines the class and returns the class object.

    Raises:
        AttributeError: if ``name`` is not a registered component class.
            This is the exception the attribute protocol expects:
            ``hasattr()``, ``getattr(module, name, default)`` and the import
            system's from-import handling only recognise ``AttributeError``
            as "not found". ``from kernbench.components.builtin import X``
            still surfaces as ``ImportError`` to the caller, because the
            import system converts the missing attribute itself.
    """
    _build_class_map()
    class_path = _CLASS_MAP.get(name)
    if class_path is None:
        raise AttributeError(
            f"module 'kernbench.components.builtin' has no attribute '{name}'"
        )
    import importlib

    module_path, class_name = class_path.rsplit(":", 1)
    return getattr(importlib.import_module(module_path), class_name)
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class TransitComponent(ComponentBase):
    """Pass-through component used for NOC, UCIe and XBAR nodes.

    ``run`` only models a fixed processing delay; forwarding the
    Transaction to the next hop is left to the inherited machinery
    (``_forward_txn``).
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` from ``node.attrs`` (0.0 when absent); ``nbytes`` is not used."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class HbmCtrlComponent(ComponentBase):
    """HBM controller: terminal component that models HBM access latency.

    Dual-channel model: reads and writes go through separate resources, so
    a read and a write can be served concurrently. Both resources are
    created with the node's ``capacity`` attribute (default 1); requests
    beyond that capacity queue within their own channel.

    On completion the controller either succeeds the transaction's ``done``
    event directly or sends a transaction back along the reversed path, so
    that the response latency is modelled through the fabric.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Both channels are created in start(), once the environment exists.
        self._read: simpy.Resource | None = None
        self._write: simpy.Resource | None = None

    def start(self, env: simpy.Environment) -> None:
        """Create the read and write channels, then start the base component."""
        slots = int(self.node.attrs.get("capacity", 1))
        self._read = simpy.Resource(env, capacity=slots)
        self._write = simpy.Resource(env, capacity=slots)
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` from ``node.attrs`` (0.0 when absent); ``nbytes`` is not used."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Pick the write channel for write requests and the read channel otherwise.

        A request counts as a write when it is a ``MemoryWriteMsg`` or a
        ``PeDmaMsg`` whose ``is_write`` is truthy.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg

        assert self._read is not None and self._write is not None
        request = txn.request
        wants_write = isinstance(request, MemoryWriteMsg) or (
            isinstance(request, PeDmaMsg) and request.is_write
        )
        return self._write if wants_write else self._read

    def _worker(self, env: simpy.Environment) -> Generator:
        """Start one handler process per incoming txn so the channels can work in parallel."""
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._handle_txn(env, incoming))

    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Hold the selected channel for the access and its drain time, then respond."""
        channel = self._select_channel(txn)
        with channel.request() as grant:
            yield grant
            yield from self.run(env, txn.nbytes)
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
        yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Complete ``txn`` in the way its path type requires.

        - PeDmaMsg: a zero-byte response carrying the original ``done``
          event travels back along the reversed path.
        - Bypass path (no hop containing "m_cpu"): a MemoryReadMsg sends
          its data (``request.nbytes``) back along the reversed path with
          the original ``done`` event; anything else succeeds ``done``.
        - M_CPU DMA path: a ``ResponseMsg`` is sent back for m_cpu/io_cpu
          aggregation (requires a context).

        Whenever the reversed path has fewer than two nodes, ``done`` is
        succeeded directly instead.
        """
        from kernbench.runtime_api.kernel import MemoryReadMsg, PeDmaMsg

        request = txn.request
        way_back = list(reversed(txn.path))
        can_return = len(way_back) >= 2

        if isinstance(request, PeDmaMsg):
            if not can_return:
                txn.done.succeed()
                return
            reply = Transaction(
                request=request, path=way_back, step=0,
                nbytes=0, done=txn.done, is_response=True,
            )
            yield self.out_ports[way_back[1]].put(reply.advance())
            return

        # Bypass path: no m_cpu anywhere in the transaction path.
        if not any("m_cpu" in hop for hop in txn.path):
            if isinstance(request, MemoryReadMsg) and can_return:
                # D2H: the data travels back on the reverse path to pcie_ep.
                reply = Transaction(
                    request=request, path=way_back, step=0,
                    nbytes=request.nbytes, done=txn.done,
                )
                yield self.out_ports[way_back[1]].put(reply.advance())
                return
            # MemoryWrite bypass or short path: nothing to send back.
            txn.done.succeed()
            return

        # M_CPU DMA path: answer with a ResponseMsg so the sender can aggregate.
        if not (can_return and self.ctx):
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # The second dotted component of the node ID carries the cube number ("cubeN").
        cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
        answer = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request.request_id,
            # src_pe is fixed to 0: there is a single hbm_ctrl, PE info comes from the request.
            src_cube=cube_id, src_pe=0, success=True,
        )
        reply = Transaction(
            request=answer, path=way_back, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[way_back[1]].put(reply.advance())
|
||||||
@@ -0,0 +1,157 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class IoCpuComponent(ComponentBase):
    """IO_CPU component: multi-cube fan-out with response aggregation.

    Forward path:
      1. Applies overhead_ns processing overhead.
      2. Resolves target cube(s) from the request (``target_cubes``,
         physical address or tensor shards, depending on the message type).
      3. Fans out sub-Transactions to each routable target cube's M_CPU.

    Response path:
      Collects one response per dispatched sub-Transaction. When all of
      them have been received, succeeds the parent ``txn.done``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Pending fan-out tracking: request_id -> (expected, received, parent_txn_done)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` from ``node.attrs`` (0.0 when absent); ``nbytes`` is not used."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Route inbox traffic: responses are counted, requests are fanned out.

        The processing overhead of a request is applied inside this loop,
        before the fan-out process is spawned.
        """
        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
            else:
                yield from self.run(env, txn.nbytes)
                env.process(self._dispatch_to_m_cpus(env, txn))

    def _collect_response(self, resp_txn: Any) -> None:
        """Count one cube response; succeed the parent event on the last one.

        Responses whose request_id is not being tracked are ignored.
        """
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, parent_done = self._pending[key]
        received += 1
        if received >= expected:
            parent_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, parent_done)

    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out sub-Transactions to the target cubes' M_CPUs.

        Completion of ``txn.done`` is driven by ``_collect_response`` once
        every dispatched sub-Transaction has been answered. ``txn.done`` is
        succeeded immediately when the targets cannot be resolved, when
        there are none, or when none of them is routable.
        """
        request = txn.request
        try:
            cube_targets = self._resolve_cube_targets(request)
        except Exception:
            # Best effort: a request whose targets cannot be resolved completes at once.
            txn.done.succeed()
            return

        # Resolve every route before registering the aggregation, so that the
        # expected-response count only covers sub-Transactions that are
        # actually sent. Counting skipped targets would leave the parent
        # event waiting forever for responses that can never arrive.
        routes: list[Any] = []
        for sip, cube in cube_targets:
            try:
                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
            except Exception:
                continue
            if len(path) < 2:
                continue
            routes.append(path)

        if not routes:
            txn.done.succeed()
            return

        # Register before the first put: a response may arrive while later
        # sub-Transactions are still being queued.
        self._pending[request.request_id] = (len(routes), 0, txn.done)

        for path in routes:
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                result_data=txn.result_data,
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())

    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
        """Return the list of (sip, cube) pairs to fan out to.

        ``target_cubes`` defaults to "all" when the request has no such
        attribute. With an explicit ``target_cubes`` every listed cube is
        used; "all" is interpreted per message type:

        - MemoryWriteMsg / MemoryReadMsg: the single cube decoded from the
          physical address, falling back to the request's cube field.
        - KernelLaunchMsg: the distinct cubes of tensor-argument shards
          located on this IO_CPU's SIP, in first-seen order.
        - MmuMapMsg / MmuUnmapMsg: every cube of this SIP
          (``system.sips.cubes_per_sip`` from the spec, default 16).

        Any other request type yields an empty list.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, MmuMapMsg, MmuUnmapMsg,
        )

        target_cubes = getattr(request, "target_cubes", "all")

        if isinstance(request, MemoryWriteMsg):
            sip = request.dst_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]

        if isinstance(request, MemoryReadMsg):
            sip = request.src_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]

        if isinstance(request, KernelLaunchMsg):
            my_sip = self._my_sip()
            if target_cubes != "all":
                return [(my_sip, c) for c in target_cubes]
            # "all": derive from tensor shards, filtered to this SIP
            seen: set[tuple[int, int]] = set()
            targets: list[tuple[int, int]] = []
            for arg in request.args:
                if arg.arg_kind != "tensor":
                    continue
                for shard in arg.shards:
                    if shard.sip != my_sip:
                        continue
                    key = (shard.sip, shard.cube)
                    if key not in seen:
                        seen.add(key)
                        targets.append(key)
            return targets

        if isinstance(request, (MmuMapMsg, MmuUnmapMsg)):
            my_sip = self._my_sip()
            if target_cubes == "all":
                n_cubes = 16
                if self.ctx and self.ctx.spec:
                    sips = self.ctx.spec.get("system", {}).get("sips", {})
                    n_cubes = sips.get("cubes_per_sip", 16)
                return [(my_sip, c) for c in range(n_cubes)]
            return [(my_sip, c) for c in target_cubes]

        return []

    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
        """Extract cube_id from a physical address; return ``fallback`` if decoding fails."""
        from kernbench.policy.address.phyaddr import PhysAddr
        try:
            return PhysAddr.decode(pa_val).cube_id
        except Exception:
            return fallback

    def _my_sip(self) -> int:
        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' -> 0)."""
        return int(self.node.id.split(".")[0].replace("sip", ""))
|
||||||
@@ -0,0 +1,327 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class MCpuComponent(ComponentBase):
|
||||||
|
"""M_CPU component: multi-PE DMA fan-out with response aggregation.
|
||||||
|
|
||||||
|
Forward path (ADR-0015 D5):
|
||||||
|
When a forward Transaction arrives at m_cpu (terminal hop), M_CPU fans out
|
||||||
|
DMA sub-Transactions to target PEs' HBM slices. target_pe on the request
|
||||||
|
controls fan-out: int → single PE, "all" → all PEs in the cube.
|
||||||
|
|
||||||
|
Response path:
|
||||||
|
ResponseMsg from each hbm_ctrl arrives back at m_cpu. Once all PE responses
|
||||||
|
are collected, m_cpu sends an aggregate ResponseMsg on the reverse command
|
||||||
|
path back to io_cpu.
|
||||||
|
|
||||||
|
Transit:
|
||||||
|
When m_cpu is NOT the terminal hop (transit or response relay), the
|
||||||
|
Transaction is forwarded normally to the next hop.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
    """Initialise the aggregation state; DMA resources are created in ``start``."""
    super().__init__(node, ctx)
    # DMA engine resources (ADR-0015 D5, ADR-0014 D4), one per direction.
    self._dma_read: simpy.Resource | None = None
    self._dma_write: simpy.Resource | None = None
    # Parent transaction of every fan-out in flight: request_id -> parent_txn
    self._parent_txns: dict[str, Any] = {}
    # Fan-out progress: request_id -> (expected, received, all_done_event)
    self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
|
||||||
|
|
||||||
|
def start(self, env: simpy.Environment) -> None:
    """Create the write and read DMA resources (capacity 1 each), then start the base."""
    for attr_name in ("_dma_write", "_dma_read"):
        setattr(self, attr_name, simpy.Resource(env, capacity=1))
    super().start(env)
|
||||||
|
|
||||||
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
    """Wait ``overhead_ns`` from ``node.attrs`` (0.0 when absent); ``nbytes`` is not used."""
    yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Process the inbox forever: count responses, forward or fan out requests.

    For a non-response txn the processing overhead is applied first. Then:
    - if the txn has a next hop, it is forwarded there;
    - otherwise, with a context and a request present, a fan-out process
      matching the request type is spawned (kernel launch, MMU message,
      or DMA for everything else);
    - otherwise ``txn.done`` is succeeded.
    """
    from kernbench.runtime_api.kernel import KernelLaunchMsg, MmuMapMsg, MmuUnmapMsg

    while True:
        txn: Any = yield self._inbox.get()
        if getattr(txn, "is_response", False):
            self._collect_response(txn)
            continue

        yield from self.run(env, txn.nbytes)

        hop = txn.next_hop
        if hop:
            yield self.out_ports[hop].put(txn.advance())
            continue

        if self.ctx is None or txn.request is None:
            txn.done.succeed()
            continue

        if isinstance(txn.request, KernelLaunchMsg):
            fanout = self._kernel_launch_fanout
        elif isinstance(txn.request, (MmuMapMsg, MmuUnmapMsg)):
            fanout = self._mmu_msg_fanout
        else:
            fanout = self._dma_fanout
        env.process(fanout(env, txn))
|
||||||
|
|
||||||
|
def _collect_response(self, resp_txn: Any) -> None:
    """Count one PE response; succeed the fan-out's event on the last one.

    Responses whose request_id is not being tracked are ignored. The
    tracking entry is removed once the expected count is reached.
    """
    key = resp_txn.request.request_id
    entry = self._pending.get(key)
    if entry is None:
        return

    expected, seen_so_far, all_done = entry
    seen_so_far += 1
    if seen_so_far < expected:
        self._pending[key] = (expected, seen_so_far, all_done)
        return

    all_done.succeed()
    del self._pending[key]
|
||||||
|
|
||||||
|
def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
    """Fan out DMA sub-Transactions to target PE(s), wait for responses,
    then send aggregate response on reverse command path.

    Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014 D4),
    so multi-PE fan-out is serialized through the DMA engine.

    Destinations without a usable DMA path (router lookup fails, or the
    path has fewer than two nodes) are skipped and not counted as expected
    responses. When no destination is left, ``txn.done`` is succeeded
    immediately. After all responses have arrived, the largest drain time
    of the dispatched transfers is stored in ``txn.result_data["xfer_ns"]``.
    """
    from kernbench.runtime_api.kernel import MemoryWriteMsg

    request = txn.request
    target_pe = getattr(request, "target_pe", "all")
    dst_nodes = self._resolve_dma_destinations(request, target_pe)

    # Resolve every path before registering the aggregation, so that the
    # expected-response count matches what is actually dispatched. Counting
    # skipped destinations would block forever on all_done and leave the
    # _parent_txns entry behind.
    dma_paths: list[Any] = []
    for dst_node in dst_nodes:
        try:
            dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
        except Exception:
            continue
        if len(dma_path) < 2:
            continue
        dma_paths.append(dma_path)

    if not dma_paths:
        txn.done.succeed()
        return

    # Setup aggregation (before the first put: responses may arrive while
    # later transfers are still waiting for the DMA resource).
    all_done = env.event()
    self._pending[request.request_id] = (len(dma_paths), 0, all_done)
    self._parent_txns[request.request_id] = txn

    # Select DMA resource based on operation type
    dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read

    # Fan out DMA sub-txns (serialized through DMA resource)
    max_drain_ns = 0.0
    for dma_path in dma_paths:
        drain_ns = self.ctx.compute_drain_ns(dma_path, txn.nbytes)
        max_drain_ns = max(max_drain_ns, drain_ns)
        sub_txn = Transaction(
            request=request, path=dma_path, step=0,
            nbytes=txn.nbytes, done=env.event(),
            drain_ns=drain_ns,
        )
        with dma_res.request() as req:
            yield req
            yield self.out_ports[dma_path[1]].put(sub_txn.advance())

    # Wait for all PE responses
    yield all_done
    txn.result_data["xfer_ns"] = max_drain_ns
    del self._parent_txns[request.request_id]

    # Send aggregate response on reverse command path
    reverse_path = list(reversed(txn.path))
    if len(reverse_path) >= 2:
        from kernbench.runtime_api.kernel import ResponseMsg

        # The second dotted component of the node ID carries the cube number ("cubeN").
        parts = self.node.id.split(".")
        cube_id = int(parts[1].replace("cube", ""))
        resp_msg = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request.request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        resp_txn = Transaction(
            request=resp_msg, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
    else:
        txn.done.succeed()
|
||||||
|
|
||||||
|
def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
    """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).

    Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
    PE_CPU sends ResponseMsg back via NOC → M_CPU on completion.
    Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.

    All routes are resolved *before* anything is sent, so the aggregation
    entry in ``self._pending`` can be registered with its final count ahead
    of the first dispatch. Every ``yield`` on an out-port hands control to
    other simulation processes; registering afterwards would leave a window
    in which an early response finds no entry for its ``request_id``.

    Args:
        env: SimPy environment used to create the completion events.
        txn: Parent transaction. ``txn.request`` is the launch message
            (``target_pe`` defaults to ``"all"`` when absent), ``txn.path``
            is the command path that is reversed for the aggregate response.

    Yields:
        SimPy events: one out-port ``put`` per dispatched PE_CPU, the
        aggregate completion event, and the final response ``put``.
    """
    request = txn.request
    target_pe = getattr(request, "target_pe", "all")
    cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"

    # Phase 1: resolve a usable route for every requested PE_CPU.
    routes: list[Any] = []
    for pe_id in self._resolve_pe_ids(target_pe):
        pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
        try:
            path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
        except Exception:
            # Best effort (unchanged): a PE without a route is skipped.
            continue
        if len(path) >= 2:
            routes.append(path)

    # Nothing to dispatch (no target PEs, or none routable): complete now.
    if not routes:
        txn.done.succeed()
        return

    # Phase 2: register response aggregation before the first dispatch
    # (PE_CPU ResponseMsg arrives via _collect_response).
    all_done = env.event()
    self._pending[request.request_id] = (len(routes), 0, all_done)
    self._parent_txns[request.request_id] = txn

    # Phase 3: dispatch one sub-transaction per routable PE_CPU.
    sub_txns: list[Transaction] = []
    for path in routes:
        sub_txn = Transaction(
            request=request, path=path, step=0,
            nbytes=0, done=env.event(),
        )
        sub_txns.append(sub_txn)
        yield self.out_ports[path[1]].put(sub_txn.advance())

    # Wait for all PE_CPU responses via NOC
    yield all_done
    del self._parent_txns[request.request_id]

    # Aggregate PE-internal metrics (max across PEs); a sub-transaction
    # that did not report a metric contributes 0.0.
    for key in ("pe_exec_ns", "dma_ns", "compute_ns"):
        txn.result_data[key] = max(st.result_data.get(key, 0.0) for st in sub_txns)

    # Send aggregate response on reverse command path back to IO_CPU
    reverse_path = list(reversed(txn.path))
    if len(reverse_path) < 2:
        # No upstream hop to answer: finish the parent transaction here.
        txn.done.succeed()
        return

    from kernbench.runtime_api.kernel import ResponseMsg

    # The second dotted segment of the node id is "cube<N>".
    cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
    resp_msg = ResponseMsg(
        correlation_id=request.correlation_id,
        request_id=request.request_id,
        src_cube=cube_id, src_pe=-1, success=True,
    )
    resp_txn = Transaction(
        request=resp_msg, path=reverse_path, step=0,
        nbytes=0, done=env.event(), is_response=True,
    )
    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
|
||||||
|
|
||||||
|
def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
    """Return the HBM destination node ids (always a one-element list).

    The request's ``dst_pa`` is preferred, falling back to ``src_pa``; when
    an address is present it is decoded and handed to the context resolver.
    If neither attribute yields an address, or decoding/resolving raises,
    the local cube's ``hbm_ctrl`` node is returned instead.

    ``target_pe`` is accepted but not consulted by this method.
    """
    cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
    local_hbm = f"{cube_prefix}.hbm_ctrl"

    # NOTE(review): a falsy dst_pa (e.g. 0) falls through to src_pa here;
    # confirm that address 0 is really meant to count as "unset".
    pa_val = getattr(request, "dst_pa", None)
    if not pa_val:
        pa_val = getattr(request, "src_pa", None)

    if pa_val is None:
        return [local_hbm]

    from kernbench.policy.address.phyaddr import PhysAddr

    try:
        return [self.ctx.resolver.resolve(PhysAddr.decode(pa_val))]
    except Exception:
        # Any decode/resolve failure falls back to the local controller.
        return [local_hbm]
|
||||||
|
|
||||||
|
def _mmu_msg_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
    """Deliver an MMU map/unmap request to each targeted PE_MMU, then reply.

    A sub-transaction carrying the same request is sent along
    ``find_node_path`` to ``<cube>.pe<N>.pe_mmu`` for every resolved PE id;
    PEs whose path lookup raises or yields fewer than two hops are skipped.
    Each sub-transaction owns a completion event and this method waits for
    all of them before answering on the reversed command path.

    If no PE ids are resolved the parent transaction is completed at once
    and no response is sent.
    """
    request = txn.request
    cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
    pe_ids = self._resolve_pe_ids(getattr(request, "target_pe", "all"))

    if not pe_ids:
        txn.done.succeed()
        return

    # Dispatch phase: one sub-transaction per reachable PE_MMU.
    completions: list[simpy.Event] = []
    for pe_id in pe_ids:
        mmu_node = f"{cube_prefix}.pe{pe_id}.pe_mmu"
        try:
            route = self.ctx.router.find_node_path(self.node.id, mmu_node)
        except Exception:
            continue
        if len(route) < 2:
            continue
        finished = env.event()
        child = Transaction(
            request=request, path=route, step=0,
            nbytes=0, done=finished,
        )
        yield self.out_ports[route[1]].put(child.advance())
        completions.append(finished)

    # Collection phase: block until every dispatched PE_MMU has finished.
    for finished in completions:
        yield finished

    # Reply phase: aggregate response on the reverse path, if there is one.
    way_back = list(reversed(txn.path))
    if len(way_back) < 2:
        txn.done.succeed()
        return

    from kernbench.runtime_api.kernel import ResponseMsg

    id_segments = self.node.id.split(".")
    cube_id = int(id_segments[1].replace("cube", ""))
    reply = ResponseMsg(
        correlation_id=request.correlation_id,
        request_id=request.request_id,
        src_cube=cube_id, src_pe=-1, success=True,
    )
    reply_txn = Transaction(
        request=reply, path=way_back, step=0,
        nbytes=0, done=env.event(), is_response=True,
    )
    yield self.out_ports[way_back[1]].put(reply_txn.advance())
|
||||||
|
|
||||||
|
def _resolve_pe_ids(self, target_pe: int | tuple | str) -> list[int]:
    """Translate a ``target_pe`` selector into a concrete list of PE ids.

    * ``int``   -> a single-element list with that id.
    * ``tuple`` -> its elements, in order.
    * anything else (e.g. ``"all"``) -> ``range(n)`` where ``n`` is
      ``spec["cube"]["memory_map"]["hbm_slices_per_cube"]`` when the
      component has a context with a non-empty spec, otherwise 8.
    """
    if isinstance(target_pe, int):
        return [target_pe]
    if isinstance(target_pe, tuple):
        return [*target_pe]

    # "all": every PE in the local cube.
    ctx = self.ctx
    if not (ctx and ctx.spec):
        return list(range(8))
    memory_map = ctx.spec.get("cube", {}).get("memory_map", {})
    return list(range(memory_map.get("hbm_slices_per_cube", 8)))
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PcieEpComponent(ComponentBase):
    """PCIe endpoint component that models protocol-handling latency.

    ``run`` waits for the node's ``overhead_ns`` attribute (0.0 when the
    attribute is absent). Everything else, including how transactions are
    forwarded, is inherited from ``ComponentBase``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Hold for the configured PCIe overhead; ``nbytes`` is not used."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
|
||||||
@@ -0,0 +1,214 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeCpuComponent(ComponentBase):
    """PE_CPU: kernel execution controller (Stage 2).

    Incoming transactions whose request is a ``KernelLaunchMsg`` are executed
    locally; every other transaction goes through the inherited
    ``_forward_txn``. Kernel execution has two modes:

    * greenlet mode (ADR-0020), used when the context exposes a
      ``memory_store``: the kernel is driven by ``KernelRunner``;
    * legacy mode (ADR-0014 D1): the kernel is first run against a
      ``TLContext`` to produce a command list, which is then replayed
      through PE_SCHEDULER via ``PeInternalTxn``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

        def index_of(extract) -> int:
            # Any malformed or missing id segment degrades to index 0.
            try:
                return int(extract())
            except (IndexError, ValueError):
                return 0

        segments = node.id.split(".")
        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"
        self._pe_idx = index_of(lambda: self._pe_prefix.rsplit("pe", 1)[1])
        # sip/cube indices are used for multi-SIP/cube shard matching.
        self._sip_idx = index_of(lambda: segments[0].replace("sip", ""))
        self._cube_idx = index_of(lambda: segments[1].replace("cube", ""))

    def _find_shard(self, shards: tuple) -> Any:
        """Return the shard for this PE's (sip, cube, pe), else a positional one.

        The positional fallback indexes ``shards`` by the PE index, clamped
        to the last element.
        """
        exact = next(
            (
                shard for shard in shards
                if shard.sip == self._sip_idx
                and shard.cube == self._cube_idx
                and shard.pe == self._pe_idx
            ),
            None,
        )
        if exact is not None:
            return exact
        return shards[min(self._pe_idx, len(shards) - 1)]

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the node's ``overhead_ns`` attribute (default 0.0)."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox loop: execute kernel launches, forward everything else."""
        from kernbench.runtime_api.kernel import KernelLaunchMsg

        while True:
            txn: Any = yield self._inbox.get()
            is_launch = hasattr(txn, "request") and isinstance(txn.request, KernelLaunchMsg)
            if is_launch:
                yield from self._execute_kernel(env, txn)
            else:
                yield from self._forward_txn(env, txn)

    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
        """Run one kernel launch and answer with a ResponseMsg.

        Records into ``txn.result_data``: ``pe_exec_ns`` (simulated time
        spent after the PE_CPU overhead), plus ``dma_ns`` and ``compute_ns``
        summed over the per-composite result dicts returned by the chosen
        execution mode.
        """
        from kernbench.triton_emu.registry import get_kernel

        request = txn.request
        yield from self.run(env, 0)

        kernel_fn = get_kernel(request.kernel_ref.name)
        num_programs = self._derive_num_programs(request)
        kernel_args = self._unpack_kernel_args(request)

        started_at = env.now
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"

        # A memory store on the context selects greenlet mode (ADR-0020);
        # otherwise fall back to the legacy command-list replay.
        memory_store = getattr(self.ctx, "memory_store", None) if self.ctx else None
        if memory_store is None:
            per_composite = yield from self._execute_legacy(
                env, kernel_fn, kernel_args, num_programs, scheduler_id,
            )
        else:
            per_composite = yield from self._execute_greenlet(
                env, kernel_fn, kernel_args, num_programs, scheduler_id, memory_store,
            )

        txn.result_data["pe_exec_ns"] = env.now - started_at
        dma_sum = 0.0
        compute_sum = 0.0
        for entry in per_composite:
            dma_sum += entry.get("dma_ns", 0.0)
            compute_sum += entry.get("compute_ns", 0.0)
        txn.result_data["dma_ns"] = dma_sum
        txn.result_data["compute_ns"] = compute_sum

        yield from self._send_response(env, txn, request)

    def _derive_num_programs(self, request: Any) -> int:
        """Largest per-tensor count of shards on this PE's (sip, cube); at least 1."""
        local_counts = [
            sum(
                1 for shard in arg.shards
                if shard.sip == self._sip_idx and shard.cube == self._cube_idx
            )
            for arg in request.args
            if arg.arg_kind == "tensor"
        ]
        return max([1, *local_counts])

    def _unpack_kernel_args(self, request: Any) -> list:
        """Flatten launch arguments into positional kernel arguments.

        Tensors contribute their ``va_base`` when it is truthy, otherwise the
        ``pa`` of the shard selected by ``_find_shard``; scalars contribute
        their value. Arguments of any other kind are dropped.
        """
        unpacked: list = []
        for arg in request.args:
            kind = arg.arg_kind
            if kind == "scalar":
                unpacked.append(arg.value)
            elif kind == "tensor":
                unpacked.append(arg.va_base or self._find_shard(arg.shards).pa)
        return unpacked

    def _execute_greenlet(
        self, env, kernel_fn, kernel_args, num_programs, scheduler_id, store,
    ) -> Generator:
        """Greenlet mode (ADR-0020 D3): delegate execution to ``KernelRunner``.

        Returns the runner's ``_composite_results`` attribute, or an empty
        list when the runner does not define it.
        """
        from kernbench.triton_emu.kernel_runner import KernelRunner

        runner = KernelRunner(
            pe_prefix=self._pe_prefix,
            pe_idx=self._pe_idx,
            sip_idx=self._sip_idx,
            cube_idx=self._cube_idx,
            scheduler_id=scheduler_id,
            out_ports=self.out_ports,
            store=store,
        )
        yield from runner.run(env, kernel_fn, kernel_args, num_programs)
        return getattr(runner, "_composite_results", [])

    def _execute_legacy(
        self, env, kernel_fn, kernel_args, num_programs, scheduler_id,
    ) -> Generator:
        """Legacy mode: record the kernel's command list, then replay it.

        Replay rules per command type:
        * ``PeCpuOverheadCmd`` - wait ``cmd.cycles`` on the simulation clock.
        * ``WaitCmd`` - wait for the named in-flight composite, or for all of
          them when no handle is given.
        * ``CompositeCmd`` - dispatch without blocking; completion is tracked
          under ``cmd.completion.id``.
        * anything else - dispatch and block until it completes.

        Returns the ``result_data`` dicts of the dispatched composites.
        """
        from kernbench.common.pe_commands import (
            CompositeCmd, PeCpuOverheadCmd, PeInternalTxn, WaitCmd,
        )
        from kernbench.triton_emu.tl_context import TLContext, run_kernel

        tl = TLContext(pe_id=self._pe_idx, num_programs=num_programs, dispatch_cycles=0)
        run_kernel(kernel_fn, tl, *kernel_args)

        in_flight: dict[str, simpy.Event] = {}
        composite_results: list[dict] = []

        for cmd in tl.commands:
            if isinstance(cmd, PeCpuOverheadCmd):
                yield env.timeout(cmd.cycles)
                continue

            if isinstance(cmd, WaitCmd):
                if cmd.handle is None:
                    # Barrier: drain every outstanding composite.
                    for evt in in_flight.values():
                        yield evt
                    in_flight.clear()
                else:
                    evt = in_flight.pop(cmd.handle.id, None)
                    if evt:
                        yield evt
                continue

            done_evt = env.event()
            pe_txn = PeInternalTxn(
                command=cmd, done=done_evt, pe_prefix=self._pe_prefix,
            )
            if isinstance(cmd, CompositeCmd):
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                in_flight[cmd.completion.id] = done_evt
            else:
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield done_evt

        # Composites never waited on explicitly still have to finish.
        for evt in in_flight.values():
            yield evt
        return composite_results

    def _send_response(self, env, txn, request) -> Generator:
        """Send a successful ResponseMsg back along the reversed ``txn.path``.

        When the reversed path has fewer than two hops there is nowhere to
        send a response, so ``txn.done`` is triggered directly instead.
        """
        way_back = list(reversed(txn.path))
        if len(way_back) < 2:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        reply = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request.request_id,
            src_cube=self._cube_idx, src_pe=self._pe_idx,
            success=True,
        )
        reply_txn = Transaction(
            request=reply, path=way_back, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[way_back[1]].put(reply_txn.advance())
|
||||||
@@ -0,0 +1,138 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeDmaComponent(PeEngineBase):
    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.

    Each channel has capacity=1 (ADR-0014 D4):
      - DMA_READ and DMA_WRITE may execute concurrently.
      - Multiple READs cannot overlap; multiple WRITEs cannot overlap.

    Handles two message types:
      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Channels are created in init_resources(), once an env exists.
        self._dma_read: simpy.Resource | None = None
        self._dma_write: simpy.Resource | None = None
        self._mmu = None  # PeMMU instance, set by engine wiring

    def init_resources(self, env: simpy.Environment) -> None:
        """Create the two single-slot DMA channels."""
        self._dma_read = simpy.Resource(env, capacity=1)
        self._dma_write = simpy.Resource(env, capacity=1)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay step; this component adds no fixed overhead of its own."""
        yield env.timeout(0)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Handle PE-internal DMA command: resolve PA → HBM path → transfer.

        Steps: pick the channel from the command type, translate the address
        through the MMU when one is wired, resolve the destination node and
        path, hold the channel while the sub-transaction is issued, then wait
        for the transfer to finish and complete ``pe_txn``.

        Commands that are neither ``DmaReadCmd`` nor ``DmaWriteCmd`` complete
        immediately. If the resolved path has no next hop, the transfer is
        completed locally after ``drain_ns`` rather than waiting on an event
        that nothing would ever trigger.
        """
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg

        cmd = pe_txn.command
        assert self._dma_read is not None and self._dma_write is not None

        # Determine direction and target address (VA → PA via MMU)
        if isinstance(cmd, DmaReadCmd):
            dma_res = self._dma_read
            raw_addr = cmd.src_addr
            is_write = False
        elif isinstance(cmd, DmaWriteCmd):
            dma_res = self._dma_write
            raw_addr = cmd.dst_addr
            is_write = True
        else:
            pe_txn.done.succeed()
            return

        # Translate VA → PA via MMU (if available), then resolve HBM node.
        # If MMU has no mapping for this address (PageFault), treat as PA
        # directly (backward-compatible with PA-only mode).
        target_pa = raw_addr
        if self._mmu is not None:
            from kernbench.policy.address.pe_mmu import PageFault

            try:
                target_pa = self._mmu.translate(raw_addr)
            except PageFault:
                target_pa = raw_addr
            else:
                # Translation overhead is only charged on a successful lookup.
                if self._mmu.overhead_ns > 0:
                    yield env.timeout(self._mmu.overhead_ns)

        pa = PhysAddr.decode(target_pa)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)

        # Acquire DMA channel (command issue serialization)
        with dma_res.request() as req:
            yield req
            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
            sub_done = env.event()
            # NOTE(review): the src_* ids are fixed at 0 rather than derived
            # from this PE; confirm that nothing downstream relies on them.
            sub_request = PeDmaMsg(
                correlation_id="pe_internal",
                request_id=f"dma_{id(pe_txn)}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_pa, nbytes=cmd.nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )
            # Send to next hop (path[0] is pe_dma itself, path[1] is router)
            if len(path) > 1:
                yield self.out_ports[path[1]].put(sub_txn.advance())
            else:
                # No next hop: nobody downstream will trigger sub_done, so
                # finish here, mirroring the terminal case in _forward_txn.
                if drain_ns > 0:
                    yield env.timeout(drain_ns)
                sub_done.succeed()
        # DMA channel released after issue

        # Wait for HBM transfer completion
        yield sub_done
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.

        Responses are passed on (or completed, at the end of their path)
        without taking a channel. Other transactions hold the read or write
        channel while they are forwarded; at the end of the path the
        transaction's ``drain_ns`` is waited out before it is completed.
        """
        # Response transactions bypass DMA channel (no outbound resource needed)
        if getattr(txn, "is_response", False):
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                txn.done.succeed()
            return

        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                drain = getattr(txn, "drain_ns", 0.0)
                if drain > 0:
                    yield env.timeout(drain)
                txn.done.succeed()

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Select DMA channel based on request type.

        ``MemoryWriteMsg`` requests use the write channel; every other
        request type uses the read channel.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg

        assert self._dma_read is not None and self._dma_write is not None
        if isinstance(txn.request, MemoryWriteMsg):
            return self._dma_write
        return self._dma_read
|
||||||
@@ -0,0 +1,90 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
# dtype name → element width in bits (used for TFLOPS scaling).
# Groups are listed in the order the aliases should appear in the mapping.
_DTYPE_BITS: dict[str, int] = {
    alias: width
    for width, aliases in (
        (16, ("f16", "fp16", "float16", "bf16")),
        (32, ("f32", "fp32", "float32")),
        (8, ("i8", "int8")),
        (16, ("i16", "int16")),
        (32, ("i32", "int32")),
    )
    for alias in aliases
}
|
||||||
|
|
||||||
|
|
||||||
|
class PeGemmComponent(PeEngineBase):
    """PE_GEMM: matrix-multiply engine on the shared accel slot (ADR-0014 D4).

    When the node names a ``shared_resource`` and a context is available,
    the engine takes that shared resource for the duration of every command
    and every forwarded transaction.

    Latency model for a ``GemmCmd``:
        FLOPs            = 2 * M * K * N
        effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
        compute_ns       = FLOPs / (effective_tflops * 1e3)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accel resource, if the node declares one."""
        shared_name = self.node.attrs.get("shared_resource")
        if not (shared_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{shared_name}"
        )

    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
        """Return the modelled GEMM latency in nanoseconds.

        Without a positive ``peak_tflops_f16`` the node's ``overhead_ns``
        attribute is returned instead. Unknown dtypes are treated as 16-bit.
        """
        peak = self._peak_tflops_f16
        if peak <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        bits = _DTYPE_BITS.get(dtype, 16)
        scaled_tflops = peak * (16.0 / bits)
        total_flops = 2.0 * m * k * n
        # 1 TFLOP/s is 1e3 FLOP per nanosecond.
        return total_flops / (scaled_tflops * 1e3)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the node's ``overhead_ns`` attribute (default 0.0)."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute one PE-internal command, then trigger ``pe_txn.done``.

        A ``GemmCmd`` takes ``_compute_ns`` of simulated time; any other
        command takes the fixed ``run`` overhead. The shared accel resource,
        when present, is held for the whole duration.
        """
        from kernbench.common.pe_commands import GemmCmd

        cmd = pe_txn.command

        def busy() -> Generator:
            if isinstance(cmd, GemmCmd):
                yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
            else:
                yield from self.run(env, 0)

        if not self._accel:
            yield from busy()
        else:
            with self._accel.request() as slot:
                yield slot
                yield from busy()
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward via the base class, holding the accel resource if present."""
        forward = super()._forward_txn
        if not self._accel:
            yield from forward(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from forward(env, txn)
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeMathComponent(PeEngineBase):
    """PE_MATH: element-wise engine on the shared accel slot (ADR-0014 D4).

    When the node names a ``shared_resource`` and a context is available,
    the engine takes that shared resource while it handles a command or
    forwards a transaction. Every command costs the node's ``overhead_ns``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accel resource, if the node declares one."""
        shared_name = self.node.attrs.get("shared_resource")
        if not (shared_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{shared_name}"
        )

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the node's ``overhead_ns`` attribute (default 0.0)."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Spend the fixed overhead (under the accel slot, if any), then finish."""
        if not self._accel:
            yield from self.run(env, 0)
        else:
            with self._accel.request() as slot:
                yield slot
                yield from self.run(env, 0)
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward via the base class, holding the accel resource if present."""
        forward = super()._forward_txn
        if not self._accel:
            yield from forward(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from forward(env, txn)
|
||||||
@@ -0,0 +1,66 @@
|
|||||||
|
"""PE_MMU component: address translation unit.
|
||||||
|
|
||||||
|
Component role: receives MmuMapMsg/MmuUnmapMsg via inbox (independent of PE_CPU).
|
||||||
|
Utility role: PE_DMA/PE_GEMM call mmu.translate() directly (no SimPy overhead).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase, ComponentRegistry
|
||||||
|
from kernbench.policy.address.pe_mmu import PeMMU
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeMmuComponent(ComponentBase):
    """PE_MMU: per-PE virtual-to-physical address translation.

    Wraps a ``PeMMU`` configured from the node attributes ``page_size``
    (default 2 MiB) and ``tlb_overhead_ns`` (default 0.0). Map/unmap
    requests arriving in the inbox update that object; other components
    reach it through the ``mmu`` property for direct ``translate()`` calls.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._mmu = PeMMU(
            page_size=int(node.attrs.get("page_size", 2 << 20)),  # 2 MiB
            overhead_ns=float(node.attrs.get("tlb_overhead_ns", 0.0)),
        )

    @property
    def mmu(self) -> PeMMU:
        """The wrapped ``PeMMU`` object."""
        return self._mmu

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay step; this component adds no fixed overhead of its own."""
        yield env.timeout(0)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox loop: apply map/unmap requests, forward everything else.

        Map and unmap requests are terminal here: after their entries have
        been applied, the transaction's ``done`` event is triggered directly.
        """
        from kernbench.runtime_api.kernel import MmuMapMsg, MmuUnmapMsg

        while True:
            txn: Any = yield self._inbox.get()
            request = getattr(txn, "request", None)

            if isinstance(request, MmuMapMsg):
                for entry in request.entries:
                    self._mmu.map(
                        va=entry["va"], pa=entry["pa"], size=entry["size"],
                    )
                txn.done.succeed()
                continue

            if isinstance(request, MmuUnmapMsg):
                for entry in request.entries:
                    self._mmu.unmap(va=entry["va"], size=entry["size"])
                txn.done.succeed()
                continue

            # Not an MMU message (or no request at all): forward normally.
            yield from self._forward_txn(env, txn)
|
||||||
@@ -0,0 +1,245 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeSchedulerComponent(ComponentBase):
    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).

    Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
    - DmaReadCmd / DmaWriteCmd → PE_DMA
    - GemmCmd → PE_GEMM
    - MathCmd → PE_MATH
    - CompositeCmd → tiled pipeline (Stage 3: ADR-0014 D3.2)

    Composite GEMM pipeline (32x64x32 tiles):
      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
      with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)

    Applies scheduler overhead_ns before dispatching each command.
    Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
    """

    # Scheduler tile dimensions (ADR-0014 D3.2)
    # Only TILE_K and TILE_N are read by the methods of this class; TILE_M
    # is declared but not referenced here.
    TILE_M = 32
    TILE_K = 64
    TILE_N = 32

    # Command → engine suffix dispatch table.
    # New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
    # Starts empty and is filled on first instantiation by
    # _ensure_dispatch_table() so the command classes can be imported lazily.
    _CMD_DISPATCH: dict[type, str] = {}

    @classmethod
    def _ensure_dispatch_table(cls) -> None:
        """Populate ``_CMD_DISPATCH`` once; later calls return immediately."""
        if cls._CMD_DISPATCH:
            return
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd

        cls._CMD_DISPATCH = {
            DmaReadCmd: "pe_dma",
            DmaWriteCmd: "pe_dma",
            GemmCmd: "pe_gemm",
            MathCmd: "pe_math",
        }

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Record the PE prefix and make sure the dispatch table exists."""
        super().__init__(node, ctx)
        # Everything before the last "." of the node id; engine port names
        # are built as f"{prefix}.{engine_suffix}".
        self._pe_prefix = node.id.rsplit(".", 1)[0]
        self._ensure_dispatch_table()

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Charge the node's ``overhead_ns`` (default 0.0); ``nbytes`` is unused."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Pull messages from the inbox forever.

        PeInternalTxn messages are dispatched in their own simpy process so
        the worker can immediately fetch the next message; everything else
        is forwarded inline via ``_forward_txn``.
        """
        from kernbench.common.pe_commands import PeInternalTxn

        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, PeInternalTxn):
                env.process(self._dispatch(env, msg))
            else:
                yield from self._forward_txn(env, msg)

    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Route a PeInternalTxn to the correct engine via dispatch table.

        Order of checks: exact command type in ``_CMD_DISPATCH`` (forwarded
        as-is to that engine's port), then ``CompositeCmd`` (handled by the
        tiled pipeline), otherwise the transaction is completed immediately.
        """
        from kernbench.common.pe_commands import CompositeCmd

        # Scheduler overhead
        yield from self.run(env, 0)

        cmd = pe_txn.command

        # Check dispatch table first
        # Lookup is by exact type, so subclasses of a listed command do not
        # match here.
        engine_suffix = self._CMD_DISPATCH.get(type(cmd))
        if engine_suffix is not None:
            yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
            return

        # CompositeCmd: tiled pipeline (not a simple forward)
        if isinstance(cmd, CompositeCmd):
            yield from self._dispatch_composite(env, pe_txn)
            return

        # Unknown command — signal done immediately
        pe_txn.done.succeed()

    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Composite tiled pipeline (ADR-0014 D3.2).

        GEMM: 3-stage pipeline with b-tile streaming from HBM.
        MATH: sequential compute + DMA_WRITE (no tiling).

        The GEMM path is taken only when ``cmd.op == "gemm"`` and ``cmd.b``
        is set; every other composite goes through ``_pipeline_math``.
        """
        from kernbench.common.pe_commands import CompositeCmd

        cmd = pe_txn.command
        assert isinstance(cmd, CompositeCmd)
        if cmd.op == "gemm" and cmd.b is not None:
            yield from self._pipeline_gemm(env, pe_txn, cmd)
        else:
            yield from self._pipeline_math(env, pe_txn, cmd)

    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.

        Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
        Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)
        Overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)

        On completion stores the accumulated ``dma_ns`` and ``compute_ns`` in
        ``pe_txn.result_data`` and triggers ``pe_txn.done``.
        """
        from kernbench.common.pe_commands import (
            DmaReadCmd,
            DmaWriteCmd,
            GemmCmd,
            PeInternalTxn as PeTxn,
            TensorHandle,
        )

        pp = self._pe_prefix
        a = cmd.a  # already in TCM
        b = cmd.b  # HBM reference (via tl.ref)

        M, K_a = a.shape[-2], a.shape[-1]
        K_b, N = b.shape[-2], b.shape[-1]
        dtype = a.dtype
        # Element size is derived from b's byte count; falls back to 2 when
        # b has no elements.
        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2

        # Tile counts
        # The K extent comes from a (K_a); K_b is only used for dtype_bytes.
        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
        n_tiles = n_tiles_k * n_tiles_n

        prev_compute_done = None
        prev_write_done = None
        total_dma_ns = 0.0
        total_compute_ns = 0.0

        for tile_idx in range(n_tiles):
            # Tiles are visited with the N index varying fastest.
            tk = tile_idx // n_tiles_n
            tn = tile_idx % n_tiles_n

            k_start = tk * self.TILE_K
            n_start = tn * self.TILE_N
            # Edge tiles are clipped to the remaining extent.
            tile_k = min(self.TILE_K, K_a - k_start)
            tile_n = min(self.TILE_N, N - n_start)
            tile_nbytes = tile_k * tile_n * dtype_bytes

            # --- Stage 1: DMA_READ b_tile from HBM ---
            read_done = env.event()
            b_tile_addr = b.addr + (k_start * N + n_start) * dtype_bytes
            b_tile_handle = TensorHandle(
                id=f"b_tile_{tile_idx}", addr=b_tile_addr,
                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
            )
            read_cmd = DmaReadCmd(handle=b_tile_handle, src_addr=b_tile_addr, nbytes=tile_nbytes)
            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)

            # Wait for previous compute before starting this tile's compute
            # NOTE(review): prev_compute_done is the event already awaited at
            # the end of the previous iteration, so this wait looks like it
            # returns immediately — confirm the intended READ/COMPUTE overlap.
            if prev_compute_done is not None:
                yield prev_compute_done

            # Wait for this tile's DMA_READ
            yield read_done
            total_dma_ns += env.now - t0

            # --- Stage 2: COMPUTE (GEMM) ---
            compute_done = env.event()
            out_handle = TensorHandle(
                id=f"out_tile_{tile_idx}", addr=0,
                shape=(M, tile_n), dtype=dtype,
                nbytes=M * tile_n * dtype_bytes,
            )
            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
                                  m=M, k=tile_k, n=tile_n)
            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)

            # Wait for previous write (DMA_WRITE serialization)
            if prev_write_done is not None:
                yield prev_write_done

            # Wait for compute of THIS tile
            yield compute_done
            total_compute_ns += env.now - t0
            prev_compute_done = compute_done

            # --- Stage 3: DMA_WRITE out_tile to HBM ---
            write_done = env.event()
            # Destination depends only on the N offset, so tiles that share
            # an N index but differ in K target the same address.
            out_tile_pa = cmd.out_addr + n_start * dtype_bytes
            write_nbytes = M * tile_n * dtype_bytes
            write_cmd = DmaWriteCmd(handle=out_handle, dst_addr=out_tile_pa, nbytes=write_nbytes)
            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
            # NOTE(review): this t0 is reassigned (next iteration or after the
            # loop) before it is read, so per-tile write time is not added to
            # total_dma_ns here — confirm that is intended.
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
            # The write is left in flight; it is awaited during the next
            # tile's Stage 2 or after the loop.
            prev_write_done = write_done

        # Wait for final write
        if prev_write_done is not None:
            t0 = env.now
            yield prev_write_done
            total_dma_ns += env.now - t0

        pe_txn.result_data["dma_ns"] = total_dma_ns
        pe_txn.result_data["compute_ns"] = total_compute_ns
        pe_txn.done.succeed()

    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Non-GEMM composite: sequential compute + DMA_WRITE (no tiling).

        Issues one MathCmd on ``cmd.a`` (in place: ``out`` is ``cmd.a``),
        waits for it, then issues one DmaWriteCmd of ``cmd.out_nbytes`` bytes
        to ``cmd.out_addr``, waits for that, and completes ``pe_txn``.
        """
        from kernbench.common.pe_commands import (
            DmaWriteCmd,
            MathCmd,
            PeInternalTxn as PeTxn,
        )

        pp = self._pe_prefix

        # Step 1: Compute (MATH)
        compute_done = env.event()
        compute_cmd = MathCmd(
            # A falsy math_op falls back to the "identity" op.
            op=cmd.math_op or "identity",
            inputs=(cmd.a,), out=cmd.a,
        )
        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
        yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
        yield compute_done

        # Step 2: DMA_WRITE result to HBM
        write_done = env.event()
        write_cmd = DmaWriteCmd(handle=cmd.a, dst_addr=cmd.out_addr, nbytes=cmd.out_nbytes)
        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
        yield write_done

        pe_txn.done.succeed()
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeTcmComponent(ComponentBase):
    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.

    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
    ``run`` charges the node's ``overhead_ns``. No ``drain_ns`` handling is
    implemented in this class — presumably the inherited worker applies it;
    TODO confirm against ``ComponentBase``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Delegate construction entirely to ``ComponentBase``."""
        super().__init__(node, ctx)

    def run(self, env, nbytes: int) -> Generator:
        """Wait for ``overhead_ns`` (default 0.0) read from ``node.attrs``.

        ``nbytes`` is accepted for interface compatibility and not used.
        """
        access_delay = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(access_delay)
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class SramComponent(ComponentBase):
    """Cube SRAM: terminal component that models SRAM access latency.

    Each incoming transaction is charged ``overhead_ns`` (from
    ``node.attrs``) plus the transaction's own ``drain_ns`` when positive,
    after which a ``ResponseMsg`` is sent back along the reversed path.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Delegate construction entirely to ``ComponentBase``."""
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for ``overhead_ns`` (default 0.0); ``nbytes`` is not used."""
        access_delay = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(access_delay)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Terminal worker: process, apply drain, send response."""
        while True:
            txn: Any = yield self._inbox.get()
            yield from self.run(env, txn.nbytes)

            # Transactions without a drain_ns attribute are treated as 0.0.
            drain_delay = getattr(txn, "drain_ns", 0.0)
            if drain_delay > 0:
                yield env.timeout(drain_delay)

            yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Create ResponseMsg and send on reverse path.

        When the reversed path has fewer than two hops, or the component
        has no context, the transaction is completed locally instead.
        """
        return_route = list(reversed(txn.path))

        if len(return_route) < 2 or not self.ctx:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # Second dotted segment of the node id carries the cube number,
        # e.g. "cube3" -> 3.
        id_segments = self.node.id.split(".")
        cube_id = int(id_segments[1].replace("cube", ""))

        request = txn.request
        response = ResponseMsg(
            correlation_id=request.correlation_id,
            request_id=request.request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        response_txn = Transaction(
            request=response, path=return_route, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        # return_route[0] is this node; the response goes out on the port
        # toward the next hop.
        next_hop = return_route[1]
        yield self.out_ports[next_hop].put(response_txn.advance())
|
||||||
@@ -1,3 +1,14 @@
|
|||||||
|
"""Data-parallel placement policy (ADR-0026: intra-device only).
|
||||||
|
|
||||||
|
``DPPolicy`` describes how a tensor is sharded *within a single SIP* across
|
||||||
|
that SIP's cubes and PEs. Crossing the SIP boundary is not a DPPolicy
|
||||||
|
concern: ADR-0024's ``torch.ahbm.set_device(rank)`` picks the SIP, and
|
||||||
|
Megatron-style TP (ADR-0027) expresses multi-SIP tensors when needed.
|
||||||
|
|
||||||
|
``ShardSpec`` is expressed in structural ``(sip, cube, pe)`` coordinates.
|
||||||
|
The former flat ``pe_index`` field/property is fully removed — callers
|
||||||
|
needing a flat integer key compute it explicitly at the call site.
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -7,25 +18,58 @@ from typing import Literal
|
|||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class DPPolicy:
|
class DPPolicy:
|
||||||
"""Three-level data-parallel policy: sip-level + cube-level + pe-level.
|
"""Intra-device (cube × PE) data-parallel policy.
|
||||||
|
|
||||||
Policies:
|
SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
|
||||||
|
(ADR-0024). For tensors that must cross SIP boundaries, use
|
||||||
|
Megatron-style parallel layers (ADR-0027). DPPolicy itself never
|
||||||
|
crosses a SIP boundary.
|
||||||
|
|
||||||
|
Policies (per axis):
|
||||||
- "replicate": full copy at each unit
|
- "replicate": full copy at each unit
|
||||||
- "column_wise": split K (column) axis across units
|
- "column_wise": split K (column) axis across units
|
||||||
- "row_wise": split M (row) axis across units
|
- "row_wise": split M (row) axis across units
|
||||||
|
|
||||||
Optional overrides (default None = use topology dimensions):
|
Optional overrides (``None`` = use topology dimensions):
|
||||||
- num_pes: override PEs per cube (e.g., 1 for single-PE test)
|
- num_pes: override PEs per cube
|
||||||
- num_cubes: override cubes per SIP (e.g., 1 for single-cube test)
|
- num_cubes: override cubes per SIP
|
||||||
- num_sips: override SIP count
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
|
||||||
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
num_pes: int | None = None
|
num_pes: int | None = None
|
||||||
num_cubes: int | None = None
|
num_cubes: int | None = None
|
||||||
num_sips: int | None = None
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ShardSpec:
    """Placement of one shard, addressed structurally (ADR-0026).

    A shard is identified by its ``(sip, cube, pe)`` coordinate rather
    than by a global flat ``pe_index`` (that field was removed). Code
    that needs a flat integer key locally — for example as a dict key —
    should derive it at the call site and keep it out of public APIs.
    """

    # Structural coordinate of the owning PE.
    sip: int
    cube: int
    pe: int
    # Byte offset and byte length of the shard.
    offset_bytes: int
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _LocalPeShard:
    """Internal result type of the PE-level resolvers (ADR-0026 D3).

    Carries a cube-local PE id together with the shard's byte range.
    ``resolve_dp_policy`` promotes each instance to a ``ShardSpec`` by
    attaching the full ``(sip, cube, pe)`` coordinate.
    """

    # PE identifier local to one cube.
    local_pe: int
    # Byte offset and byte length of the shard.
    offset_bytes: int
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
def _split_shape(
|
def _split_shape(
|
||||||
@@ -52,14 +96,13 @@ def resolve_dp_policy(
|
|||||||
itemsize: int,
|
itemsize: int,
|
||||||
num_pe: int,
|
num_pe: int,
|
||||||
num_cubes: int = 1,
|
num_cubes: int = 1,
|
||||||
num_sips: int = 1,
|
target_sip: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[ShardSpec]:
|
||||||
"""Resolve a DPPolicy into a list[ShardSpec] with three-level resolution.
|
"""Resolve a DPPolicy into a list[ShardSpec] on a single SIP.
|
||||||
|
|
||||||
SIP-level → cube-level → pe-level.
|
Two-level resolution (cube × PE) within ``target_sip``. Each returned
|
||||||
num_cubes is cubes per SIP (not total).
|
``ShardSpec`` carries ``sip=target_sip`` and cube/pe local to the SIP.
|
||||||
ShardSpec.pe_index uses flat indexing:
|
No SIP-level split — DPPolicy is intra-device only (ADR-0026).
|
||||||
sip_id * num_cubes * num_pe + cube_id * num_pe + pe_id
|
|
||||||
"""
|
"""
|
||||||
_PE_RESOLVERS = {
|
_PE_RESOLVERS = {
|
||||||
"replicate": replicate,
|
"replicate": replicate,
|
||||||
@@ -70,84 +113,61 @@ def resolve_dp_policy(
|
|||||||
if resolver is None:
|
if resolver is None:
|
||||||
raise ValueError(f"Unknown pe-level policy: {policy.pe}")
|
raise ValueError(f"Unknown pe-level policy: {policy.pe}")
|
||||||
|
|
||||||
cubes_per_sip = num_cubes
|
|
||||||
all_shards: list[ShardSpec] = []
|
all_shards: list[ShardSpec] = []
|
||||||
|
|
||||||
# Level 1: SIP
|
# Level 1: cube within SIP
|
||||||
sip_splits = _split_shape(policy.sip, shape, num_sips, itemsize)
|
cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
|
||||||
|
|
||||||
for sip_id, (sip_shape, sip_offset) in enumerate(sip_splits):
|
for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
|
||||||
# Level 2: Cube within SIP
|
# Level 2: PE within cube — resolver returns _LocalPeShard
|
||||||
cube_splits = _split_shape(policy.cube, sip_shape, cubes_per_sip, itemsize)
|
local_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
|
||||||
|
|
||||||
for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
|
for ls in local_shards:
|
||||||
# Level 3: PE within cube
|
all_shards.append(ShardSpec(
|
||||||
pe_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
|
sip=target_sip,
|
||||||
|
cube=cube_id,
|
||||||
for ps in pe_shards:
|
pe=ls.local_pe,
|
||||||
flat_idx = (
|
offset_bytes=cube_offset + ls.offset_bytes,
|
||||||
sip_id * cubes_per_sip * num_pe
|
nbytes=ls.nbytes,
|
||||||
+ cube_id * num_pe
|
))
|
||||||
+ ps.pe_index
|
|
||||||
)
|
|
||||||
all_shards.append(ShardSpec(
|
|
||||||
pe_index=flat_idx,
|
|
||||||
offset_bytes=sip_offset + cube_offset + ps.offset_bytes,
|
|
||||||
nbytes=ps.nbytes,
|
|
||||||
))
|
|
||||||
|
|
||||||
return all_shards
|
return all_shards
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class ShardSpec:
|
|
||||||
pe_index: int
|
|
||||||
offset_bytes: int
|
|
||||||
nbytes: int
|
|
||||||
|
|
||||||
|
|
||||||
def column_wise(
|
def column_wise(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""Split K axis into num_pe equal parts. Each PE gets (M, K/P)."""
|
"""Split K axis into num_pe equal parts. Each PE gets (M, K/P)."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
chunk_k = K // num_pe
|
chunk_k = K // num_pe
|
||||||
chunk_bytes = M * chunk_k * itemsize
|
chunk_bytes = M * chunk_k * itemsize
|
||||||
shards = []
|
return [
|
||||||
for i in range(num_pe):
|
_LocalPeShard(local_pe=i, offset_bytes=i * chunk_bytes, nbytes=chunk_bytes)
|
||||||
shards.append(ShardSpec(
|
for i in range(num_pe)
|
||||||
pe_index=i,
|
]
|
||||||
offset_bytes=i * chunk_bytes,
|
|
||||||
nbytes=chunk_bytes,
|
|
||||||
))
|
|
||||||
return shards
|
|
||||||
|
|
||||||
|
|
||||||
def row_wise(
|
def row_wise(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""Split M axis into num_pe equal parts. Each PE gets (M/P, K)."""
|
"""Split M axis into num_pe equal parts. Each PE gets (M/P, K)."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
chunk_m = M // num_pe
|
chunk_m = M // num_pe
|
||||||
chunk_bytes = chunk_m * K * itemsize
|
chunk_bytes = chunk_m * K * itemsize
|
||||||
shards = []
|
return [
|
||||||
for i in range(num_pe):
|
_LocalPeShard(local_pe=i, offset_bytes=i * chunk_bytes, nbytes=chunk_bytes)
|
||||||
shards.append(ShardSpec(
|
for i in range(num_pe)
|
||||||
pe_index=i,
|
]
|
||||||
offset_bytes=i * chunk_bytes,
|
|
||||||
nbytes=chunk_bytes,
|
|
||||||
))
|
|
||||||
return shards
|
|
||||||
|
|
||||||
|
|
||||||
def replicate(
|
def replicate(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""Full copy per PE. Each PE gets (M, K)."""
|
"""Full copy per PE. Each PE gets (M, K)."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
full_bytes = M * K * itemsize
|
full_bytes = M * K * itemsize
|
||||||
return [
|
return [
|
||||||
ShardSpec(pe_index=i, offset_bytes=0, nbytes=full_bytes)
|
_LocalPeShard(local_pe=i, offset_bytes=0, nbytes=full_bytes)
|
||||||
for i in range(num_pe)
|
for i in range(num_pe)
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -155,20 +175,20 @@ def replicate(
|
|||||||
def tiled_column_major(
|
def tiled_column_major(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
tile_m: int, tile_k: int,
|
tile_m: int, tile_k: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""2D tiling, column-major order (K axis first), round-robin across PEs."""
|
"""2D tiling, column-major order (K axis first), round-robin across PEs."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
tiles_m = ceil(M / tile_m)
|
tiles_m = ceil(M / tile_m)
|
||||||
tiles_k = ceil(K / tile_k)
|
tiles_k = ceil(K / tile_k)
|
||||||
tile_bytes = tile_m * tile_k * itemsize
|
tile_bytes = tile_m * tile_k * itemsize
|
||||||
row_bytes = K * itemsize
|
row_bytes = K * itemsize
|
||||||
shards = []
|
shards: list[_LocalPeShard] = []
|
||||||
idx = 0
|
idx = 0
|
||||||
for mi in range(tiles_m):
|
for mi in range(tiles_m):
|
||||||
for ki in range(tiles_k):
|
for ki in range(tiles_k):
|
||||||
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
||||||
shards.append(ShardSpec(
|
shards.append(_LocalPeShard(
|
||||||
pe_index=idx % num_pe,
|
local_pe=idx % num_pe,
|
||||||
offset_bytes=offset,
|
offset_bytes=offset,
|
||||||
nbytes=tile_bytes,
|
nbytes=tile_bytes,
|
||||||
))
|
))
|
||||||
@@ -179,20 +199,20 @@ def tiled_column_major(
|
|||||||
def tiled_row_major(
|
def tiled_row_major(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
tile_m: int, tile_k: int,
|
tile_m: int, tile_k: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""2D tiling, row-major order (M axis first), round-robin across PEs."""
|
"""2D tiling, row-major order (M axis first), round-robin across PEs."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
tiles_m = ceil(M / tile_m)
|
tiles_m = ceil(M / tile_m)
|
||||||
tiles_k = ceil(K / tile_k)
|
tiles_k = ceil(K / tile_k)
|
||||||
tile_bytes = tile_m * tile_k * itemsize
|
tile_bytes = tile_m * tile_k * itemsize
|
||||||
row_bytes = K * itemsize
|
row_bytes = K * itemsize
|
||||||
shards = []
|
shards: list[_LocalPeShard] = []
|
||||||
idx = 0
|
idx = 0
|
||||||
for ki in range(tiles_k):
|
for ki in range(tiles_k):
|
||||||
for mi in range(tiles_m):
|
for mi in range(tiles_m):
|
||||||
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
||||||
shards.append(ShardSpec(
|
shards.append(_LocalPeShard(
|
||||||
pe_index=idx % num_pe,
|
local_pe=idx % num_pe,
|
||||||
offset_bytes=offset,
|
offset_bytes=offset,
|
||||||
nbytes=tile_bytes,
|
nbytes=tile_bytes,
|
||||||
))
|
))
|
||||||
|
|||||||
@@ -29,11 +29,10 @@ def run_bench(
|
|||||||
correlation_id: str = "bench0",
|
correlation_id: str = "bench0",
|
||||||
completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
|
completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
|
||||||
) -> BenchResult:
|
) -> BenchResult:
|
||||||
"""
|
"""Minimal bench runner.
|
||||||
Minimal bench runner.
|
|
||||||
|
|
||||||
- topology: compiled topology object (opaque to runtime here)
|
- topology: compiled topology object (opaque to runtime here)
|
||||||
- bench_fn: callable that receives RuntimeContext and submits requests
|
- bench_fn: callable ``run(torch)`` receiving a RuntimeContext
|
||||||
- device: DeviceSelector ("all" or "sip:<N>")
|
- device: DeviceSelector ("all" or "sip:<N>")
|
||||||
- engine_factory: builds sim_engine for given topology & device
|
- engine_factory: builds sim_engine for given topology & device
|
||||||
- completion_policy: how to determine overall completion/result
|
- completion_policy: how to determine overall completion/result
|
||||||
@@ -48,7 +47,6 @@ def run_bench(
|
|||||||
)
|
)
|
||||||
|
|
||||||
bench_fn(ctx)
|
bench_fn(ctx)
|
||||||
|
|
||||||
ctx.wait_all()
|
ctx.wait_all()
|
||||||
|
|
||||||
collected_traces = ctx._traces or None
|
collected_traces = ctx._traces or None
|
||||||
@@ -62,6 +60,7 @@ def run_bench(
|
|||||||
correlation_id=correlation_id,
|
correlation_id=correlation_id,
|
||||||
trace=None,
|
trace=None,
|
||||||
traces=collected_traces,
|
traces=collected_traces,
|
||||||
|
engine=engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
if completion_policy == CompletionPolicy.LAST_SUBMITTED:
|
if completion_policy == CompletionPolicy.LAST_SUBMITTED:
|
||||||
@@ -69,7 +68,7 @@ def run_bench(
|
|||||||
completion, trace = engine.get_completion(last)
|
completion, trace = engine.get_completion(last)
|
||||||
return BenchResult(
|
return BenchResult(
|
||||||
completion=completion, correlation_id=correlation_id,
|
completion=completion, correlation_id=correlation_id,
|
||||||
trace=trace, traces=collected_traces,
|
trace=trace, traces=collected_traces, engine=engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
|
if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
|
||||||
@@ -80,11 +79,11 @@ def run_bench(
|
|||||||
if not c.ok:
|
if not c.ok:
|
||||||
return BenchResult(
|
return BenchResult(
|
||||||
completion=c, correlation_id=correlation_id,
|
completion=c, correlation_id=correlation_id,
|
||||||
trace=last_trace, traces=collected_traces,
|
trace=last_trace, traces=collected_traces, engine=engine,
|
||||||
)
|
)
|
||||||
return BenchResult(
|
return BenchResult(
|
||||||
completion=Completion(ok=True), correlation_id=correlation_id,
|
completion=Completion(ok=True), correlation_id=correlation_id,
|
||||||
trace=last_trace, traces=collected_traces,
|
trace=last_trace, traces=collected_traces, engine=engine,
|
||||||
)
|
)
|
||||||
|
|
||||||
# LAST_COMPLETED placeholder (needs engine support for timing). Fall back.
|
# LAST_COMPLETED placeholder (needs engine support for timing). Fall back.
|
||||||
@@ -92,5 +91,5 @@ def run_bench(
|
|||||||
completion, trace = engine.get_completion(last)
|
completion, trace = engine.get_completion(last)
|
||||||
return BenchResult(
|
return BenchResult(
|
||||||
completion=completion, correlation_id=correlation_id,
|
completion=completion, correlation_id=correlation_id,
|
||||||
trace=trace, traces=collected_traces,
|
trace=trace, traces=collected_traces, engine=engine,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -9,6 +9,92 @@ from kernbench.common.types import Completion, RequestHandle, SimEngine
|
|||||||
from .types import DeviceSelector
|
from .types import DeviceSelector
|
||||||
|
|
||||||
|
|
||||||
|
def _world_size_from_spec(spec: dict | None) -> int:
    """Derive world_size from topology spec: sips × cubes × pes_per_cube.

    Every level is optional. A missing key *or* an explicit ``None``
    (what an empty YAML key parses to) falls back to the default: one SIP,
    a 1×1 cube mesh, one PE per corner, and a corner count of at least one.

    Args:
        spec: Parsed topology mapping, or ``None``.

    Returns:
        ``sips * (w * h) * (pe_per_corner * max(len(corners), 1))``.
    """
    spec = spec or {}

    def _int_or_one(section: dict, key: str) -> int:
        # Only a missing/None value defaults to 1; an explicit 0 is kept.
        value = section.get(key)
        return 1 if value is None else int(value)

    # ``or {}`` keeps a None section from breaking the chained lookups.
    sips_cfg = (spec.get("system") or {}).get("sips") or {}
    mesh = (spec.get("sip") or {}).get("cube_mesh") or {}
    layout = (spec.get("cube") or {}).get("pe_layout") or {}

    sips = _int_or_one(sips_cfg, "count")
    cubes_per_sip = _int_or_one(mesh, "w") * _int_or_one(mesh, "h")
    corners = layout.get("corners") or []
    pes_per_cube = _int_or_one(layout, "pe_per_corner") * max(len(corners), 1)
    return sips * cubes_per_sip * pes_per_cube
|
||||||
|
|
||||||
|
|
||||||
|
def _numpy_to_dtype_str(np_dtype) -> str:
    """Map numpy dtype → kernbench dtype string used by Tensor.

    Args:
        np_dtype: Anything ``numpy.dtype()`` accepts (dtype instance, scalar
            type such as ``np.float32``, or a dtype string).

    Returns:
        One of ``"f16"``, ``"f32"``, ``"i8"``, ``"i16"``, ``"i32"``,
        ``"u8"``, ``"u16"``, ``"u32"``.

    Raises:
        ValueError: If the dtype is not one of the supported types.
    """
    import numpy as np

    names_by_dtype = {
        np.dtype(np.float16): "f16",
        np.dtype(np.float32): "f32",
        np.dtype(np.int8): "i8",
        np.dtype(np.int16): "i16",
        np.dtype(np.int32): "i32",
        np.dtype(np.uint8): "u8",
        np.dtype(np.uint16): "u16",
        np.dtype(np.uint32): "u32",
    }
    # Normalise the argument once and resolve it with a single hash lookup
    # instead of re-normalising and comparing on every table entry.
    name = names_by_dtype.get(np.dtype(np_dtype))
    if name is None:
        raise ValueError(f"unsupported numpy dtype: {np_dtype!r}")
    return name
|
||||||
|
|
||||||
|
|
||||||
|
# ADR-0027 D3: weak registry of the currently-active RuntimeContext so
# module-level helpers (e.g. ``kernbench.tp.parallel_state``) can resolve
# the ctx without threading it through every call.
import weakref as _weakref

# ``None`` until a context is registered; otherwise a weak reference, so the
# registry by itself never keeps a RuntimeContext alive.
_ACTIVE_CTX_REF: _weakref.ref | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_active_context():
    """Resolve the registered RuntimeContext, if any.

    Returns ``None`` when nothing has been registered; otherwise returns
    whatever the stored weak reference dereferences to (which is ``None``
    once the referent has been collected).
    """
    ctx_ref = _ACTIVE_CTX_REF
    return ctx_ref() if ctx_ref is not None else None
|
||||||
|
|
||||||
|
|
||||||
|
class _AhbmNamespace:
    """torch.ahbm — per-greenlet SIP device binding (ADR-0024 D10).

    Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. KernBench's
    backend is 'ahbm' (not CUDA), so this namespace avoids pretending to be
    a CUDA runtime.

    Bindings are held with weak keys: once a greenlet is garbage-collected
    its entry disappears, so the namespace never keeps finished worker
    greenlets alive.
    """

    def __init__(self) -> None:
        """Start with no device bound for any greenlet."""
        import weakref

        # Weak keys avoid pinning every greenlet that ever called
        # set_device() for the lifetime of this namespace. Lookups only
        # ever use the *current* (hence live) greenlet, so results are
        # unchanged.
        self._device_by_greenlet = weakref.WeakKeyDictionary()

    def set_device(self, device: int) -> None:
        """Bind ``device`` (coerced with ``int``) to the calling greenlet."""
        from greenlet import getcurrent

        self._device_by_greenlet[getcurrent()] = int(device)

    def current_device(self) -> int | None:
        """Return the calling greenlet's bound device, or ``None`` if unset."""
        from greenlet import getcurrent

        return self._device_by_greenlet.get(getcurrent())
|
||||||
|
|
||||||
|
|
||||||
|
class _AcceleratorNamespace:
|
||||||
|
"""torch.accelerator — device-agnostic alias (PyTorch 2.x style).
|
||||||
|
|
||||||
|
Wraps _AhbmNamespace. Bench code can pick either:
|
||||||
|
torch.ahbm.set_device(rank) # explicit backend
|
||||||
|
torch.accelerator.set_device_index(rank) # portable
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, ahbm: "_AhbmNamespace") -> None:
|
||||||
|
self._ahbm = ahbm
|
||||||
|
|
||||||
|
def set_device_index(self, device: int) -> None:
|
||||||
|
self._ahbm.set_device(device)
|
||||||
|
|
||||||
|
def current_device_index(self) -> int | None:
|
||||||
|
return self._ahbm.current_device()
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RuntimeContext:
|
class RuntimeContext:
|
||||||
engine: SimEngine
|
engine: SimEngine
|
||||||
@@ -18,17 +104,94 @@ class RuntimeContext:
|
|||||||
|
|
||||||
_handles: list[RequestHandle] = field(default_factory=list, init=False)
|
_handles: list[RequestHandle] = field(default_factory=list, init=False)
|
||||||
_completed: set[RequestHandle] = field(default_factory=set, init=False)
|
_completed: set[RequestHandle] = field(default_factory=set, init=False)
|
||||||
_allocators: dict[int, Any] = field(default_factory=dict, init=False)
|
# ADR-0027 D0.1: worker-deferred wait queue. When a worker greenlet
|
||||||
|
# calls ctx.wait(h), the handle is appended here and control yields to
|
||||||
|
# main. Main's scheduler drain consumes this list.
|
||||||
|
_pending_worker_waits: list[RequestHandle] = field(default_factory=list, init=False)
|
||||||
|
_allocators: dict[tuple[int, int, int], Any] = field(default_factory=dict, init=False)
|
||||||
_va_allocator: Any = field(default=None, init=False)
|
_va_allocator: Any = field(default=None, init=False)
|
||||||
_tensor_counter: int = field(default=0, init=False)
|
_tensor_counter: int = field(default=0, init=False)
|
||||||
_traces: list[dict] = field(default_factory=list, init=False)
|
_traces: list[dict] = field(default_factory=list, init=False)
|
||||||
_tensors: list[Any] = field(default_factory=list, init=False)
|
_tensors: list[Any] = field(default_factory=list, init=False)
|
||||||
|
distributed: Any = field(default=None, init=False) # DistributedContext for CCL benches
|
||||||
|
_ipcq_plan: dict = field(default_factory=dict, init=False) # ADR-0023 install plan
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
    """Attach the torch-style helper namespaces to this context."""
    from kernbench.runtime_api.distributed import DistributedContext

    # Eagerly attach a DistributedContext so bench code can do
    # ``dist = torch.distributed`` + ``dist.init_process_group(...)``
    # without needing a separate launcher to install it.
    distributed = DistributedContext()
    # Back-reference so AhbmCCLBackend can reach ctx.launch etc.
    distributed._ctx_ref = self
    self.distributed = distributed

    # ADR-0024 D10: torch.ahbm (KernBench-native) + torch.accelerator
    # (PyTorch 2.x portable) namespaces for per-greenlet device binding.
    # Both share the same underlying _AhbmNamespace instance.
    ahbm = _AhbmNamespace()
    self.ahbm = ahbm
    self.accelerator = _AcceleratorNamespace(ahbm)

    # ADR-0027 D1.3: torch.multiprocessing.spawn namespace.
    from kernbench.runtime_api.multiprocessing import _MultiprocessingNamespace

    self.multiprocessing = _MultiprocessingNamespace(self)
|
||||||
|
|
||||||
|
def install_ipcq(
    self,
    algorithm: str | None = None,
    ccl_yaml: str | None = None,
    world_size_override: int | None = None,
    rank_to_pe: list[tuple[int, int, int]] | None = None,
) -> dict:
    """Install IPCQ neighbor tables on all participating PEs (ADR-0023 D10).

    Loads ``ccl.yaml`` (or the path provided), resolves the chosen
    algorithm (or ``defaults.algorithm`` if None), and pushes per-PE
    IpcqInitMsg into every PE_IPCQ component via the engine.

    Args:
        algorithm: name of the algorithm in ccl.yaml (or use defaults).
        ccl_yaml: optional path to ccl.yaml.
        world_size_override: if set, replace the algorithm's world_size.
        rank_to_pe: optional list of ``(sip, cube, pe)`` tuples forwarded
            unchanged to the installer (presumably an explicit rank
            placement — confirm against ``kernbench.ccl.install``).

    Returns the install plan dict (rank → (sip,cube,pe), neighbor table).
    The plan and the resolved config are also kept on the context as
    ``_ipcq_plan`` and ``_ipcq_config``.
    """
    import importlib
    from kernbench.ccl.install import (
        install_ipcq as _install,
        load_ccl_config,
        resolve_algorithm_config,
    )

    merged = resolve_algorithm_config(load_ccl_config(ccl_yaml), algorithm)

    # world_size precedence: explicit override, then whatever ccl.yaml
    # resolved, then the value derived from topology.yaml (matches pytorch
    # DDP where env vars determine ranks, not the ccl config file).
    if world_size_override is not None:
        merged["world_size"] = world_size_override
    elif "world_size" not in merged:
        merged["world_size"] = _world_size_from_spec(self.spec)

    # The algorithm module is optional: a missing module is passed on to
    # the installer as ``None``.
    try:
        algo_module = importlib.import_module(merged["module"])
    except ModuleNotFoundError:
        algo_module = None

    plan = _install(
        self.engine,
        self.spec,
        merged,
        algo_module=algo_module,
        rank_to_pe=rank_to_pe,
    )
    self._ipcq_plan = plan
    self._ipcq_config = merged
    return plan
|
||||||
|
|
||||||
def __enter__(self):
    """Register this context as the active one and return it."""
    global _ACTIVE_CTX_REF
    # Held weakly so the registry never keeps the context alive on its own.
    registration = _weakref.ref(self)
    _ACTIVE_CTX_REF = registration
    return self
|
||||||
|
|
||||||
def __exit__(self, *exc):
    """Run ``cleanup()`` and deregister this context from the active registry.

    Returns ``False`` so an exception raised inside the ``with`` body is
    never suppressed.
    """
    global _ACTIVE_CTX_REF
    try:
        self.cleanup()
    finally:
        # Clear active-context registry if we are it. Done in ``finally`` so
        # a failing cleanup() cannot leave an exited context registered as
        # the active one.
        if _ACTIVE_CTX_REF is not None and _ACTIVE_CTX_REF() is self:
            _ACTIVE_CTX_REF = None
    return False
|
||||||
|
|
||||||
def submit(self, request: Any) -> RequestHandle:
|
def submit(self, request: Any) -> RequestHandle:
|
||||||
@@ -43,10 +206,24 @@ class RuntimeContext:
|
|||||||
return handle in self._completed
|
return handle in self._completed
|
||||||
|
|
||||||
def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
|
def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
|
||||||
|
# ADR-0027 D0.2: fast-path for already-completed handles (avoid
|
||||||
|
# redundant worker→main→worker round-trip).
|
||||||
if handle in self._completed:
|
if handle in self._completed:
|
||||||
completion, trace = self.engine.get_completion(handle)
|
completion, trace = self.engine.get_completion(handle)
|
||||||
return completion
|
return completion
|
||||||
|
|
||||||
|
# ADR-0027 D0.2: if called from a worker greenlet (parent is main,
|
||||||
|
# not dead), defer the wait to the main scheduler — enqueue and
|
||||||
|
# yield. Main drains env.run, then switches back. On resume the
|
||||||
|
# handle must be in _completed (D0.3 resume invariant).
|
||||||
|
from greenlet import getcurrent
|
||||||
|
g = getcurrent()
|
||||||
|
if g.parent is not None and not g.parent.dead:
|
||||||
|
self._pending_worker_waits.append(handle)
|
||||||
|
g.parent.switch()
|
||||||
|
# Resume: main drained. Fall through to completion/trace assembly.
|
||||||
|
|
||||||
|
# Main context (or single-driver): drive engine directly.
|
||||||
wait_fn = getattr(self.engine, "wait", None)
|
wait_fn = getattr(self.engine, "wait", None)
|
||||||
if wait_fn is not None:
|
if wait_fn is not None:
|
||||||
wait_fn(handle) # type: ignore[misc]
|
wait_fn(handle) # type: ignore[misc]
|
||||||
@@ -135,12 +312,7 @@ class RuntimeContext:
|
|||||||
# Return PA space
|
# Return PA space
|
||||||
if self._allocators:
|
if self._allocators:
|
||||||
for shard in handle.shards:
|
for shard in handle.shards:
|
||||||
flat_idx = (
|
alloc = self._allocators.get((shard.sip, shard.cube, shard.pe))
|
||||||
shard.sip * self._num_cubes * self._pes_per_cube
|
|
||||||
+ shard.cube * self._pes_per_cube
|
|
||||||
+ shard.pe
|
|
||||||
)
|
|
||||||
alloc = self._allocators.get(flat_idx)
|
|
||||||
if alloc is not None:
|
if alloc is not None:
|
||||||
from kernbench.policy.address.phyaddr import PhysAddr
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
alloc.free_hbm(PhysAddr.decode(shard.pa), shard.nbytes)
|
alloc.free_hbm(PhysAddr.decode(shard.pa), shard.nbytes)
|
||||||
@@ -204,17 +376,15 @@ class RuntimeContext:
|
|||||||
tcm_scheduler_reserved_bytes=4 * (1 << 20),
|
tcm_scheduler_reserved_bytes=4 * (1 << 20),
|
||||||
sram_bytes_per_cube=32 * (1 << 20),
|
sram_bytes_per_cube=32 * (1 << 20),
|
||||||
)
|
)
|
||||||
# Create allocators scoped to target SIP(s) only
|
# Create allocators scoped to target SIP(s) only.
|
||||||
# Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id
|
# ADR-0026 D5: dict key is the structural (sip, cube, pe) tuple.
|
||||||
self._pes_per_cube = pes_per_cube
|
self._pes_per_cube = pes_per_cube
|
||||||
self._num_cubes = cubes_per_sip
|
self._num_cubes = cubes_per_sip
|
||||||
self._num_sips = sip_count
|
self._num_sips = sip_count
|
||||||
cubes_x_pes = cubes_per_sip * pes_per_cube
|
|
||||||
for sip_id in sip_range:
|
for sip_id in sip_range:
|
||||||
for cube_id in range(cubes_per_sip):
|
for cube_id in range(cubes_per_sip):
|
||||||
for pe_id in range(pes_per_cube):
|
for pe_id in range(pes_per_cube):
|
||||||
flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id
|
self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
|
||||||
self._allocators[flat_idx] = PEMemAllocator(
|
|
||||||
rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
|
rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -258,6 +428,24 @@ class RuntimeContext:
|
|||||||
"""Allocate a tensor in HBM without initialization (like torch.empty)."""
|
"""Allocate a tensor in HBM without initialization (like torch.empty)."""
|
||||||
return self._create_tensor(shape, dtype, name, pattern=None, dp=dp)
|
return self._create_tensor(shape, dtype, name, pattern=None, dp=dp)
|
||||||
|
|
||||||
|
def from_numpy(self, arr: Any):
    """Wrap a numpy array in a host-side tensor (mirrors ``torch.from_numpy``).

    The returned tensor is NOT deployed to any PE — it lives in an
    in-memory host staging buffer. Use ``target.copy_(host_tensor)`` to
    scatter its contents into a sharded, deployed tensor.

    Raises:
        ValueError: if the array's dtype has no kernbench dtype string.
    """
    import numpy as np
    from kernbench.runtime_api.tensor import Tensor

    contiguous = np.ascontiguousarray(arr)
    host_tensor = Tensor(
        shape=tuple(contiguous.shape),
        dtype=_numpy_to_dtype_str(contiguous.dtype),
        name="host",
    )
    # The staging buffer is the contiguous array itself.
    host_tensor._host_buffer = contiguous
    # The engine may not expose a memory store; None is stored in that case.
    host_tensor._memory_store = getattr(self.engine, "_memory_store", None)
    return host_tensor
|
||||||
|
|
||||||
def _create_tensor(
|
def _create_tensor(
|
||||||
self,
|
self,
|
||||||
shape: tuple[int, ...],
|
shape: tuple[int, ...],
|
||||||
@@ -283,16 +471,23 @@ class RuntimeContext:
|
|||||||
# DPPolicy overrides take precedence over topology dimensions
|
# DPPolicy overrides take precedence over topology dimensions
|
||||||
eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
|
eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
|
||||||
eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
|
eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
|
||||||
eff_num_sips = dp.num_sips if dp.num_sips is not None else self._num_sips
|
# ADR-0026 D4: resolve structural coords directly at resolve time.
|
||||||
|
# ``torch.ahbm.set_device(rank)`` (ADR-0024 D10) selects the target
|
||||||
|
# SIP; if unset, fall back to SIP 0 for single-driver compatibility.
|
||||||
|
current_sip = (
|
||||||
|
self.ahbm.current_device() if hasattr(self, "ahbm") else None
|
||||||
|
)
|
||||||
|
if current_sip is None:
|
||||||
|
current_sip = 0
|
||||||
placement = resolve_dp_policy(
|
placement = resolve_dp_policy(
|
||||||
dp, shape=shape_2d, itemsize=itemsize,
|
dp, shape=shape_2d, itemsize=itemsize,
|
||||||
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
||||||
num_sips=eff_num_sips,
|
target_sip=int(current_sip),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Infer target_pe from placement using local (within-cube) PE IDs.
|
# Infer target_pe from placement using local (within-cube) PE IDs.
|
||||||
# This ensures M_CPU only fans out to PEs that own shards, not all PEs.
|
# This ensures M_CPU only fans out to PEs that own shards, not all PEs.
|
||||||
local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement})
|
local_pe_ids = sorted({s.pe for s in placement})
|
||||||
if len(local_pe_ids) == 1:
|
if len(local_pe_ids) == 1:
|
||||||
target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
|
target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
|
||||||
elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
|
elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
|
||||||
@@ -314,6 +509,7 @@ class RuntimeContext:
|
|||||||
t._handle = handle
|
t._handle = handle
|
||||||
import weakref
|
import weakref
|
||||||
t._ctx_ref = weakref.ref(self)
|
t._ctx_ref = weakref.ref(self)
|
||||||
|
t._memory_store = getattr(self.engine, "_memory_store", None)
|
||||||
self._tensors.append(weakref.ref(t))
|
self._tensors.append(weakref.ref(t))
|
||||||
|
|
||||||
# Install VA→PA mappings via fabric MmuMapMsg
|
# Install VA→PA mappings via fabric MmuMapMsg
|
||||||
@@ -389,6 +585,21 @@ class RuntimeContext:
|
|||||||
"sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
|
"sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
|
||||||
"nbytes": shard.nbytes,
|
"nbytes": shard.nbytes,
|
||||||
})
|
})
|
||||||
|
# ADR-0027: also populate MemoryStore at VA keys so kernels
|
||||||
|
# reading via VA (the common ``tl.load`` path) see the init
|
||||||
|
# data. Phase 1 MemoryWriteMsg writes via PA; kernels read via
|
||||||
|
# VA; Phase 2 DataExecutor reads via the addresses captured in
|
||||||
|
# op_log (VA for tl.load). Without this, zero-init tensors are
|
||||||
|
# invisible to kernels in Phase 2.
|
||||||
|
store = getattr(self.engine, "_memory_store", None)
|
||||||
|
if store is not None and pattern == "zero" and handle.va_base:
|
||||||
|
import numpy as np
|
||||||
|
from kernbench.runtime_api.tensor import _numpy_dtype
|
||||||
|
np_dtype = _numpy_dtype(dtype)
|
||||||
|
for shard in handle.shards:
|
||||||
|
count = shard.nbytes // itemsize
|
||||||
|
addr = handle.va_base + shard.offset_bytes
|
||||||
|
store.write("hbm", addr, np.zeros(count, dtype=np_dtype))
|
||||||
|
|
||||||
return t
|
return t
|
||||||
|
|
||||||
@@ -397,6 +608,7 @@ class RuntimeContext:
|
|||||||
kernel_name: str,
|
kernel_name: str,
|
||||||
kernel_fn: Any,
|
kernel_fn: Any,
|
||||||
*args: Any,
|
*args: Any,
|
||||||
|
_defer_wait: bool = False,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> RequestHandle:
|
) -> RequestHandle:
|
||||||
"""Register and launch a kernel (like a fused torch op).
|
"""Register and launch a kernel (like a fused torch op).
|
||||||
@@ -406,6 +618,11 @@ class RuntimeContext:
|
|||||||
|
|
||||||
Creates per-SIP KernelLaunchMsg with local va_base per tensor
|
Creates per-SIP KernelLaunchMsg with local va_base per tensor
|
||||||
(like host driver sending per-rank launch commands).
|
(like host driver sending per-rank launch commands).
|
||||||
|
|
||||||
|
When ``_defer_wait=True`` (ADR-0024 D7), returns the list of
|
||||||
|
``(handle, sip_id, meta)`` tuples instead of waiting. Caller is
|
||||||
|
responsible for waiting — used by collective ops to yield between
|
||||||
|
submit and wait so all sibling ranks can submit first.
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
@@ -417,13 +634,12 @@ class RuntimeContext:
|
|||||||
TensorArgShard,
|
TensorArgShard,
|
||||||
)
|
)
|
||||||
from kernbench.runtime_api.tensor import Tensor
|
from kernbench.runtime_api.tensor import Tensor
|
||||||
from kernbench.triton_emu.registry import register_kernel
|
from kernbench.triton_emu.registry import _kernels, register_kernel
|
||||||
|
|
||||||
# Register kernel (idempotent)
|
# Register kernel (idempotent overwrite — last call wins).
|
||||||
try:
|
# Tests can re-register the same kernel_name with a different
|
||||||
register_kernel(kernel_name, kernel_fn)
|
# function; the user's most recent launch must use the latest fn.
|
||||||
except ValueError:
|
_kernels[kernel_name] = kernel_fn
|
||||||
pass
|
|
||||||
|
|
||||||
# Collect tensors and scalars
|
# Collect tensors and scalars
|
||||||
tensor_args: list[Tensor] = []
|
tensor_args: list[Tensor] = []
|
||||||
@@ -482,11 +698,8 @@ class RuntimeContext:
|
|||||||
dp = t._dp_metadata.dp_policy if t._dp_metadata else None
|
dp = t._dp_metadata.dp_policy if t._dp_metadata else None
|
||||||
if dp is None:
|
if dp is None:
|
||||||
return t.shape
|
return t.shape
|
||||||
if dp.sip != "replicate":
|
# ADR-0026: DPPolicy no longer crosses SIP boundaries; cube + PE
|
||||||
if dp.sip == "column_wise":
|
# are the only axes that shrink the local shape.
|
||||||
K = K // self._num_sips
|
|
||||||
elif dp.sip == "row_wise":
|
|
||||||
M = M // self._num_sips
|
|
||||||
if dp.cube != "replicate":
|
if dp.cube != "replicate":
|
||||||
if dp.cube == "column_wise":
|
if dp.cube == "column_wise":
|
||||||
K = K // self._num_cubes
|
K = K // self._num_cubes
|
||||||
@@ -505,6 +718,7 @@ class RuntimeContext:
|
|||||||
|
|
||||||
# Per-SIP kernel launch: each SIP gets TensorArgs with local va_base
|
# Per-SIP kernel launch: each SIP gets TensorArgs with local va_base
|
||||||
last_handle = None
|
last_handle = None
|
||||||
|
_pending_handles: list[tuple[Any, int]] = []
|
||||||
for sip_id in sorted(sip_set):
|
for sip_id in sorted(sip_set):
|
||||||
sip_kernel_args: list = []
|
sip_kernel_args: list = []
|
||||||
sip_cube_set: set[int] = set()
|
sip_cube_set: set[int] = set()
|
||||||
@@ -565,10 +779,29 @@ class RuntimeContext:
|
|||||||
target_cubes=target_cubes,
|
target_cubes=target_cubes,
|
||||||
target_pe=target_pe,
|
target_pe=target_pe,
|
||||||
))
|
))
|
||||||
|
# Defer wait until all SIPs are submitted (multi-SIP CCL needs
|
||||||
|
# all participating PEs to be live concurrently — waiting
|
||||||
|
# per-SIP would deadlock when ranks span SIP boundaries).
|
||||||
|
_pending_handles.append((h, sip_id))
|
||||||
|
last_handle = h
|
||||||
|
|
||||||
|
if _defer_wait:
|
||||||
|
# ADR-0024 D7: return the pending-list so the caller can yield
|
||||||
|
# between submit and drain. Used by collective ops that need
|
||||||
|
# all sibling ranks to submit before any rank waits.
|
||||||
|
return [
|
||||||
|
(h, sip_id, {
|
||||||
|
"phase": "kernel", "name": kernel_name,
|
||||||
|
"sip": sip_id, "target_pe": target_pe,
|
||||||
|
})
|
||||||
|
for h, sip_id in _pending_handles
|
||||||
|
]
|
||||||
|
|
||||||
|
# Drain pending handles now that every SIP has a launch posted.
|
||||||
|
for h, sip_id in _pending_handles:
|
||||||
self.wait(h, _meta={
|
self.wait(h, _meta={
|
||||||
"phase": "kernel", "name": kernel_name,
|
"phase": "kernel", "name": kernel_name,
|
||||||
"sip": sip_id, "target_pe": target_pe,
|
"sip": sip_id, "target_pe": target_pe,
|
||||||
})
|
})
|
||||||
last_handle = h
|
|
||||||
|
|
||||||
return last_handle
|
return last_handle
|
||||||
|
|||||||
@@ -0,0 +1,242 @@
|
|||||||
|
"""PyTorch-compatible distributed communication shim (ADR-0023 D11).
|
||||||
|
|
||||||
|
Provides a ``torch.distributed``-like API whose public surface matches
|
||||||
|
real PyTorch so that bench code looks identical to a DDP training script.
|
||||||
|
|
||||||
|
Only the ``ahbm`` backend is implemented. It:
|
||||||
|
|
||||||
|
1. Reads ``ccl.yaml`` to decide which collective algorithm to run.
|
||||||
|
2. Derives world_size from the algorithm entry, the defaults section, or
|
||||||
|
from the topology spec (``system.sips.count`` — one rank per SIP, ADR-0024 D1).
|
||||||
|
3. At ``init_process_group`` time, eagerly installs the IPCQ neighbor
|
||||||
|
table once (one-time comm setup — mirrors NCCL communicator creation).
|
||||||
|
4. On each ``all_reduce(tensor)`` call, reads per-shard metadata from the
|
||||||
|
tensor handle and dispatches ``torch.launch`` with the registered
|
||||||
|
kernel. The kernel performs inter-PE ring/tree/mesh CCL via IPCQ,
|
||||||
|
and Phase 2 DataExecutor replays math + copies from op_log so
|
||||||
|
MemoryStore is correct when ``all_reduce`` returns.
|
||||||
|
|
||||||
|
Host bench code uses only real-PyTorch names:
|
||||||
|
dist.init_process_group, dist.is_initialized, dist.get_world_size,
|
||||||
|
dist.get_rank, dist.get_backend, dist.all_reduce, dist.barrier
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
import math
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class AhbmCCLBackend:
    """Ahbm CCL backend — drives kernel-level collectives via IPCQ.

    Construction loads ``ccl.yaml``, imports the configured algorithm
    module, resolves ``world_size`` and the SIP / cube topology parameters
    from the context's spec and, when the context exposes an engine, runs
    the inter-cube multi-SIP SFR configuration. ``all_reduce`` then launches
    the algorithm's kernel through ``ctx.launch``.
    """

    def __init__(self, torch_ctx: Any) -> None:
        """Build the backend for ``torch_ctx``.

        Args:
            torch_ctx: owning runtime context; ``spec``, ``launch`` and
                ``wait`` are used, ``engine`` is optional.
        """
        from kernbench.ccl.install import (
            load_ccl_config,
            resolve_algorithm_config,
        )

        self.ctx = torch_ctx
        self._cfg_all = load_ccl_config()
        self._merged = resolve_algorithm_config(self._cfg_all)
        self._algo_module = importlib.import_module(self._merged["module"])
        self._world_size = self._resolve_world_size()
        # Filled by all_reduce() in multi-greenlet mode; drained elsewhere.
        self._pending_collective_handles: list = []
        # Set by DistributedContext.init_process_group after construction.
        self._dist_ctx: Any = None

        spec = self.ctx.spec or {}
        sips_cfg = spec.get("system", {}).get("sips", {})
        self._n_sips = int(sips_cfg.get("count", 1))
        self._sip_topo = str(sips_cfg.get("topology", "ring_1d"))
        cm = spec.get("sip", {}).get("cube_mesh", {})
        self._cube_w = int(cm.get("w", 4))
        self._cube_h = int(cm.get("h", 4))

        # Resolve SIP topology dims for the kernel. Algorithm modules may
        # publish TOPO_NAME_TO_KIND; unknown names and modules without the
        # table both map to kind 0.
        topo_map = getattr(self._algo_module, "TOPO_NAME_TO_KIND", None)
        if topo_map is not None:
            self._sip_topo_kind = topo_map.get(self._sip_topo, 0)
        else:
            self._sip_topo_kind = 0
        if self._sip_topo == "ring_1d":
            self._sip_topo_w, self._sip_topo_h = 0, 0
        else:
            # NOTE(review): assumes a square SIP grid; a non-square SIP
            # count rounds to the nearest side length — confirm intended.
            side = int(round(math.sqrt(self._n_sips)))
            self._sip_topo_w, self._sip_topo_h = side, side

        # IPCQ install: wire all pe0s across all cubes and SIPs
        engine = getattr(self.ctx, "engine", None)
        if engine is not None:
            from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip

            configure_sfr_intercube_multisip(engine, spec, self._merged)

    def _resolve_world_size(self) -> int:
        """Derive world_size (priority: algorithm override > defaults > topology).

        ADR-0024 D1: topology fallback is SIP count. Each rank represents one
        SIP (TP dimension). Intra-SIP parallelism is expressed via DPPolicy
        inside each worker and is independent of world_size.
        Explicit ``ccl.yaml`` override still respected — legacy "rank = flat
        PE index" tests use this path.
        """
        if "world_size" in self._merged:
            return int(self._merged["world_size"])
        # ``defaults:`` may be present but empty in the YAML (parsed as
        # None); treat that the same as a missing section.
        defaults = self._cfg_all.get("defaults") or {}
        if "world_size" in defaults:
            return int(defaults["world_size"])
        spec = self.ctx.spec or {}
        return int(spec.get("system", {}).get("sips", {}).get("count", 1))

    @property
    def world_size(self) -> int:
        """World size resolved at construction time."""
        return self._world_size

    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
        """Dispatch the configured CCL algorithm as a single kernel launch.

        The launch is submitted with ``_defer_wait=True``. When called from
        a worker greenlet (live parent) the pending handles are queued on
        the backend and control switches to the parent; otherwise the
        handles are waited on inline.

        Raises:
            NotImplementedError: if ``op != "sum"`` (current kernels only
                implement add reduction).
            RuntimeError: if the tensor has no handle (not deployed) or its
                handle carries no shards.
        """
        if op != "sum":
            raise NotImplementedError(f"all_reduce op={op!r} not supported")
        if tensor._handle is None:
            raise RuntimeError(
                f"Tensor '{tensor.name}' is not deployed (call torch.zeros "
                "with a DPPolicy first)"
            )
        shards = tensor._handle.shards
        if not shards:
            raise RuntimeError(
                f"all_reduce tensor '{tensor.name}' has no shards"
            )
        # Element count is taken from the first shard.
        n_elem = shards[0].nbytes // tensor.itemsize
        kernel_fn = self._algo_module.kernel
        kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)

        # Resolve sip_rank from the current greenlet's bound rank. The same
        # greenlet object is reused below for the scheduling decision.
        from greenlet import getcurrent

        g = getcurrent()
        dist_ctx = getattr(self, "_dist_ctx", None)
        if dist_ctx is not None:
            sip_rank = int(dist_ctx._rank_by_greenlet.get(g, 0))
        else:
            sip_rank = 0

        extra_args = (
            sip_rank,
            self._sip_topo_kind,
            self._sip_topo_w,
            self._sip_topo_h,
        )

        pending = self.ctx.launch(
            self._merged["algorithm"], kernel_fn, tensor,
            *kernel_args, *extra_args,
            _defer_wait=True,
        )
        if g.parent is not None and not g.parent.dead:
            # Multi-greenlet mode: hand pending to the backend-level queue so
            # the main scheduler drains. Worker just yields.
            self._pending_collective_handles.extend(pending)
            g.parent.switch()
            # On resume, all pending handles have been drained by main.
        else:
            # Single-driver (no bench scheduler): drain inline.
            for h, _sip_id, meta in pending:
                self.ctx.wait(h, _meta=meta)

    def barrier(self) -> None:
        """No-op kept so ``dist.barrier()`` is callable (pytorch-compat surface)."""
        # Single-driver model → no cross-process sync needed.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class DistributedContext:
    """torch.distributed-compat facade.

    Public surface matches real PyTorch so bench code reads identically
    to a DDP training script. kernbench runs as one Python process:
    ``get_rank()`` returns the rank bound to the calling greenlet via
    ``_bind_rank`` (0 for unbound greenlets, ADR-0024 D9) and
    ``get_world_size()`` returns the world size resolved by the backend.
    Every accessor except ``is_initialized`` raises ``RuntimeError`` until
    ``init_process_group`` has been called.
    """

    def __init__(self) -> None:
        self._backend: AhbmCCLBackend | None = None
        # Owning runtime context; RuntimeContext.__post_init__ assigns it
        # right after construction. None means "not bound yet".
        self._ctx_ref: Any = None
        # ADR-0024 D9: greenlet-local rank registry. Bench launcher calls
        # _bind_rank(g, rank) when spawning workers; get_rank() resolves the
        # current greenlet to its rank. Unbound greenlets fall back to 0 for
        # single-driver test compat.
        self._rank_by_greenlet: dict = {}

    def init_process_group(
        self,
        backend: str = "ahbm",
        world_size: int | None = None,
        rank: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Create the default process group.

        ``world_size`` and ``rank`` are accepted for API parity with
        ``torch.distributed.init_process_group`` but ignored — the ahbm
        backend derives both from ``ccl.yaml`` + topology automatically
        (like reading ``RANK``/``WORLD_SIZE`` env vars in real DDP).
        Extra keyword arguments are ignored as well.

        Raises:
            ValueError: if ``backend`` is anything other than ``"ahbm"``.
            RuntimeError: if this object is not bound to a RuntimeContext.
        """
        if backend != "ahbm":
            raise ValueError(
                f"Unsupported backend '{backend}'. Only 'ahbm' is supported."
            )
        ctx = getattr(self, "_ctx_ref", None)
        if ctx is None:
            raise RuntimeError(
                "DistributedContext not bound to a RuntimeContext"
            )
        self._backend = AhbmCCLBackend(torch_ctx=ctx)
        # Lets the backend look up the calling greenlet's rank.
        self._backend._dist_ctx = self

    def is_initialized(self) -> bool:
        """Return True once ``init_process_group`` has created the backend."""
        return self._backend is not None

    def get_world_size(self) -> int:
        """Return the backend's resolved world size."""
        self._ensure_initialized()
        return self._backend.world_size

    def get_rank(self) -> int:
        """Return the rank bound to the current greenlet (default 0).

        ADR-0024 D9: workers spawned by the bench launcher each get a rank
        registered via ``_bind_rank``. Callers outside any bound greenlet
        fall back to rank 0 for single-driver test compat.
        """
        self._ensure_initialized()
        from greenlet import getcurrent

        g = getcurrent()
        return int(self._rank_by_greenlet.get(g, 0))

    def _bind_rank(self, g: Any, rank: int) -> None:
        """Bind a greenlet to a rank so ``get_rank()`` returns it (ADR-0024 D9)."""
        self._rank_by_greenlet[g] = int(rank)

    def get_backend(self) -> str:
        """Return the backend name; always ``"ahbm"``."""
        self._ensure_initialized()
        return "ahbm"

    def all_reduce(self, tensor: Any, op: str = "sum") -> None:
        """Delegate to the backend's ``all_reduce``."""
        self._ensure_initialized()
        self._backend.all_reduce(tensor, op=op)

    def barrier(self) -> None:
        """Delegate to the backend's ``barrier``."""
        self._ensure_initialized()
        self._backend.barrier()

    def _ensure_initialized(self) -> None:
        """Raise ``RuntimeError`` unless ``init_process_group`` has run."""
        if self._backend is None:
            raise RuntimeError(
                "Default process group has not been initialized. "
                "Call init_process_group(backend='ahbm') first."
            )
|
||||||
@@ -152,3 +152,30 @@ class MmuUnmapMsg:
|
|||||||
target_cubes: tuple[int, ...] | Literal["all"] = "all"
|
target_cubes: tuple[int, ...] | Literal["all"] = "all"
|
||||||
target_pe: int | Literal["all"] = "all"
|
target_pe: int | Literal["all"] = "all"
|
||||||
msg_type: Literal["mmu_unmap"] = "mmu_unmap"
|
msg_type: Literal["mmu_unmap"] = "mmu_unmap"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IpcqInitMsg:
    """Sideband fan-out message that installs IPCQ neighbor tables (ADR-0023 D10/D12).

    The backend issues this at ``init_process_group`` time to install
    per-PE IPCQ neighbor tables. Each entry covers one direction (N/S/E/W)
    and carries the peer's IpcqEndpoint plus this PE's own rx_buffer base
    and a pre-wired SimPy Store for credit return fast path (D9).

    Routing is similar to MmuMapMsg.
    """

    # Identification.
    correlation_id: str
    request_id: str

    # Routing targets; "all" addresses every SIP / cube / PE.
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | tuple[int, ...] | Literal["all"] = "all"

    # Logically tuple[IpcqInitEntry, ...]. Left as a tuple of plain objects
    # to avoid a runtime import cycle (IpcqInitEntry lives in
    # kernbench.common.ipcq_types).
    entries: tuple = ()

    backpressure_mode: str = "sleep"  # "poll" | "sleep"
    buffer_kind: str = "tcm"  # "tcm" | "hbm" | "sram"
    credit_size_bytes: int = 16

    # Message discriminator.
    msg_type: Literal["ipcq_init"] = "ipcq_init"
|
||||||
|
|||||||
@@ -0,0 +1,152 @@
|
|||||||
|
"""``torch.multiprocessing.spawn``-compatible namespace (ADR-0027 D1).
|
||||||
|
|
||||||
|
Real-PyTorch API *signature* parity only — execution model is a cooperative
|
||||||
|
greenlet scheduler in a single Python process (D1.0). Non-goals: process
|
||||||
|
isolation, independent address space, failure isolation, OS-level scheduler
|
||||||
|
fairness, mp.Queue/Lock.
|
||||||
|
|
||||||
|
Attached to ``RuntimeContext`` as ``ctx.multiprocessing`` in
|
||||||
|
``__post_init__`` (D1.3).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
|
||||||
|
class SpawnException(RuntimeError):
    """Error raised by ``_MultiprocessingNamespace.spawn`` when a worker fails.

    ``errors`` maps rank -> exception for root-cause ranks only, i.e. the
    rank(s) whose body raised. Siblings that are torn down with
    ``throw(SystemExit)`` during cleanup are not recorded, because
    SystemExit is not caught by the ``except Exception`` in the entry
    wrapper.
    """

    def __init__(self, errors: dict[int, Exception]):
        self.errors = errors
        failed_ranks = sorted(errors.keys())
        detail = ""
        # Only the first recorded failure (dict iteration order) is spelled
        # out; an empty mapping yields the bare prefix.
        for rank, exc in errors.items():
            detail = f": rank {rank} raised {exc!r}"
            break
        super().__init__(f"spawn failed on ranks {failed_ranks}" + detail)
|
||||||
|
|
||||||
|
|
||||||
|
def _drain_pending(ctx: Any) -> None:
    """Empty the worker-wait and collective-pending queues (D0.4/D0.5).

    Intended to run in the main context. The outer loop repeats until both
    queues are empty at the same moment. Per the design notes this is safe
    today because a main-context ``ctx.wait`` never re-enqueues (D0.5
    non-reentrance invariant), and stays safe if draining ever adds
    sub-handles (SimPy causality bounds the depth).
    """
    dist_ns = getattr(ctx, "distributed", None)
    backend = getattr(dist_ns, "_backend", None) if dist_ns else None

    def _collective_queue():
        # None when there is no backend or the backend has no such queue.
        if backend is None:
            return None
        return getattr(backend, "_pending_collective_handles", None)

    while True:
        if not ctx._pending_worker_waits and not _collective_queue():
            return

        # (a) Worker-driven waits (D0.1), served FIFO.
        while ctx._pending_worker_waits:
            handle = ctx._pending_worker_waits.pop(0)
            if handle in ctx._completed:
                continue
            engine_wait = getattr(ctx.engine, "wait", None)
            if engine_wait is not None:
                engine_wait(handle)
            # Record completion so the fast-path in ctx.wait short-circuits
            # on the return leg.
            ctx._completed.add(handle)

        # (b) Collective backend queue (ADR-0024 D7 + D0.4-(2)).
        queue = _collective_queue()
        if queue is None:
            continue
        while queue:
            handle, _sip_id, meta = queue.pop(0)
            # Main context: ctx.wait drives the engine directly and does
            # NOT re-enqueue (D0.5 invariant).
            ctx.wait(handle, _meta=meta)
|
||||||
|
|
||||||
|
|
||||||
|
class _MultiprocessingNamespace:
    """Facade with a ``torch.multiprocessing``-style API, bound to one RuntimeContext."""

    def __init__(self, ctx: Any) -> None:
        self._ctx = ctx

    def spawn(
        self,
        fn: Callable,
        args: tuple = (),
        nprocs: int = 1,
        join: bool = True,
    ) -> None:
        """Run ``fn(rank, *args)`` in ``nprocs`` cooperative worker greenlets.

        Signature follows ``torch.multiprocessing.spawn`` (without
        ``daemon``). Workers are switched into round-robin (D0.4); pending
        queues are drained after every round, until every worker is dead.

        Raises:
            SpawnException: when an ``Exception`` escapes a worker or the
                drain step; chained from the original exception.
        """
        from greenlet import greenlet

        ctx = self._ctx
        dist = ctx.distributed
        errors: dict[int, Exception] = {}

        def _make_entry(rank: int):
            # One closure per rank; root-cause failures are recorded before
            # the exception propagates to the scheduler.
            def _entry() -> None:
                try:
                    fn(rank, *args)
                except Exception as exc:
                    errors[rank] = exc
                    raise

            return _entry

        can_bind = dist is not None and hasattr(dist, "_bind_rank")
        workers: list = []
        for rank in range(nprocs):
            worker = greenlet(_make_entry(rank))
            if can_bind:
                dist._bind_rank(worker, rank)
            workers.append(worker)

        try:
            alive = [w for w in workers if not w.dead]
            while alive:
                for w in alive:
                    if w.dead:
                        continue
                    w.switch()
                _drain_pending(ctx)
                alive = [w for w in workers if not w.dead]
        except Exception as outer:
            # D0.4-(4) sibling cleanup. Abort live greenlets, clear state.
            self._abort_and_reset(ctx, dist, workers)
            raise SpawnException(errors) from outer
        # join=True: we already waited for all workers above.

    @staticmethod
    def _abort_and_reset(ctx: Any, dist: Any, workers: list) -> None:
        """Kill surviving workers and clear barrier / pending-queue state."""
        for w in workers:
            if w.dead:
                continue
            try:
                w.throw(SystemExit)
            except BaseException:
                # SystemExit inherits BaseException; greenlet.throw
                # re-raises in the caller if the target doesn't catch it.
                # Silent, since we are already in cleanup.
                pass

        backend = getattr(dist, "_backend", None)
        if backend is not None:
            if hasattr(backend, "_barrier") and hasattr(backend._barrier, "reset"):
                try:
                    backend._barrier.reset()
                except Exception:
                    pass
            pending = getattr(backend, "_pending_collective_handles", None)
            if pending is not None:
                pending.clear()
        ctx._pending_worker_waits.clear()
|
||||||
@@ -5,6 +5,8 @@ import weakref
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from kernbench.policy.address.allocator import PEMemAllocator
|
from kernbench.policy.address.allocator import PEMemAllocator
|
||||||
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
|
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
|
||||||
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
|
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
|
||||||
@@ -50,13 +52,78 @@ def dtype_itemsize(dtype: str) -> int:
|
|||||||
return _DTYPE_ITEMSIZE[dtype]
|
return _DTYPE_ITEMSIZE[dtype]
|
||||||
|
|
||||||
|
|
||||||
|
# Dtype-name aliases -> numpy scalar type.
# NOTE(review): "bf16" maps to np.float16, presumably as a same-width
# stand-in; confirm that callers do not rely on bfloat16 value semantics.
_NUMPY_DTYPE = {
    alias: np_type
    for np_type, aliases in (
        (np.float16, ("f16", "fp16", "float16")),
        (np.float32, ("f32", "fp32", "float32")),
        (np.float16, ("bf16",)),
        (np.int8, ("i8", "int8")),
        (np.int16, ("i16", "int16")),
        (np.int32, ("i32", "int32")),
    )
    for alias in aliases
}


def _numpy_dtype(dtype: str) -> np.dtype:
    """Return the numpy dtype for a dtype name; unknown names give float16."""
    np_type = _NUMPY_DTYPE.get(dtype)
    if np_type is None:
        np_type = np.float16
    return np.dtype(np_type)
|
||||||
|
|
||||||
|
|
||||||
|
# ADR-0027 T5.g: closed-set registry of host-read barrier entry-points.
|
||||||
|
# Any new Tensor API with host-observable read semantics must be added here
|
||||||
|
# AND implement the barrier call. Code review + this registry keep the set
|
||||||
|
# consistent (Python introspection-based auto-detection is a non-goal).
|
||||||
|
# Note on ``copy_``: the source read is barriered via ``source.numpy()``.
|
||||||
|
# A target-side write barrier was specified in an earlier revision of
|
||||||
|
# ADR-0027 D0.5 but is intentionally not applied (global-pending target
|
||||||
|
# barrier can prematurely drain cross-rank collectives → deadlock).
|
||||||
|
_HOST_READ_BARRIERS: frozenset[str] = frozenset((
    "numpy",
    "data",
    "__getitem__",
    "__repr__",
    # copy_: source side is barriered through source.numpy(); the target
    # side is not barriered.
    "copy_",
))
|
||||||
|
|
||||||
|
|
||||||
|
def _host_read_barrier(tensor: "Tensor") -> None:
    """ADR-0027 D0.5: yield to the parent greenlet so pending worker waits drain.

    The switch happens only when the tensor's context has a non-empty
    ``_pending_worker_waits`` queue AND the caller runs inside a greenlet
    with a live parent (a worker). Otherwise this is a no-op: no context,
    an empty queue (fast-path, avoids needless context switches), or the
    main context.

    ``backend._pending_collective_handles`` is deliberately NOT consulted.
    Per the design notes, collective handles are a cross-rank protocol that
    must only be drained at scheduler synchronisation points; a collective
    already yields inside its own call, so values are materialised once it
    returns. Checking them here would let one rank's read request a
    premature drain of a cross-rank operation and deadlock.
    """
    ctx_ref = tensor._ctx_ref
    ctx = ctx_ref() if ctx_ref is not None else None
    if ctx is None:
        return
    if not getattr(ctx, "_pending_worker_waits", None):
        return  # fast-path: nothing queued

    from greenlet import getcurrent

    parent = getcurrent().parent
    if parent is None or parent.dead:
        return  # main context: caller drains directly when needed
    parent.switch()
|
||||||
|
|
||||||
|
|
||||||
def deploy_tensor(
|
def deploy_tensor(
|
||||||
*,
|
*,
|
||||||
name: str,
|
name: str,
|
||||||
shape: tuple[int, ...],
|
shape: tuple[int, ...],
|
||||||
dtype: str,
|
dtype: str,
|
||||||
placement: list[ShardSpec],
|
placement: list[ShardSpec],
|
||||||
allocators: dict[int, PEMemAllocator],
|
allocators: dict[tuple[int, int, int], PEMemAllocator],
|
||||||
mem_kind: Literal["hbm", "tcm"] = "hbm",
|
mem_kind: Literal["hbm", "tcm"] = "hbm",
|
||||||
va_allocator=None,
|
va_allocator=None,
|
||||||
) -> TensorHandle:
|
) -> TensorHandle:
|
||||||
@@ -70,15 +137,15 @@ def deploy_tensor(
|
|||||||
|
|
||||||
shards: list[TensorShard] = []
|
shards: list[TensorShard] = []
|
||||||
for spec in placement:
|
for spec in placement:
|
||||||
alloc = allocators[spec.pe_index]
|
alloc = allocators[(spec.sip, spec.cube, spec.pe)]
|
||||||
if mem_kind == "hbm":
|
if mem_kind == "hbm":
|
||||||
pa = alloc.alloc_hbm(spec.nbytes)
|
pa = alloc.alloc_hbm(spec.nbytes)
|
||||||
else:
|
else:
|
||||||
pa = alloc.alloc_tcm(spec.nbytes)
|
pa = alloc.alloc_tcm(spec.nbytes)
|
||||||
shards.append(TensorShard(
|
shards.append(TensorShard(
|
||||||
sip=alloc._sip_id,
|
sip=spec.sip,
|
||||||
cube=alloc._cube_id,
|
cube=spec.cube,
|
||||||
pe=alloc._pe_id,
|
pe=spec.pe,
|
||||||
pa=pa.encode(),
|
pa=pa.encode(),
|
||||||
nbytes=spec.nbytes,
|
nbytes=spec.nbytes,
|
||||||
offset_bytes=spec.offset_bytes,
|
offset_bytes=spec.offset_bytes,
|
||||||
@@ -129,6 +196,12 @@ class Tensor:
|
|||||||
self._dp_metadata: DPMetadata | None = None
|
self._dp_metadata: DPMetadata | None = None
|
||||||
self._handle: TensorHandle | None = None
|
self._handle: TensorHandle | None = None
|
||||||
self._ctx_ref: weakref.ref | None = None # set by RuntimeContext
|
self._ctx_ref: weakref.ref | None = None # set by RuntimeContext
|
||||||
|
self._memory_store = None # set by RuntimeContext when enable_data=True
|
||||||
|
# Host-side staging buffer for torch.from_numpy() results. A tensor
|
||||||
|
# with a non-None _host_buffer is NOT deployed to any PE — it lives
|
||||||
|
# only on the host. Use `target.copy_(host_tensor)` to scatter the
|
||||||
|
# data into a deployed, sharded target tensor.
|
||||||
|
self._host_buffer: np.ndarray | None = None
|
||||||
|
|
||||||
def __del__(self) -> None:
|
def __del__(self) -> None:
|
||||||
if self._ctx_ref is None or self._handle is None:
|
if self._ctx_ref is None or self._handle is None:
|
||||||
@@ -137,6 +210,222 @@ class Tensor:
|
|||||||
if ctx is not None:
|
if ctx is not None:
|
||||||
ctx._free_tensor(self)
|
ctx._free_tensor(self)
|
||||||
|
|
||||||
|
# ── Indexing (shard-aligned slices) ────────────────────────────
|
||||||
|
|
||||||
|
def _resolve_shard_index(self, key) -> tuple[int, int | None]:
    """Map a numpy-style index key to (flat_start_elem, flat_stop_elem).

    One index per dimension is required. Every leading dimension must be
    an integer; the last may be an integer or a step-1 slice. The returned
    half-open range is in element units of the row-major flat layout, so
    the leading integer indices contribute their row-major offset.

    Raises:
        RuntimeError: the tensor has no handle (not deployed).
        IndexError: wrong number of indices, a 0-d tensor, or an integer
            index outside its axis (negative indices count from the end).
        NotImplementedError: non-integer leading index, slice step != 1,
            or an unsupported type for the last index.
    """
    if self._handle is None:
        raise RuntimeError(f"Tensor '{self.name}' is not deployed")
    shape = tuple(self.shape)
    ndim = len(shape)
    if not isinstance(key, tuple):
        key = (key,)
    if len(key) != ndim:
        raise IndexError(
            f"expected {ndim} indices, got {len(key)}"
        )
    if ndim == 0:
        raise IndexError("cannot index a 0-d tensor")

    def _normalise(idx: int, axis: int) -> int:
        # numpy-style: negative indices wrap once, anything else is an error.
        extent = shape[axis]
        pos = idx + extent if idx < 0 else idx
        if not 0 <= pos < extent:
            raise IndexError(
                f"index {idx} is out of bounds for axis {axis} "
                f"with size {extent}"
            )
        return pos

    # Row-major offset of the selected row/plane. Without it every row
    # would resolve to the first row's element range.
    base = 0
    for axis, k in enumerate(key[:-1]):
        if not isinstance(k, int):
            raise NotImplementedError(
                "only integer indices are supported for leading dims"
            )
        base += _normalise(k, axis) * math.prod(shape[axis + 1:])

    last = key[-1]
    if isinstance(last, int):
        # Single element
        pos = _normalise(last, ndim - 1)
        return (base + pos, base + pos + 1)
    if isinstance(last, slice):
        start, stop, step = last.indices(shape[-1])
        if step != 1:
            raise NotImplementedError("step != 1 not supported")
        # An inverted slice selects nothing; never report a negative length.
        stop = max(stop, start)
        return (base + start, base + stop)
    raise NotImplementedError(f"unsupported index type: {type(last)}")
|
||||||
|
|
||||||
|
def _shard_for_range(self, start_elem: int, stop_elem: int) -> TensorShard:
    """Find the first shard whose byte range contains [start_elem, stop_elem).

    Element indices are scaled by ``self.itemsize`` and compared against
    each shard's ``offset_bytes`` .. ``offset_bytes + nbytes`` window.

    Raises:
        NotImplementedError: no single shard contains the whole range.
    """
    width = self.itemsize
    lo_byte = start_elem * width
    hi_byte = stop_elem * width
    covering = (
        candidate
        for candidate in self._handle.shards
        if candidate.offset_bytes <= lo_byte
        and hi_byte <= candidate.offset_bytes + candidate.nbytes
    )
    match = next(covering, None)
    if match is not None:
        return match
    raise NotImplementedError(
        f"slice [{start_elem}:{stop_elem}] spans multiple shards "
        "(only shard-aligned slices are supported)"
    )
|
||||||
|
|
||||||
|
def __getitem__(self, key):
    """Read a shard-aligned slice and return it as a 1-D numpy array.

    Covers the shard-aligned subset of ``torch.Tensor.__getitem__``.
    ADR-0027 D0.5: the host-read barrier runs first. Zeros are returned
    when no memory store is attached or the store has no entry for the
    shard (``KeyError``).
    """
    _host_read_barrier(self)
    first, last = self._resolve_shard_index(key)
    owner = self._shard_for_range(first, last)
    length = last - first
    if self._memory_store is None:
        return np.zeros(length, dtype=_numpy_dtype(self.dtype))

    elem_size = self.itemsize
    # Element offset of the requested range inside the owning shard.
    offset = (first * elem_size - owner.offset_bytes) // elem_size
    try:
        raw = self._memory_store.read("hbm", self._shard_store_addr(owner))
        shard_flat = np.asarray(raw, dtype=_numpy_dtype(self.dtype)).reshape(-1)
        return shard_flat[offset : offset + length]
    except KeyError:
        return np.zeros(length, dtype=_numpy_dtype(self.dtype))
|
||||||
|
|
||||||
|
def __setitem__(self, key, value):
    """Write a shard-aligned slice (read-modify-write of the owning shard).

    Covers the shard-aligned subset of ``torch.Tensor.__setitem__``.
    ``value`` may be a Python ``int``/``float`` (broadcast over the slice)
    or anything ``np.asarray`` accepts.

    Raises:
        RuntimeError: the tensor has no handle or no memory store.
    """
    if self._handle is None or self._memory_store is None:
        raise RuntimeError(
            f"Tensor '{self.name}' must be deployed before assignment"
        )
    first, last = self._resolve_shard_index(key)
    owner = self._shard_for_range(first, last)
    np_dtype = _numpy_dtype(self.dtype)
    elem_size = self.itemsize
    length = last - first
    offset = (first * elem_size - owner.offset_bytes) // elem_size
    window = slice(offset, offset + length)
    capacity = owner.nbytes // elem_size
    store_key = self._shard_store_addr(owner)

    # Start from the shard's current contents, or zeros if never written.
    try:
        current = self._memory_store.read("hbm", store_key)
        buf = np.array(current, dtype=np_dtype).reshape(-1).copy()
    except KeyError:
        buf = np.zeros(capacity, dtype=np_dtype)

    if isinstance(value, (int, float)):
        buf[window] = np_dtype.type(value)
    else:
        incoming = np.asarray(value, dtype=np_dtype).reshape(-1)
        buf[window] = incoming[:length]

    self._memory_store.write("hbm", store_key, buf)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
    """Summarise name/shape/dtype, plus mean and norm when data is readable."""
    head = f"tensor(name={self.name}, shape={self.shape}, dtype={self.dtype}"
    if self._memory_store is None or self._handle is None:
        return head + ", data=N/A (placeholder)" + ")"
    # ADR-0027 D0.5: barrier on the data-containing repr path.
    _host_read_barrier(self)
    values = self.data
    stats = (
        f", mean={float(values.mean()):.4g}, "
        f"norm={float(np.linalg.norm(values)):.4g}"
    )
    return head + stats + ")"
|
||||||
|
|
||||||
|
@property
def data(self) -> np.ndarray:
    """Full-shape numpy view of the tensor's contents; alias of ``numpy()``.

    All shards are gathered into one array. With ``enable_data=True`` the
    real values come back; otherwise a zeros placeholder, like an
    uninitialized tensor.
    """
    gathered = self.numpy()
    return gathered
|
||||||
|
|
||||||
|
def _shard_store_addr(self, shard: TensorShard) -> int:
    """Return the MemoryStore key under which a shard's data is kept.

    With a truthy ``va_base`` on the handle the key is
    ``va_base + shard.offset_bytes``; otherwise it is ``shard.pa``.
    Per the design notes, kernels address tensors by VA (translated to PA
    by PE_DMA's MMU when a mapping exists), so tensor I/O uses the same VA
    key; the PA fallback serves the VA-less mode of some legacy paths.
    """
    handle = self._handle
    if not handle:
        return shard.pa
    va_base = handle.va_base
    if not va_base:
        return shard.pa
    return va_base + shard.offset_bytes
|
||||||
|
|
||||||
|
def numpy(self) -> np.ndarray:
    """Gather every shard into one full-shape numpy array.

    Counterpart of ``torch.Tensor.numpy()``. Each shard is placed into a
    flat buffer at the element range given by its ``offset_bytes`` /
    ``nbytes``; shards the store has no entry for stay zero.

    ADR-0027 D0.5: calls ``_host_read_barrier`` first, so pending worker
    waits get a chance to drain before the read.

    Returns:
        A copy of the host buffer for host-side tensors, zeros of
        ``self.shape`` when not deployed or no memory store is attached,
        otherwise the gathered array reshaped to ``self.shape``.
    """
    _host_read_barrier(self)
    np_dtype = _numpy_dtype(self.dtype)

    # Host-side tensor (created via torch.from_numpy) has no shards.
    host_buf = self._host_buffer
    if host_buf is not None:
        return host_buf.copy()

    if self._handle is None or self._memory_store is None:
        return np.zeros(self.shape, dtype=np_dtype)

    gathered = np.zeros(math.prod(self.shape), dtype=np_dtype)
    for shard in self._handle.shards:
        first = shard.offset_bytes // self.itemsize
        length = shard.nbytes // self.itemsize
        try:
            raw = self._memory_store.read("hbm", self._shard_store_addr(shard))
        except KeyError:
            continue  # shard never written: leave zeros
        shard_flat = np.asarray(raw, dtype=np_dtype).reshape(-1)
        gathered[first : first + length] = shard_flat[:length]
    return gathered.reshape(self.shape)
|
||||||
|
|
||||||
|
def copy_(self, source: "Tensor") -> "Tensor":
    """Copy ``source`` into this tensor in place and return ``self``.

    Counterpart of ``torch.Tensor.copy_()``. The source is materialised
    with ``source.numpy()`` (a host tensor yields its ndarray, a deployed
    tensor is gathered), cast to this tensor's dtype, flattened, and then
    scattered: each shard receives the element range given by its
    ``offset_bytes`` / ``nbytes``.

    ADR-0027 D0.5: the source-side read barrier fires inside
    ``source.numpy()``. No target-side write barrier is applied; per the
    design notes it would need cross-rank coordination whenever other
    ranks have pending collectives.

    Raises:
        RuntimeError: this tensor has no handle or no memory store.
        ValueError: ``source.shape`` differs from ``self.shape``.
    """
    if self._handle is None or self._memory_store is None:
        raise RuntimeError(
            f"Tensor '{self.name}' must be deployed before copy_()"
        )
    if source.shape != self.shape:
        raise ValueError(
            f"copy_ shape mismatch: self={self.shape} source={source.shape}"
        )
    target_dtype = _numpy_dtype(self.dtype)
    converted = source.numpy().astype(target_dtype, copy=False)
    flat_src = np.ascontiguousarray(converted).reshape(-1)
    for shard in self._handle.shards:
        first = shard.offset_bytes // self.itemsize
        length = shard.nbytes // self.itemsize
        # Each shard gets its own copy, never a view of the source buffer.
        chunk = flat_src[first : first + length].copy()
        self._memory_store.write("hbm", self._shard_store_addr(shard), chunk)
    return self
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def itemsize(self) -> int:
|
def itemsize(self) -> int:
|
||||||
return dtype_itemsize(self.dtype)
|
return dtype_itemsize(self.dtype)
|
||||||
@@ -170,7 +459,8 @@ class Tensor:
|
|||||||
) -> Tensor:
|
) -> Tensor:
|
||||||
"""Set DP placement metadata (like torch.Tensor.to())."""
|
"""Set DP placement metadata (like torch.Tensor.to())."""
|
||||||
if placement is None:
|
if placement is None:
|
||||||
placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)]
|
placement = [ShardSpec(sip=0, cube=0, pe=0,
|
||||||
|
offset_bytes=0, nbytes=self.nbytes)]
|
||||||
self._dp_metadata = DPMetadata(
|
self._dp_metadata = DPMetadata(
|
||||||
placement=placement, dp_policy=dp_policy,
|
placement=placement, dp_policy=dp_policy,
|
||||||
sip=sip, cube=cube, target_pe=target_pe,
|
sip=sip, cube=cube, target_pe=target_pe,
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ class BenchResult:
|
|||||||
correlation_id: str
|
correlation_id: str
|
||||||
trace: Trace | None = None
|
trace: Trace | None = None
|
||||||
traces: list[dict] | None = None
|
traces: list[dict] | None = None
|
||||||
|
engine: object | None = None # GraphEngine ref for Phase 2 data access
|
||||||
|
|
||||||
def summary_text(self) -> str:
|
def summary_text(self) -> str:
|
||||||
if self.completion.ok:
|
if self.completion.ok:
|
||||||
|
|||||||
@@ -0,0 +1,249 @@
|
|||||||
|
"""DataExecutor: Phase 2 op_log-based data execution (ADR-0020 D6).
|
||||||
|
|
||||||
|
Executes GEMM/Math operations from the op_log using numpy.
|
||||||
|
Memory ops are skipped (already handled in Phase 1 via MemoryStore).
|
||||||
|
Same-timestamp independent ops can be batched for efficiency.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.sim_engine.memory_store import MemoryStore, _resolve_dtype
|
||||||
|
from kernbench.sim_engine.op_log import OpRecord
|
||||||
|
|
||||||
|
|
||||||
|
class DataExecutor:
    """Phase 2 executor: replay op_log with actual numpy computation.

    Args:
        op_log: list of OpRecords from Phase 1.
        store: MemoryStore snapshot from Phase 1 (contains tensor data).
    """

    def __init__(self, op_log: list[OpRecord], store: MemoryStore) -> None:
        self._op_log = op_log
        self.store = store

    # Ordering priority within the same t_start: memory copies must run
    # before math/gemm so that slot data is populated before a consumer
    # PE's math op reads it. With 0-ns PE_MATH overhead and tight SimPy
    # scheduling, ipcq_copy and math ops from different PEs can collide
    # at the exact same t_start.
    _KIND_ORDER = {"memory": 0, "gemm": 1, "math": 2, "unknown": 3}

    def run(self) -> None:
        """Execute all ops in op_log order.

        Primary sort: t_start (ascending).
        Secondary sort: op_kind priority, memory (ipcq_copy/dma_write)
        before gemm before math. This ensures IPCQ slot data arrives
        before a consumer PE's math op tries to read it, even when both
        share the same SimPy timestamp. Kinds missing from the priority
        table sort last. ``dma_read`` memory records are skipped.
        """
        ops = sorted(
            self._op_log,
            key=lambda r: (r.t_start, self._KIND_ORDER.get(r.op_kind, 3)),
        )
        for op in ops:
            if op.op_kind != "memory" or op.op_name != "dma_read":
                self._execute_op(op)

    def _execute_op(self, op: OpRecord) -> None:
        """Dispatch one record by ``op_kind``; unrecognised kinds are ignored."""
        if op.op_kind == "memory":
            self._execute_memory(op)
        elif op.op_kind == "gemm":
            self._execute_gemm(op)
        elif op.op_kind == "math":
            self._execute_math(op)

    def _execute_memory(self, op: OpRecord) -> None:
        """Replay memory copy ops in Phase 2 (ADR-0020 + ADR-0023).

        - dma_read: no-op (handle already references HBM source).
        - dma_write: copy (src_space, src_addr) -> (dst_space, dst_addr).
          Required because Phase 2 may have just produced new data at the
          source addr (e.g. PE_MATH scratch output).
        - ipcq_copy: copy across PEs, sender's source -> receiver's slot.
          Required because the source may be a Phase 2 math output, and
          a downstream math op on the receiver reads from the slot.

        Legacy entries without src/dst metadata are silently skipped, as
        are copies whose source is absent from the store (``KeyError``).
        """
        p = op.params
        if op.op_name == "dma_write" or op.op_name == "ipcq_copy":
            src_space = p.get("src_space")
            src_addr = p.get("src_addr")
            dst_space = p.get("dst_space")
            dst_addr = p.get("dst_addr")
            if (src_space is None or src_addr is None
                    or dst_space is None or dst_addr is None):
                return
            # Prefer the Phase-1-time snapshot (captured at record_end /
            # outbound) so we don't read from a source that has since been
            # mutated by another op. Fall back to MemoryStore for sources
            # that had no Phase 1 data (e.g. math scratch outputs that
            # only get populated by Phase 2's math replay).
            data = p.get("snapshot")
            if data is None:
                try:
                    data = self.store.read(
                        src_space, src_addr,
                        shape=p.get("shape"), dtype=p.get("dtype"),
                    )
                except KeyError:
                    return
            self.store.write(dst_space, dst_addr, data)

    def _execute_gemm(self, op: OpRecord) -> None:
        """Execute GEMM: out = a @ b.

        Records without ``src_a_addr`` (composite records) are skipped.
        Operands are cast to ``dtype_acc`` (default "f32") for the matmul
        and the result is cast to ``dtype_out`` before being written.
        """
        p = op.params
        if "src_a_addr" not in p:
            return  # composite record without full params
        default_space = p.get("addr_space", "tcm")
        # ADR-0027: per-operand + output spaces (fall back to single space
        # for legacy records without explicit space keys).
        src_a_space = p.get("src_a_space", default_space)
        src_b_space = p.get("src_b_space", default_space)
        dst_space = p.get("dst_space", default_space)
        dtype_in = p.get("dtype_in", "f16")
        dtype_out = p.get("dtype_out", dtype_in)

        a = self.store.read(src_a_space, p["src_a_addr"],
                            shape=p.get("shape_a"), dtype=dtype_in)
        b = self.store.read(src_b_space, p["src_b_addr"],
                            shape=p.get("shape_b"), dtype=dtype_in)

        # Compute in higher precision if specified
        dtype_acc = p.get("dtype_acc", "f32")
        a_f = a.astype(_resolve_dtype(dtype_acc))
        b_f = b.astype(_resolve_dtype(dtype_acc))
        result = np.matmul(a_f, b_f).astype(_resolve_dtype(dtype_out))

        self.store.write(dst_space, p["dst_addr"], result)

    def _execute_math(self, op: OpRecord) -> None:
        """Execute math op: unary, binary, or reduction.

        Each input comes from its snapshot when one was recorded, else
        from the store. Nothing is written when ``_compute_math`` returns
        ``None``.
        """
        p = op.params
        math_op = p.get("op", op.op_name)
        dtype = p.get("dtype", "f32")
        input_addrs = p.get("input_addrs", [])
        # Pad missing shapes with None (as the other per-input lists are
        # padded) so the zip below does not silently drop every input when
        # a record carries addresses or snapshots but no shapes.
        input_shapes = p.get("input_shapes") or [None] * len(input_addrs)
        # Per-input space/dtype (ADR-0023 CCL accumulation): math ops can
        # mix inputs from different MemoryStore spaces (e.g. acc in "hbm",
        # recv slot in "tcm"). Fall back to legacy single-space mode when
        # the per-input lists are absent.
        input_spaces = p.get("input_spaces") or [p.get("addr_space", "tcm")] * len(input_addrs)
        input_dtypes = p.get("input_dtypes") or [dtype] * len(input_addrs)
        # Per-input data snapshots (ADR-0020 D6): captured at op_log
        # record time. Phase 1 has correct values for slot/HBM addrs at
        # that moment, which lets Phase 2 sidestep the slot-wraparound
        # races where a later round overwrites a slot before this op
        # runs in t_start order.
        snapshots = p.get("input_snapshots") or [None] * len(input_addrs)
        dst_space = p.get("dst_space", p.get("addr_space", "tcm"))

        inputs = []
        for addr, shape, space, idtype, snap in zip(
            input_addrs, input_shapes, input_spaces, input_dtypes, snapshots
        ):
            if snap is not None:
                inputs.append(snap)
            else:
                inputs.append(self.store.read(space, addr, shape=shape, dtype=idtype))

        result = _compute_math(math_op, inputs, p.get("axis"))
        if result is not None:
            self.store.write(dst_space, p["dst_addr"], result)

    def verify(self, expected: dict[tuple[str, int], np.ndarray],
               rtol: float = 1e-3, atol: float = 1e-3) -> dict[str, bool]:
        """Compare MemoryStore contents against expected tensors.

        Integer data must match exactly (``np.array_equal``); everything
        else is compared with ``np.allclose``. A key missing from the
        store, or whose stored shape cannot be broadcast against the
        expectation, is reported as failed rather than raising.

        Args:
            expected: {(space, addr): expected_ndarray}
            rtol, atol: tolerance for floating-point comparison.

        Returns:
            {key_str: passed} dict, with keys formatted ``"<space>:0x<addr>"``.
        """
        results = {}
        for (space, addr), exp in expected.items():
            key = f"{space}:0x{addr:x}"
            try:
                actual = self.store.read(space, addr)
            except KeyError:
                results[key] = False
                continue
            if np.issubdtype(actual.dtype, np.integer):
                results[key] = bool(np.array_equal(actual, exp))
                continue
            try:
                results[key] = bool(np.allclose(actual, exp, rtol=rtol, atol=atol))
            except ValueError:
                # np.allclose raises on non-broadcastable shapes; that is
                # a failed comparison, not a reason to abort verification.
                results[key] = False
        return results
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_math(op: str, inputs: list[np.ndarray], axis: int | None) -> np.ndarray | None:
|
||||||
|
"""Execute a math operation on numpy arrays."""
|
||||||
|
if not inputs:
|
||||||
|
return None
|
||||||
|
|
||||||
|
x = inputs[0]
|
||||||
|
|
||||||
|
# Unary
|
||||||
|
if op == "exp":
|
||||||
|
return np.exp(x)
|
||||||
|
if op == "log":
|
||||||
|
return np.log(x)
|
||||||
|
if op == "sqrt":
|
||||||
|
return np.sqrt(x)
|
||||||
|
if op == "abs":
|
||||||
|
return np.abs(x)
|
||||||
|
if op == "sigmoid":
|
||||||
|
return 1.0 / (1.0 + np.exp(-x))
|
||||||
|
if op == "cos":
|
||||||
|
return np.cos(x)
|
||||||
|
if op == "sin":
|
||||||
|
return np.sin(x)
|
||||||
|
|
||||||
|
# Reduction
|
||||||
|
if op == "sum":
|
||||||
|
return np.sum(x, axis=axis, keepdims=True)
|
||||||
|
if op == "max":
|
||||||
|
return np.max(x, axis=axis, keepdims=True)
|
||||||
|
if op == "min":
|
||||||
|
return np.min(x, axis=axis, keepdims=True)
|
||||||
|
|
||||||
|
# Softmax (numerically stable)
|
||||||
|
if op == "softmax":
|
||||||
|
ax = axis if axis is not None else -1
|
||||||
|
x_max = np.max(x, axis=ax, keepdims=True)
|
||||||
|
e = np.exp(x - x_max)
|
||||||
|
s = np.sum(e, axis=ax, keepdims=True)
|
||||||
|
return e / s
|
||||||
|
|
||||||
|
# Binary
|
||||||
|
if len(inputs) >= 2:
|
||||||
|
y = inputs[1]
|
||||||
|
if op == "add":
|
||||||
|
return x + y
|
||||||
|
if op == "sub":
|
||||||
|
return x - y
|
||||||
|
if op == "mul":
|
||||||
|
return x * y
|
||||||
|
if op == "div":
|
||||||
|
return x / y
|
||||||
|
if op == "maximum":
|
||||||
|
return np.maximum(x, y)
|
||||||
|
if op == "minimum":
|
||||||
|
return np.minimum(x, y)
|
||||||
|
|
||||||
|
# Ternary
|
||||||
|
if len(inputs) >= 3:
|
||||||
|
if op == "where":
|
||||||
|
return np.where(inputs[0], inputs[1], inputs[2])
|
||||||
|
if op == "fma":
|
||||||
|
return inputs[0] * inputs[1] + inputs[2]
|
||||||
|
if op == "clamp":
|
||||||
|
return np.minimum(np.maximum(inputs[0], inputs[1]), inputs[2])
|
||||||
|
|
||||||
|
return None
|
||||||
@@ -31,6 +31,7 @@ class GraphEngine:
|
|||||||
graph: TopologyGraph,
|
graph: TopologyGraph,
|
||||||
*,
|
*,
|
||||||
component_overrides: dict[str, type[ComponentBase]] | None = None,
|
component_overrides: dict[str, type[ComponentBase]] | None = None,
|
||||||
|
enable_data: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._env = simpy.Environment()
|
self._env = simpy.Environment()
|
||||||
self._resolver = AddressResolver(graph)
|
self._resolver = AddressResolver(graph)
|
||||||
@@ -44,6 +45,19 @@ class GraphEngine:
|
|||||||
self._events: dict[str, simpy.Event] = {}
|
self._events: dict[str, simpy.Event] = {}
|
||||||
self._counter = 0
|
self._counter = 0
|
||||||
overrides = component_overrides or {}
|
overrides = component_overrides or {}
|
||||||
|
# ADR-0020: optional data execution support
|
||||||
|
self._op_logger = None
|
||||||
|
self._memory_store = None
|
||||||
|
if enable_data:
|
||||||
|
from kernbench.sim_engine.memory_store import MemoryStore
|
||||||
|
from kernbench.sim_engine.op_log import OpLogger
|
||||||
|
self._memory_store = MemoryStore()
|
||||||
|
self._op_logger = OpLogger(memory_store=self._memory_store)
|
||||||
|
# Cursor for incremental Phase 2 replay (ADR-0020 D6).
|
||||||
|
# SimPy env.now is monotonic so newly logged records always sort
|
||||||
|
# to the tail; the cursor remains valid across waits.
|
||||||
|
self._data_cursor = 0
|
||||||
|
|
||||||
ctx = ComponentContext(
|
ctx = ComponentContext(
|
||||||
router=self._router,
|
router=self._router,
|
||||||
resolver=self._resolver,
|
resolver=self._resolver,
|
||||||
@@ -51,6 +65,8 @@ class GraphEngine:
|
|||||||
ns_per_mm=self._ns_per_mm,
|
ns_per_mm=self._ns_per_mm,
|
||||||
edge_map=self._edge_map,
|
edge_map=self._edge_map,
|
||||||
spec=graph.spec,
|
spec=graph.spec,
|
||||||
|
memory_store=self._memory_store,
|
||||||
|
op_logger=self._op_logger,
|
||||||
)
|
)
|
||||||
self._components: dict[str, ComponentBase] = {
|
self._components: dict[str, ComponentBase] = {
|
||||||
node_id: ComponentRegistry.create(node, overrides, ctx)
|
node_id: ComponentRegistry.create(node, overrides, ctx)
|
||||||
@@ -108,10 +124,25 @@ class GraphEngine:
|
|||||||
if mmu_comp is not None and hasattr(mmu_comp, "mmu"):
|
if mmu_comp is not None and hasattr(mmu_comp, "mmu"):
|
||||||
self._components[node_id]._mmu = mmu_comp.mmu
|
self._components[node_id]._mmu = mmu_comp.mmu
|
||||||
|
|
||||||
|
# Inject op_logger into all components (ADR-0020 D2)
|
||||||
|
if self._op_logger:
|
||||||
|
for comp in self._components.values():
|
||||||
|
comp._op_logger = self._op_logger
|
||||||
|
|
||||||
# Start components after all ports are wired (ADR-0015 D3)
|
# Start components after all ports are wired (ADR-0015 D3)
|
||||||
for comp in self._components.values():
|
for comp in self._components.values():
|
||||||
comp.start(self._env)
|
comp.start(self._env)
|
||||||
|
|
||||||
|
@property
def op_log(self):
    """Op log records collected during Phase 1 (ADR-0020).

    Returns the logger's ``records`` when an op logger is attached,
    otherwise an empty list.
    """
    logger = self._op_logger
    if not logger:
        return []
    return logger.records
|
||||||
|
|
||||||
|
@property
def memory_store(self):
    """The engine's MemoryStore (ADR-0020); whatever ``_memory_store`` holds."""
    store = self._memory_store
    return store
|
||||||
|
|
||||||
def submit(self, request: Any) -> RequestHandle:
|
def submit(self, request: Any) -> RequestHandle:
|
||||||
self._counter += 1
|
self._counter += 1
|
||||||
handle = RequestHandle(f"h{self._counter}")
|
handle = RequestHandle(f"h{self._counter}")
|
||||||
@@ -120,11 +151,60 @@ class GraphEngine:
|
|||||||
self._env.process(self._process(str(handle), request, event))
|
self._env.process(self._process(str(handle), request, event))
|
||||||
return handle
|
return handle
|
||||||
|
|
||||||
|
def _flush_data_phase(self) -> None:
|
||||||
|
"""Replay newly recorded op_log entries through DataExecutor.
|
||||||
|
|
||||||
|
ADR-0020 D6 Phase 2: when data tracking is enabled, run DataExecutor
|
||||||
|
on records added since the last flush so that callers reading
|
||||||
|
MemoryStore between launches observe correct (compute-replayed)
|
||||||
|
tensor data.
|
||||||
|
|
||||||
|
Cursor-based incremental replay is necessary because Phase 2 is
|
||||||
|
NOT idempotent across full re-runs: a math op writes a TCM scratch
|
||||||
|
addr, a later dma_write copies that scratch into HBM[X], and an
|
||||||
|
even-later math op may then read HBM[X]. Re-running everything
|
||||||
|
from scratch would let the second pass's first math op read the
|
||||||
|
already-overwritten HBM[X] instead of the original input.
|
||||||
|
"""
|
||||||
|
if self._op_logger is None or self._memory_store is None:
|
||||||
|
return
|
||||||
|
records = self._op_logger.records # sorted by t_start (stable)
|
||||||
|
if self._data_cursor >= len(records):
|
||||||
|
return
|
||||||
|
new_records = records[self._data_cursor:]
|
||||||
|
from kernbench.sim_engine.data_executor import DataExecutor
|
||||||
|
DataExecutor(new_records, self._memory_store).run()
|
||||||
|
self._data_cursor = len(records)
|
||||||
|
|
||||||
def wait(self, handle: RequestHandle) -> None:
|
def wait(self, handle: RequestHandle) -> None:
|
||||||
key = str(handle)
|
key = str(handle)
|
||||||
event = self._events[key]
|
event = self._events[key]
|
||||||
if not event.triggered:
|
if not event.triggered:
|
||||||
self._env.run(until=event)
|
try:
|
||||||
|
self._env.run(until=event)
|
||||||
|
except (simpy.core.EmptySchedule, RuntimeError) as exc:
|
||||||
|
# SimPy raises EmptySchedule directly OR (in newer simpy)
|
||||||
|
# wraps it as a RuntimeError("No scheduled events left ...").
|
||||||
|
# Either case while our event is still pending → IPCQ deadlock.
|
||||||
|
msg = str(exc)
|
||||||
|
is_deadlock = (
|
||||||
|
isinstance(exc, simpy.core.EmptySchedule)
|
||||||
|
or "No scheduled events left" in msg
|
||||||
|
)
|
||||||
|
if not is_deadlock:
|
||||||
|
raise
|
||||||
|
from kernbench.ccl.diagnostics import IpcqDeadlock, pointer_dump
|
||||||
|
dump = pointer_dump(self)
|
||||||
|
if dump.strip():
|
||||||
|
raise IpcqDeadlock(
|
||||||
|
"IPCQ deadlock: simulation schedule empty while "
|
||||||
|
f"request {handle!r} is still pending.\n"
|
||||||
|
f"Pointer state:\n{dump}"
|
||||||
|
) from None
|
||||||
|
raise
|
||||||
|
# ADR-0020: replay newly logged ops so the caller observes
|
||||||
|
# post-Phase-2 tensor state from MemoryStore.
|
||||||
|
self._flush_data_phase()
|
||||||
|
|
||||||
def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
|
def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
|
||||||
return self._results[str(handle)]
|
return self._results[str(handle)]
|
||||||
|
|||||||
@@ -0,0 +1,84 @@
|
|||||||
|
"""MemoryStore: tensor-granular storage for Phase 1 and Phase 2 (ADR-0020 D7).
|
||||||
|
|
||||||
|
Logically byte-addressable, implemented as addr → numpy ndarray mapping.
|
||||||
|
Read/write are reference-based (no copy) for Phase 1 performance.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# numpy dtype string → numpy dtype mapping
|
||||||
|
_DTYPE_MAP = {
|
||||||
|
"f16": np.float16,
|
||||||
|
"f32": np.float32,
|
||||||
|
"f64": np.float64,
|
||||||
|
"bf16": np.float16, # numpy has no bfloat16; use float16 as proxy
|
||||||
|
"i8": np.int8,
|
||||||
|
"i16": np.int16,
|
||||||
|
"i32": np.int32,
|
||||||
|
"i64": np.int64,
|
||||||
|
"u8": np.uint8,
|
||||||
|
"u16": np.uint16,
|
||||||
|
"u32": np.uint32,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_dtype(dtype: str) -> np.dtype:
|
||||||
|
if dtype in _DTYPE_MAP:
|
||||||
|
return np.dtype(_DTYPE_MAP[dtype])
|
||||||
|
return np.dtype(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryStore:
    """Tensor-granular memory storage (ADR-0020 D7).

    Tensors are kept as numpy ndarrays keyed by (space, addr). Both write
    and read work on references — no copy is made in either direction — and
    writing to an address that already holds a tensor replaces it whole.
    """

    def __init__(self) -> None:
        # Layout: {space: {addr: ndarray}}
        self._storage: dict[str, dict[int, np.ndarray]] = {}

    def write(self, space: str, addr: int, data: np.ndarray) -> None:
        """Store ``data`` at (space, addr) by reference (no copy)."""
        self._storage.setdefault(space, {})[addr] = data

    def read(self, space: str, addr: int, shape: tuple[int, ...] | None = None,
             dtype: str | None = None) -> np.ndarray:
        """Return the tensor stored at (space, addr) without copying.

        With no ``shape``/``dtype`` (or when they already match) the stored
        array itself is returned. A different ``dtype`` yields a
        reinterpreting ``view``; a different ``shape`` yields a ``reshape``
        provided the byte counts agree.

        Raises:
            KeyError: nothing is stored at (space, addr).
            ValueError: ``shape`` implies a different byte count than the
                stored tensor has.
        """
        bank = self._storage.get(space)
        if bank is None or addr not in bank:
            raise KeyError(f"No data at ({space}, 0x{addr:x})")

        tensor = bank[addr]
        if dtype is not None:
            wanted_dtype = _resolve_dtype(dtype)
            if tensor.dtype != wanted_dtype:
                tensor = tensor.view(wanted_dtype)

        if shape is None:
            return tensor
        if tensor.shape != shape:
            wanted_bytes = np.prod(shape) * tensor.dtype.itemsize
            if tensor.nbytes != wanted_bytes:
                raise ValueError(
                    f"Shape mismatch: stored {tensor.shape} ({tensor.nbytes}B) "
                    f"vs requested {shape} ({wanted_bytes}B)"
                )
            tensor = tensor.reshape(shape)
        return tensor

    def has(self, space: str, addr: int) -> bool:
        """Report whether a tensor is stored at (space, addr)."""
        return addr in self._storage.get(space, {})

    def snapshot(self) -> MemoryStore:
        """Create a shallow copy for Phase 2 initialization.

        The per-space addr→ndarray maps are duplicated; the ndarrays
        themselves are shared with this store.
        """
        clone = MemoryStore()
        # dict(...) copies the addr→ndarray map only, not the arrays.
        clone._storage.update(
            (space, dict(bank)) for space, bank in self._storage.items()
        )
        return clone
|
||||||
@@ -0,0 +1,222 @@
|
|||||||
|
"""Op log infrastructure for 2-pass data execution (ADR-0020 D2, D5).
|
||||||
|
|
||||||
|
OpRecord: single data operation with timing, params, and dependencies.
|
||||||
|
OpLogger: collects OpRecords from ComponentBase hooks during Phase 1.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class OpRecord:
    """One logged data operation (ADR-0020 D5)."""

    # Start and end timestamps supplied by the logger's callers.
    t_start: float
    t_end: float
    # Identifier of the component the op was recorded for.
    component_id: str
    # "memory" | "gemm" | "math"
    op_kind: str
    # e.g. "dma_read", "gemm_f16", "exp"
    op_name: str
    # Op-specific parameters (addresses, shapes, dtypes, snapshots, ...).
    params: dict[str, Any]
    # Fresh empty list per record unless given explicitly.
    dependency_ids: list[int] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class OpLogger:
    """Collects OpRecords during Phase 1 simulation (ADR-0020 D2).

    Thread safety is not required — SimPy is single-threaded.
    Records are appended in insertion order and exposed sorted by t_start
    (stable sort) through ``records``.
    """

    def __init__(self, memory_store: Any | None = None) -> None:
        self._records: list[OpRecord] = []
        # id(msg) → partial record, filled in by record_start.
        self._pending: dict[int, dict[str, Any]] = {}
        # Optional MemoryStore reference. When set, records capture data
        # snapshots at record time so Phase 2 replay does not depend on
        # slot/scratch addrs surviving until the op is replayed.
        self._memory_store = memory_store

    @property
    def records(self) -> list[OpRecord]:
        """Records sorted by t_start (stable ordering per ADR-0020 D5).

        The internal list is sorted in place and returned by reference.
        """
        self._records.sort(key=lambda rec: rec.t_start)
        return self._records

    def _snapshot_of(self, space: Any, addr: Any, shape: Any, dtype: Any) -> Any:
        """Best-effort copy of the tensor currently stored at (space, addr).

        Returns None when the read (or the copy) fails for any reason; the
        snapshot is optional, so failures are deliberately not propagated.
        """
        try:
            live = self._memory_store.read(space, addr, shape=shape, dtype=dtype)
            return live.copy() if hasattr(live, "copy") else live
        except Exception:
            return None

    def record_start(self, t: float, component_id: str, msg: Any) -> None:
        """Called by ComponentBase._on_process_start."""
        # Keeping ``msg`` in the entry also keeps it alive, so its id()
        # cannot be reused while the op is pending.
        self._pending[id(msg)] = {
            "t_start": t,
            "component_id": component_id,
            "msg": msg,
        }

    def record_end(self, t: float, component_id: str, msg: Any) -> None:
        """Called by ComponentBase._on_process_end.

        Completes the record opened by ``record_start`` for the same ``msg``
        object; a message with no pending start is ignored. The component id
        stored on the record is the one given to ``record_start``.
        """
        started = self._pending.pop(id(msg), None)
        if started is None:
            return

        op_kind, op_name, params = _extract_op_info(msg)

        # Snapshot data at record time so Phase 2 replay sidesteps
        # downstream mutations of source addrs (e.g. a tl.store that
        # overwrites HBM after a load handle was sent, or a slot that
        # gets reused on the next ring round).
        if self._memory_store is not None:
            if op_kind == "math":
                params["input_snapshots"] = [
                    self._snapshot_of(space, addr, shape, idtype)
                    for addr, shape, space, idtype in zip(
                        params.get("input_addrs", []),
                        params.get("input_shapes", []),
                        params.get("input_spaces", []),
                        params.get("input_dtypes", []),
                    )
                ]
            elif op_name == "dma_write" and params.get("src_space") == "hbm":
                # ADR-0027 fix: only snapshot HBM sources. TCM (PE scratch)
                # sources are repopulated by Phase 2 math/gemm replay —
                # capturing a Phase-1-time snapshot here would pick up stale
                # data from a PRIOR kernel's Phase 2 output that aliased the
                # same scratch address, causing the later kernel's replay
                # to write that stale value instead of the fresh math
                # result. See ADR-0027 postmortem (TP gemm → all_reduce).
                params["snapshot"] = self._snapshot_of(
                    params["src_space"], params.get("src_addr"),
                    params.get("shape"), params.get("dtype"),
                )

        finished = OpRecord(
            t_start=started["t_start"],
            t_end=t,
            component_id=started["component_id"],
            op_kind=op_kind,
            op_name=op_name,
            params=params,
        )
        self._records.append(finished)

    def record_copy(
        self, t_start: float, t_end: float, component_id: str,
        src_space: str, src_addr: int,
        dst_space: str, dst_addr: int,
        shape: tuple[int, ...], dtype: str, nbytes: int,
        snapshot: Any = None,
    ) -> None:
        """Record a memory copy op for Phase 2 replay (ADR-0023 + ADR-0020).

        ``snapshot``: if provided (e.g. token.data from in-flight DMA), it
        is used directly. Otherwise, when a MemoryStore is attached, a fresh
        copy of MemoryStore[src_addr] is taken (None if that read fails).
        The snapshot is what Phase 2 writes into dst_addr, avoiding
        stale-source races from cross-PE mutations.
        """
        captured = snapshot
        if captured is None and self._memory_store is not None:
            captured = self._snapshot_of(src_space, src_addr, shape, dtype)

        copy_params = {
            "src_space": src_space, "src_addr": src_addr,
            "dst_space": dst_space, "dst_addr": dst_addr,
            "shape": shape, "dtype": dtype, "nbytes": nbytes,
            "snapshot": captured,
        }
        self._records.append(OpRecord(
            t_start=t_start, t_end=t_end,
            component_id=component_id,
            op_kind="memory", op_name="ipcq_copy",
            params=copy_params,
        ))
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
    """Derive ``(op_kind, op_name, params)`` from a data_op message.

    Recognised message types are DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
    and CompositeCmd; anything else yields ``("unknown", <type name>, {})``.
    """
    from kernbench.common.pe_commands import (
        CompositeCmd, DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd,
    )

    if isinstance(msg, DmaReadCmd):
        read_params = {
            "src_addr": msg.src_addr,
            "nbytes": msg.nbytes,
            "handle_id": msg.handle.id,
        }
        return "memory", "dma_read", read_params

    if isinstance(msg, DmaWriteCmd):
        src = msg.handle
        write_params = {
            # Handles without a ``space`` attribute are treated as TCM.
            "src_space": getattr(src, "space", "tcm"),
            "src_addr": src.addr,
            "shape": src.shape,
            "dtype": src.dtype,
            "dst_space": "hbm",
            "dst_addr": msg.dst_addr,
            "nbytes": msg.nbytes,
            "handle_id": src.id,
        }
        return "memory", "dma_write", write_params

    if isinstance(msg, GemmCmd):
        lhs, rhs, out = msg.a, msg.b, msg.out
        gemm_params = {
            "src_a_addr": lhs.addr,
            "src_b_addr": rhs.addr,
            "dst_addr": out.addr,
            "shape_a": lhs.shape,
            "shape_b": rhs.shape,
            "shape_out": out.shape,
            "dtype_in": lhs.dtype,
            "dtype_out": out.dtype,
            "m": msg.m, "k": msg.k, "n": msg.n,
            # ADR-0027: preserve per-operand + output MemoryStore spaces so
            # Phase 2 replay can resolve HBM-resident operands (e.g. tl.load
            # results keep space="hbm"). Absent → DataExecutor falls back
            # to the legacy single-space mode via ``addr_space``.
            "src_a_space": getattr(lhs, "space", "tcm"),
            "src_b_space": getattr(rhs, "space", "tcm"),
            "dst_space": getattr(out, "space", "tcm"),
        }
        return "gemm", f"gemm_{lhs.dtype}", gemm_params

    if isinstance(msg, MathCmd):
        operands = msg.inputs
        result = msg.out
        math_params = {
            "input_addrs": [h.addr for h in operands],
            "input_shapes": [h.shape for h in operands],
            "input_spaces": [getattr(h, "space", "tcm") for h in operands],
            "input_dtypes": [h.dtype for h in operands],
            "dst_addr": result.addr,
            "dst_space": getattr(result, "space", "tcm"),
            "shape_out": result.shape,
            "dtype": result.dtype,
            "axis": msg.axis,
        }
        return "math", msg.op, math_params

    if isinstance(msg, CompositeCmd):
        is_gemm = msg.op == "gemm"
        params: dict[str, Any] = {
            "op": msg.op,
            "out_addr": msg.out_addr,
            "out_nbytes": msg.out_nbytes,
        }
        # ADR-0027: preserve operand info so Phase 2 DataExecutor can replay
        # the composite's numerical effect (treat it like a GemmCmd).
        if is_gemm and msg.a is not None and msg.b is not None:
            lhs, rhs = msg.a, msg.b
            params.update({
                "src_a_addr": lhs.addr,
                "src_b_addr": rhs.addr,
                "shape_a": lhs.shape,
                "shape_b": rhs.shape,
                "dtype_in": lhs.dtype,
                # The output dtype is recorded as operand A's dtype.
                "dtype_out": lhs.dtype,
                "src_a_space": getattr(lhs, "space", "hbm"),
                "src_b_space": getattr(rhs, "space", "hbm"),
                "dst_space": "hbm",
                # dst_addr alias so DataExecutor._execute_gemm picks it up.
                "dst_addr": msg.out_addr,
            })
        kind = "gemm" if is_gemm else "math"
        return kind, f"composite_{msg.op}", params

    # Fallback for unknown data_op messages
    return "unknown", type(msg).__name__, {}
|
||||||
@@ -20,10 +20,12 @@ _PE_COMP_OFFSETS = {
|
|||||||
"pe_cpu": (-0.3, 0.0),
|
"pe_cpu": (-0.3, 0.0),
|
||||||
"pe_scheduler": (-0.15, 0.0),
|
"pe_scheduler": (-0.15, 0.0),
|
||||||
"pe_dma": (0.0, -0.15),
|
"pe_dma": (0.0, -0.15),
|
||||||
|
"pe_fetch_store": (0.15, 0.0),
|
||||||
"pe_gemm": (0.0, 0.0),
|
"pe_gemm": (0.0, 0.0),
|
||||||
"pe_math": (0.0, 0.15),
|
"pe_math": (0.0, 0.15),
|
||||||
"pe_mmu": (0.15, -0.15),
|
"pe_mmu": (0.15, -0.15),
|
||||||
"pe_tcm": (0.3, 0.0),
|
"pe_tcm": (0.3, 0.0),
|
||||||
|
"pe_ipcq": (-0.15, 0.15),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -276,7 +278,7 @@ def _instantiate_io_chiplets(
|
|||||||
for phy in inst["ucie"]["phys"]:
|
for phy in inst["ucie"]["phys"]:
|
||||||
phy_id = f"{prefix}.ucie-{phy}"
|
phy_id = f"{prefix}.ucie-{phy}"
|
||||||
nodes[phy_id] = Node(
|
nodes[phy_id] = Node(
|
||||||
id=phy_id, kind="io_ucie", impl="ucie_v1",
|
id=phy_id, kind="io_ucie", impl="builtin.ucie",
|
||||||
attrs={"overhead_ns": io_ucie_ns},
|
attrs={"overhead_ns": io_ucie_ns},
|
||||||
pos_mm=(cx, noc_y), label=f"IO UCIe-{phy}",
|
pos_mm=(cx, noc_y), label=f"IO UCIe-{phy}",
|
||||||
)
|
)
|
||||||
@@ -284,7 +286,7 @@ def _instantiate_io_chiplets(
|
|||||||
for ci in range(io_n_conn):
|
for ci in range(io_n_conn):
|
||||||
conn_id = f"{phy_id}.conn{ci}"
|
conn_id = f"{phy_id}.conn{ci}"
|
||||||
nodes[conn_id] = Node(
|
nodes[conn_id] = Node(
|
||||||
id=conn_id, kind="io_ucie_conn", impl="ucie_v1",
|
id=conn_id, kind="io_ucie_conn", impl="builtin.ucie",
|
||||||
attrs={"overhead_ns": 0.0},
|
attrs={"overhead_ns": 0.0},
|
||||||
pos_mm=(cx, noc_y), label=f"IO UCIe-{phy} C{ci}",
|
pos_mm=(cx, noc_y), label=f"IO UCIe-{phy} C{ci}",
|
||||||
)
|
)
|
||||||
@@ -378,14 +380,14 @@ def _instantiate_cube(
|
|||||||
pid = f"{cp}.ucie-{port}"
|
pid = f"{cp}.ucie-{port}"
|
||||||
lx, ly = local_pos[f"ucie-{port}"]
|
lx, ly = local_pos[f"ucie-{port}"]
|
||||||
nodes[pid] = Node(
|
nodes[pid] = Node(
|
||||||
id=pid, kind="ucie_port", impl="ucie_v1",
|
id=pid, kind="ucie_port", impl="builtin.ucie",
|
||||||
attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
|
attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
|
||||||
label=f"UCIe-{port}",
|
label=f"UCIe-{port}",
|
||||||
)
|
)
|
||||||
for ci in range(ucie_n_conn):
|
for ci in range(ucie_n_conn):
|
||||||
conn_id = f"{cp}.ucie-{port}.conn{ci}"
|
conn_id = f"{cp}.ucie-{port}.conn{ci}"
|
||||||
nodes[conn_id] = Node(
|
nodes[conn_id] = Node(
|
||||||
id=conn_id, kind="ucie_conn", impl="ucie_v1",
|
id=conn_id, kind="ucie_conn", impl="builtin.ucie",
|
||||||
attrs={"overhead_ns": 0.0},
|
attrs={"overhead_ns": 0.0},
|
||||||
pos_mm=(ox + lx, oy + ly),
|
pos_mm=(ox + lx, oy + ly),
|
||||||
label=f"UCIe-{port} C{ci}",
|
label=f"UCIe-{port} C{ci}",
|
||||||
@@ -637,12 +639,13 @@ def _instantiate_cube(
|
|||||||
|
|
||||||
|
|
||||||
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
|
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
|
||||||
"""Add PE-internal edges for a single PE instance."""
|
"""Add PE-internal edges for a single PE instance (ADR-0021)."""
|
||||||
edges.append(Edge(
|
edges.append(Edge(
|
||||||
src=f"{pp}.pe_cpu", dst=f"{pp}.pe_scheduler",
|
src=f"{pp}.pe_cpu", dst=f"{pp}.pe_scheduler",
|
||||||
distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
|
distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
|
||||||
kind="pe_internal",
|
kind="pe_internal",
|
||||||
))
|
))
|
||||||
|
# Scheduler → engines (initial dispatch)
|
||||||
for eng, key in [("pe_dma", "scheduler_to_dma_mm"),
|
for eng, key in [("pe_dma", "scheduler_to_dma_mm"),
|
||||||
("pe_gemm", "scheduler_to_gemm_mm"),
|
("pe_gemm", "scheduler_to_gemm_mm"),
|
||||||
("pe_math", "scheduler_to_math_mm")]:
|
("pe_math", "scheduler_to_math_mm")]:
|
||||||
@@ -651,6 +654,15 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
|
|||||||
distance_mm=pe_links[key],
|
distance_mm=pe_links[key],
|
||||||
kind="pe_internal",
|
kind="pe_internal",
|
||||||
))
|
))
|
||||||
|
# Scheduler → fetch_store (initial dispatch)
|
||||||
|
if "scheduler_to_fetch_store_mm" in pe_links:
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{pp}.pe_scheduler", dst=f"{pp}.pe_fetch_store",
|
||||||
|
distance_mm=pe_links["scheduler_to_fetch_store_mm"],
|
||||||
|
kind="pe_internal",
|
||||||
|
))
|
||||||
|
|
||||||
|
# Engine → TCM (legacy BW edges)
|
||||||
for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
|
for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
|
||||||
("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
|
("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
|
||||||
("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
|
("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
|
||||||
@@ -661,6 +673,46 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
|
|||||||
kind="pe_internal",
|
kind="pe_internal",
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# Fetch/Store → TCM (ADR-0021 D5)
|
||||||
|
if "fetch_store_to_tcm_mm" in pe_links:
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{pp}.pe_fetch_store", dst=f"{pp}.pe_tcm",
|
||||||
|
distance_mm=pe_links["fetch_store_to_tcm_mm"],
|
||||||
|
bw_gbs=pe_links.get("fetch_store_to_tcm_bw_gbs", 512.0),
|
||||||
|
kind="pe_internal",
|
||||||
|
))
|
||||||
|
|
||||||
|
# Chaining edges (ADR-0021 D4 — token self-routing)
|
||||||
|
chaining = [
|
||||||
|
("pe_dma", "pe_fetch_store", "dma_to_fetch_store_mm"),
|
||||||
|
("pe_fetch_store", "pe_gemm", "fetch_store_to_gemm_mm"),
|
||||||
|
("pe_fetch_store", "pe_math", "fetch_store_to_math_mm"),
|
||||||
|
("pe_gemm", "pe_fetch_store", "gemm_to_fetch_store_mm"),
|
||||||
|
("pe_math", "pe_fetch_store", "math_to_fetch_store_mm"),
|
||||||
|
("pe_fetch_store", "pe_dma", "fetch_store_to_dma_mm"),
|
||||||
|
]
|
||||||
|
for src_eng, dst_eng, mm_key in chaining:
|
||||||
|
if mm_key in pe_links:
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{pp}.{src_eng}", dst=f"{pp}.{dst_eng}",
|
||||||
|
distance_mm=pe_links[mm_key],
|
||||||
|
kind="pe_internal",
|
||||||
|
))
|
||||||
|
|
||||||
|
# PE_IPCQ edges (ADR-0023 D1, D9 D10)
|
||||||
|
ipcq_edges = [
|
||||||
|
("pe_cpu", "pe_ipcq", "cpu_to_ipcq_mm"), # IpcqRequest
|
||||||
|
("pe_ipcq", "pe_dma", "ipcq_to_dma_mm"), # IpcqDmaToken outbound
|
||||||
|
("pe_dma", "pe_ipcq", "dma_to_ipcq_mm"), # IpcqMetaArrival inbound
|
||||||
|
]
|
||||||
|
for src_c, dst_c, mm_key in ipcq_edges:
|
||||||
|
if mm_key in pe_links:
|
||||||
|
edges.append(Edge(
|
||||||
|
src=f"{pp}.{src_c}", dst=f"{pp}.{dst_c}",
|
||||||
|
distance_mm=pe_links[mm_key],
|
||||||
|
kind="pe_internal",
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
# ── Inter-cube / IO / system edges ──────────────────────────────────
|
# ── Inter-cube / IO / system edges ──────────────────────────────────
|
||||||
|
|
||||||
@@ -728,7 +780,13 @@ def _add_io_to_cube_edges(
|
|||||||
def _add_system_to_io_edges(
|
def _add_system_to_io_edges(
|
||||||
edges: list[Edge], sp: str, sip_spec: dict, system: dict,
|
edges: list[Edge], sp: str, sip_spec: dict, system: dict,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Add fabric switch → IO chiplet PCIe edges."""
|
"""Add bidirectional fabric switch ↔ IO chiplet PCIe edges.
|
||||||
|
|
||||||
|
Both directions are needed:
|
||||||
|
switch → pcie_ep for host→device traffic (memory writes, kernel launch)
|
||||||
|
pcie_ep → switch for device-side outbound traffic (cross-SIP IPCQ
|
||||||
|
send between PE_DMAs through the system switch).
|
||||||
|
"""
|
||||||
sw_id = "fabric.switch0"
|
sw_id = "fabric.switch0"
|
||||||
sys_link = system["links"]["io_ep_to_switch"]
|
sys_link = system["links"]["io_ep_to_switch"]
|
||||||
for inst in sip_spec["iochiplet"]["instances"]:
|
for inst in sip_spec["iochiplet"]["instances"]:
|
||||||
@@ -739,6 +797,12 @@ def _add_system_to_io_edges(
|
|||||||
bw_gbs=sys_link["bw_gbs_per_ep"],
|
bw_gbs=sys_link["bw_gbs_per_ep"],
|
||||||
kind="pcie",
|
kind="pcie",
|
||||||
))
|
))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=pcie_ep_id, dst=sw_id,
|
||||||
|
distance_mm=sys_link["distance_mm"],
|
||||||
|
bw_gbs=sys_link["bw_gbs_per_ep"],
|
||||||
|
kind="pcie",
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
# ── View builders ────────────────────────────────────────────────────
|
# ── View builders ────────────────────────────────────────────────────
|
||||||
@@ -900,13 +964,13 @@ def _build_cube_view(spec: dict) -> ViewGraph:
|
|||||||
pid = f"ucie-{port}"
|
pid = f"ucie-{port}"
|
||||||
lx, ly = local_pos[pid]
|
lx, ly = local_pos[pid]
|
||||||
nodes[pid] = Node(
|
nodes[pid] = Node(
|
||||||
id=pid, kind="ucie_port", impl="ucie_v1",
|
id=pid, kind="ucie_port", impl="builtin.ucie",
|
||||||
attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
|
attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
|
||||||
)
|
)
|
||||||
for ci in range(ucie_n_conn):
|
for ci in range(ucie_n_conn):
|
||||||
conn_id = f"ucie-{port}.conn{ci}"
|
conn_id = f"ucie-{port}.conn{ci}"
|
||||||
nodes[conn_id] = Node(
|
nodes[conn_id] = Node(
|
||||||
id=conn_id, kind="ucie_conn", impl="ucie_v1",
|
id=conn_id, kind="ucie_conn", impl="builtin.ucie",
|
||||||
attrs={"overhead_ns": 0.0}, pos_mm=(lx, ly),
|
attrs={"overhead_ns": 0.0}, pos_mm=(lx, ly),
|
||||||
label=f"UCIe-{port} C{ci}",
|
label=f"UCIe-{port} C{ci}",
|
||||||
)
|
)
|
||||||
@@ -1071,17 +1135,19 @@ def _build_pe_view(spec: dict) -> ViewGraph:
|
|||||||
"pe_cpu": (1.5, 4.0),
|
"pe_cpu": (1.5, 4.0),
|
||||||
"pe_scheduler": (4.0, 4.0),
|
"pe_scheduler": (4.0, 4.0),
|
||||||
"pe_dma": (7.0, 1.5),
|
"pe_dma": (7.0, 1.5),
|
||||||
|
"pe_fetch_store": (8.5, 4.0),
|
||||||
"pe_gemm": (7.0, 4.0),
|
"pe_gemm": (7.0, 4.0),
|
||||||
"pe_math": (7.0, 6.5),
|
"pe_math": (7.0, 6.5),
|
||||||
"pe_mmu": (4.0, 1.5),
|
"pe_mmu": (4.0, 1.5),
|
||||||
"pe_tcm": (10.0, 4.0),
|
"pe_tcm": (10.0, 4.0),
|
||||||
|
"pe_ipcq": (4.0, 6.5),
|
||||||
}
|
}
|
||||||
|
|
||||||
nodes: dict[str, Node] = {}
|
nodes: dict[str, Node] = {}
|
||||||
view_edges: list[Edge] = []
|
view_edges: list[Edge] = []
|
||||||
|
|
||||||
for comp_name, comp_spec in pe_tmpl["components"].items():
|
for comp_name, comp_spec in pe_tmpl["components"].items():
|
||||||
px, py = positions[comp_name]
|
px, py = positions.get(comp_name, (1.0, 1.0))
|
||||||
nodes[comp_name] = Node(
|
nodes[comp_name] = Node(
|
||||||
id=comp_name, kind=comp_spec["kind"], impl=comp_spec["impl"],
|
id=comp_name, kind=comp_spec["kind"], impl=comp_spec["impl"],
|
||||||
attrs=comp_spec["attrs"], pos_mm=(px, py),
|
attrs=comp_spec["attrs"], pos_mm=(px, py),
|
||||||
@@ -1101,6 +1167,12 @@ def _build_pe_view(spec: dict) -> ViewGraph:
|
|||||||
distance_mm=pe_links[key],
|
distance_mm=pe_links[key],
|
||||||
kind="pe_internal",
|
kind="pe_internal",
|
||||||
))
|
))
|
||||||
|
if "scheduler_to_fetch_store_mm" in pe_links:
|
||||||
|
view_edges.append(Edge(
|
||||||
|
src="pe_scheduler", dst="pe_fetch_store",
|
||||||
|
distance_mm=pe_links["scheduler_to_fetch_store_mm"],
|
||||||
|
kind="pe_internal",
|
||||||
|
))
|
||||||
for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
|
for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
|
||||||
("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
|
("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
|
||||||
("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
|
("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
|
||||||
@@ -1110,6 +1182,13 @@ def _build_pe_view(spec: dict) -> ViewGraph:
|
|||||||
bw_gbs=pe_links[bw_key],
|
bw_gbs=pe_links[bw_key],
|
||||||
kind="pe_internal",
|
kind="pe_internal",
|
||||||
))
|
))
|
||||||
|
if "fetch_store_to_tcm_mm" in pe_links:
|
||||||
|
view_edges.append(Edge(
|
||||||
|
src="pe_fetch_store", dst="pe_tcm",
|
||||||
|
distance_mm=pe_links["fetch_store_to_tcm_mm"],
|
||||||
|
bw_gbs=pe_links.get("fetch_store_to_tcm_bw_gbs", 512.0),
|
||||||
|
kind="pe_internal",
|
||||||
|
))
|
||||||
|
|
||||||
return ViewGraph(
|
return ViewGraph(
|
||||||
name="pe", nodes=nodes, edges=view_edges,
|
name="pe", nodes=nodes, edges=view_edges,
|
||||||
|
|||||||
@@ -0,0 +1,21 @@
|
|||||||
|
"""kernbench.tp — Megatron-style Tensor Parallelism (ADR-0027).
|
||||||
|
|
||||||
|
Public API re-exports.
|
||||||
|
"""
|
||||||
|
from kernbench.tp.layers import (
|
||||||
|
ColumnParallelLinear,
|
||||||
|
RowParallelLinear,
|
||||||
|
)
|
||||||
|
from kernbench.tp.parallel_state import (
|
||||||
|
get_tensor_model_parallel_rank,
|
||||||
|
get_tensor_model_parallel_world_size,
|
||||||
|
initialize_model_parallel,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ColumnParallelLinear",
|
||||||
|
"RowParallelLinear",
|
||||||
|
"get_tensor_model_parallel_rank",
|
||||||
|
"get_tensor_model_parallel_world_size",
|
||||||
|
"initialize_model_parallel",
|
||||||
|
]
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
"""Kernel used by ``kernbench.tp`` layers (ADR-0027 D4/D5).
|
||||||
|
|
||||||
|
Intentionally self-contained inside the ``tp`` package — the ``tp`` package
|
||||||
|
must not import from ``benches/``. Future work: move to a shared
|
||||||
|
``kernbench.kernels`` module so benches and TP can share.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE: str = "f16") -> None:
    """Compute ``out = a @ b`` on a single PE via load, dot, store.

    The operands are fetched with ``tl.load``, multiplied with ``tl.dot``
    and the product is written back with ``tl.store``.  This explicit
    path is chosen over ``tl.composite``: the composite form is absorbed
    by the PE scheduler into TileTokens that never reach the op_log,
    whereas this one emits explicit ``DmaReadCmd`` / ``GemmCmd`` /
    ``DmaWriteCmd`` records that DataExecutor replays numerically in
    Phase 2.

    Args:
        a_ptr: Address of the left operand; coerced with ``int``.
        b_ptr: Address of the right operand; coerced with ``int``.
        out_ptr: Address the product is stored to; coerced with ``int``.
        M: Row count of ``a`` (and of the result).
        K: Shared inner dimension of ``a`` and ``b``.
        N: Column count of ``b`` (and of the result).
        tl: Kernel language handle providing ``load``, ``dot`` and ``store``.
        DTYPE: Element type passed to both loads.
    """
    rows, inner, cols = int(M), int(K), int(N)

    # Pointers are converted right at their point of use, so the sequence
    # of calls issued on ``tl`` is load(a), load(b), dot, store.
    lhs = tl.load(int(a_ptr), shape=(rows, inner), dtype=DTYPE)
    rhs = tl.load(int(b_ptr), shape=(inner, cols), dtype=DTYPE)
    product = tl.dot(lhs, rhs)
    tl.store(int(out_ptr), product)
|
||||||
@@ -0,0 +1,150 @@
|
|||||||
|
"""Megatron-style parallel layers (ADR-0027 D4/D5).
|
||||||
|
|
||||||
|
- ``ColumnParallelLinear``: weight's out_features axis split across TP ranks.
|
||||||
|
forward(x) is local gemm; no collective.
|
||||||
|
- ``RowParallelLinear``: weight's in_features axis split across TP ranks.
|
||||||
|
forward(x) ends with ``dist.all_reduce`` to sum partial products.
|
||||||
|
|
||||||
|
Both layers use the intra-device ``DPPolicy`` (ADR-0026). TP shard
|
||||||
|
ownership is determined by ``torch.ahbm.set_device(rank)`` (ADR-0024 D10).
|
||||||
|
|
||||||
|
Yield-safety contract (ADR-0027 D4/D5): every forward path contains at
|
||||||
|
least one ``ctx.wait`` (via ``torch.launch``) or one collective; this
|
||||||
|
keeps the scheduler loop making progress.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
from kernbench.tp.kernels import _gemm_kernel
|
||||||
|
from kernbench.tp.parallel_state import (
|
||||||
|
get_tensor_model_parallel_world_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ColumnParallelLinear:
    """Linear layer whose weight is split along ``out_features`` across TP ranks.

    Shapes, with ``N`` = in_features and ``K`` = out_features::

        x:   (M, N)                -- full-replicated across ranks
        W_k: (N, K / world_size)   -- this rank's slice (on its SIP)
        y_k = x @ W_k              -> (M, K / world_size)

    ``forward`` is a local GEMM only; no collective is issued.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype: str = "f16",
        torch: Any = None,
    ) -> None:
        """Validate the configuration and allocate this rank's weight slice.

        Args:
            in_features: Size of the input (inner) dimension.
            out_features: Full output dimension; must be divisible by the
                TP world size.
            bias: Must be ``False``; bias is deferred (ADR-0027 D9).
            dtype: Element type of the weight tensor.
            torch: The runtime context used for allocation and launches.

        Raises:
            TypeError: If ``torch`` is not supplied.
            ValueError: If ``out_features`` is not divisible by the TP
                world size.
            NotImplementedError: If ``bias`` is true.
        """
        if torch is None:
            raise TypeError("ColumnParallelLinear requires torch=<RuntimeContext>")
        ws = get_tensor_model_parallel_world_size()
        if out_features % ws != 0:
            raise ValueError(
                f"out_features ({out_features}) must be divisible by TP world "
                f"size ({ws})"
            )
        # Bias omitted in initial scope (ADR-0027 D9).  Reject it *before*
        # touching the runtime so a failed construction does not leave an
        # allocated weight tensor behind.
        if bias:
            raise NotImplementedError(
                "bias=True is deferred (ADR-0027 D9 initial scope)"
            )
        self.in_features = in_features
        self.out_features = out_features
        self.k_local = out_features // ws
        self.dtype = dtype
        self._torch = torch
        self.bias = None
        # Per-rank weight slice. ``set_device(rank)`` (ADR-0024 D10) places
        # it on SIP ``rank``. Intra-SIP layout comes from DPPolicy (ADR-0026).
        self.weight = torch.zeros(
            (in_features, self.k_local),
            dtype=dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="col_parallel_w",
        )

    def forward(self, x):
        """Return ``x @ W_k`` for this rank's weight slice.

        Args:
            x: Input whose first dimension is the row count ``M``; its
                dtype is reused for the output tensor.

        Returns:
            A newly allocated ``(M, k_local)`` tensor that the GEMM kernel
            is launched to fill.
        """
        M = int(x.shape[0])
        out = self._torch.empty(
            (M, self.k_local),
            dtype=x.dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="col_parallel_out",
        )
        # Kernel dimension order is (rows, inner, cols).
        # NOTE(review): no dtype is passed to the launch, so the kernel runs
        # with its own default element type -- confirm this is intended for
        # layers constructed with a non-default ``dtype``.
        self._torch.launch(
            "col_parallel_gemm",
            _gemm_kernel,
            x, self.weight, out,
            M, self.in_features, self.k_local,
        )
        return out
|
||||||
|
|
||||||
|
|
||||||
|
class RowParallelLinear:
    """Linear layer whose weight is split along ``in_features`` across TP ranks.

    Shapes, with ``N`` = in_features and ``K`` = out_features::

        x:   (M, N / world_size)   -- rank-local slice (ColumnParallel output)
        W_k: (N / world_size, K)   -- this rank's slice
        y_k = x @ W_k              -> (M, K) partial sum
        y   = all_reduce(y_k, op="sum") -> (M, K) on every rank
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype: str = "f16",
        torch: Any = None,
    ) -> None:
        """Validate the configuration and allocate this rank's weight slice.

        Args:
            in_features: Full input dimension; must be divisible by the TP
                world size.
            out_features: Size of the output dimension.
            bias: Must be ``False``; bias is deferred (ADR-0027 D9).
            dtype: Element type of the weight tensor.
            torch: The runtime context used for allocation, launches and
                the all-reduce.

        Raises:
            TypeError: If ``torch`` is not supplied.
            ValueError: If ``in_features`` is not divisible by the TP
                world size.
            NotImplementedError: If ``bias`` is true.
        """
        if torch is None:
            raise TypeError("RowParallelLinear requires torch=<RuntimeContext>")
        ws = get_tensor_model_parallel_world_size()
        if in_features % ws != 0:
            raise ValueError(
                f"in_features ({in_features}) must be divisible by TP world "
                f"size ({ws})"
            )
        # Bias is out of the initial scope (ADR-0027 D9).  Reject it *before*
        # touching the runtime so a failed construction does not leave an
        # allocated weight tensor behind.
        if bias:
            raise NotImplementedError(
                "bias=True is deferred (ADR-0027 D9 initial scope)"
            )
        self.in_features = in_features
        self.out_features = out_features
        self.n_local = in_features // ws
        self.dtype = dtype
        self._torch = torch
        self.bias = None
        self.weight = torch.zeros(
            (self.n_local, out_features),
            dtype=dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="row_parallel_w",
        )

    def forward(self, x):
        """Return the all-reduced product of ``x`` and this rank's slice.

        Args:
            x: Rank-local input whose first dimension is the row count
                ``M``; its dtype is reused for the output tensor.

        Returns:
            The ``(M, out_features)`` tensor that received the local GEMM
            and was then passed to ``all_reduce(op="sum")``.
        """
        M = int(x.shape[0])
        y_partial = self._torch.empty(
            (M, self.out_features),
            dtype=x.dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="row_parallel_partial",
        )
        # Kernel dimension order is (rows, inner, cols).
        # NOTE(review): no dtype is passed to the launch, so the kernel runs
        # with its own default element type -- confirm this is intended for
        # layers constructed with a non-default ``dtype``.
        self._torch.launch(
            "row_parallel_gemm",
            _gemm_kernel,
            x, self.weight, y_partial,
            M, self.n_local, self.out_features,
        )
        # Sum the per-rank partial products; the same tensor object is
        # handed to the collective and then returned.
        self._torch.distributed.all_reduce(y_partial, op="sum")
        return y_partial
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
"""Forward/backward mappings stub (ADR-0027 — future backward work).
|
||||||
|
|
||||||
|
Inference-only initial scope. Backward hooks land when training simulation
|
||||||
|
arrives.
|
||||||
|
"""
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
"""TP group state (ADR-0027 D3).
|
||||||
|
|
||||||
|
Single global TP group. Initial scope: TP size == world_size (pure TP;
|
||||||
|
mixed DP+TP is future work).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Size of the single global TP group; ``None`` while the group is
# uninitialised.
_TP_WORLD_SIZE: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_model_parallel(tensor_model_parallel_size: int) -> None:
    """Record the size of the single global TP group.

    Must be called after ``torch.distributed.init_process_group``.  The
    initial scope is pure TP, so the requested size has to match the
    world size reported by the active context.

    Args:
        tensor_model_parallel_size: Requested TP group size.

    Raises:
        NotImplementedError: If the requested size differs from the
            current world size.
    """
    global _TP_WORLD_SIZE

    world_size = _current_world_size()
    if tensor_model_parallel_size != world_size:
        message = (
            f"Only TP == world_size supported; got TP={tensor_model_parallel_size}, "
            f"world_size={world_size}"
        )
        raise NotImplementedError(message)

    _TP_WORLD_SIZE = tensor_model_parallel_size
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_model_parallel_world_size() -> int:
    """Return the recorded TP group size.

    Raises:
        RuntimeError: If :func:`initialize_model_parallel` has not been
            called yet (the recorded size is still ``None``).
    """
    size = _TP_WORLD_SIZE
    if size is not None:
        return size
    raise RuntimeError(
        "TP group not initialised; call initialize_model_parallel() first"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_model_parallel_rank() -> int:
    """Return this worker's rank within the TP group.

    The value is whatever the active context's ``distributed.get_rank``
    reports; per ADR-0024 D9 that is the greenlet-local rank registered
    by the spawn launcher.
    """
    rank = _current_rank()
    return rank
|
||||||
|
|
||||||
|
|
||||||
|
def _reset_for_tests() -> None:
    """Forget the recorded TP size so ordering-sensitive tests can re-init.

    After this call the module is back in its "not initialised" state.
    """
    global _TP_WORLD_SIZE

    _TP_WORLD_SIZE = None
|
||||||
|
|
||||||
|
|
||||||
|
# ── helpers (resolve current ctx) ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _current_ctx():
    """Return the currently active RuntimeContext.

    In KernBench the context is normally handed to bench/worker code as
    the ``torch`` positional.  This module-level helper has no such
    argument, so it asks ``_get_active_context`` for the active one.

    Raises:
        RuntimeError: If no context is active.
    """
    # Imported lazily inside the function; presumably to avoid an import
    # cycle with the runtime package -- confirm before hoisting.
    from kernbench.runtime_api.context import _get_active_context

    active = _get_active_context()
    if active is not None:
        return active
    raise RuntimeError(
        "No active RuntimeContext; kernbench.tp requires one "
        "(call init_process_group / spawn under a live ctx)"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _current_world_size() -> int:
    """Return ``distributed.get_world_size()`` of the active context."""
    dist = _current_ctx().distributed
    return dist.get_world_size()
|
||||||
|
|
||||||
|
|
||||||
|
def _current_rank() -> int:
    """Return ``distributed.get_rank()`` of the active context."""
    dist = _current_ctx().distributed
    return dist.get_rank()
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
"""TP primitive ops (ADR-0027 D6).
|
||||||
|
|
||||||
|
``copy_to_tp_region`` / ``reduce_from_tp_region`` are forward-only in the
|
||||||
|
initial scope (backward pass is future work). ``scatter`` / ``gather`` are
|
||||||
|
not implemented — they require an all-gather kernel that is not yet
|
||||||
|
available in KernBench (see ADR-0027 D9).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def copy_to_tp_region(x: Any) -> Any:
    """Pass ``x`` through unchanged (forward is the identity).

    The matching backward step would be an all-reduce; training is
    future work, so only the forward direction exists here.
    """
    result = x
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_from_tp_region(x: Any, torch: Any) -> Any:
    """Sum-all-reduce ``x`` through ``torch.distributed`` and return it.

    Forward is the all-reduce; the matching backward step would be the
    identity.

    Args:
        x: Tensor handed to ``all_reduce``; the same object is returned.
        torch: Runtime context exposing ``distributed.all_reduce``.
    """
    dist = torch.distributed
    dist.all_reduce(x, op="sum")
    return x
|
||||||
|
|
||||||
|
|
||||||
|
def scatter_to_tp_region(x: Any) -> Any:
    """Not implemented: scatter is deferred (ADR-0027 D9).

    Raises:
        NotImplementedError: Always; callers are told to create the
            sharded tensor directly instead.
    """
    reason = (
        "scatter_to_tp_region deferred — caller should create the sharded "
        "tensor directly (ADR-0027 D9)"
    )
    raise NotImplementedError(reason)
|
||||||
|
|
||||||
|
|
||||||
|
def gather_from_tp_region(x: Any) -> Any:
    """Not implemented: gather is deferred (ADR-0027 D9).

    Raises:
        NotImplementedError: Always; the message names the missing
            all-gather kernel as the reason.
    """
    reason = "gather_from_tp_region deferred — requires all-gather kernel (ADR-0027 D9)"
    raise NotImplementedError(reason)
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user