Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1d8b9401e5 | |||
| cfc2d74ec4 | |||
| 105f1dc09e | |||
| e7f376ebaa | |||
| 357cab525b | |||
| 787409ced1 | |||
| 79124daab1 | |||
| 4ba0a83e71 | |||
| 32536daf2e | |||
| e1084800ab |
@@ -29,3 +29,4 @@ build/
|
|||||||
|
|
||||||
# Logs
|
# Logs
|
||||||
*.log
|
*.log
|
||||||
|
.claude/
|
||||||
|
|||||||
+80
-106
@@ -1,129 +1,103 @@
|
|||||||
"""CCL all-reduce bench — single unified entry point.
|
"""CCL all-reduce bench (ADR-0024 + ADR-0027).
|
||||||
|
|
||||||
Driven entirely by ``ccl.yaml`` + ``topology.yaml``:
|
Pure TP launcher model: rank = SIP. Each rank owns a ``(N_CUBES, n_elem)``
|
||||||
|
tensor sharded row-wise across the cube mesh (pe0 per cube). After
|
||||||
|
``dist.all_reduce(op="sum")`` every cube on every rank must hold
|
||||||
|
``N_CUBES * sum(1..world_size)``. Rank 0 prints the pass/fail line.
|
||||||
|
|
||||||
- ``defaults.algorithm`` in ``ccl.yaml`` picks which kernel to run
|
Driven by ``ccl.yaml`` (``defaults.algorithm``, ``n_elem``) + ``topology.yaml``
|
||||||
(``ring_allreduce_{tcm,hbm,sram}`` / ``mesh_allreduce_4`` /
|
(SIP count → world_size, cube_mesh → N_CUBES).
|
||||||
``tree_allreduce_7``).
|
|
||||||
- ``world_size`` is derived from the algorithm entry's override or from
|
|
||||||
the topology spec (``sips × cubes_per_sip × pes_per_cube``).
|
|
||||||
- The host code uses only real PyTorch ``torch.distributed`` names:
|
|
||||||
``init_process_group``, ``get_world_size``, ``get_rank``, ``all_reduce``.
|
|
||||||
|
|
||||||
The bench is split into ``worker(rank, world_size, torch)`` — the
|
|
||||||
per-rank business logic, designed to look like a real PyTorch DDP
|
|
||||||
training worker so future model benches can reuse the same skeleton —
|
|
||||||
and ``run(torch)`` — the kernbench-specific launcher that initializes
|
|
||||||
the process group and invokes the worker.
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||||
from kernbench.policy.placement.dp import DPPolicy
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
# Default per-rank tile size if ccl.yaml doesn't override it. Real
|
DEFAULT_N_ELEM = 8
|
||||||
# pytorch benches hardcode batch/feature dims similarly.
|
|
||||||
DEFAULT_N_ELEM = 32
|
|
||||||
|
|
||||||
|
|
||||||
def _derive_dp(spec: dict, world_size: int) -> DPPolicy:
|
@dataclass(frozen=True)
|
||||||
"""Pick a DPPolicy that fans the tensor across exactly ``world_size`` PEs.
|
class _BenchCfg:
|
||||||
|
algorithm: str
|
||||||
|
n_elem: int
|
||||||
|
n_cubes: int
|
||||||
|
world_size: int
|
||||||
|
|
||||||
Mirrors what a real PyTorch DDP user does manually with
|
|
||||||
``tensor.to(f"cuda:{rank}")``: the host code chooses the placement so
|
def _resolve_cfg(torch) -> _BenchCfg:
|
||||||
that the collective sees the right number of participating ranks.
|
"""Read ccl.yaml + topology once on the host side."""
|
||||||
"""
|
merged = resolve_algorithm_config(load_ccl_config())
|
||||||
sips = int(spec["system"]["sips"]["count"])
|
ws = torch.distributed.get_world_size()
|
||||||
cm = spec["sip"]["cube_mesh"]
|
spec = torch.spec or {}
|
||||||
pl = spec["cube"]["pe_layout"]
|
n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||||
pes_per_cube = int(pl["pe_per_corner"]) * len(pl["corners"])
|
if ws != n_sips:
|
||||||
cubes_per_sip = int(cm["w"]) * int(cm["h"])
|
raise RuntimeError(
|
||||||
total = sips * cubes_per_sip * pes_per_cube
|
f"ccl_allreduce bench requires world_size == topology SIP count "
|
||||||
if world_size == total:
|
f"(world_size={ws}, n_sips={n_sips})."
|
||||||
return DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
|
|
||||||
if world_size <= pes_per_cube:
|
|
||||||
return DPPolicy(
|
|
||||||
sip="replicate", cube="replicate", pe="column_wise",
|
|
||||||
num_sips=1, num_cubes=1, num_pes=world_size,
|
|
||||||
)
|
)
|
||||||
if world_size <= cubes_per_sip * pes_per_cube:
|
cm = spec.get("sip", {}).get("cube_mesh", {})
|
||||||
return DPPolicy(
|
n_cubes = int(cm.get("w", 4)) * int(cm.get("h", 4))
|
||||||
sip="replicate", cube="column_wise", pe="column_wise",
|
return _BenchCfg(
|
||||||
num_sips=1, num_cubes=world_size // pes_per_cube,
|
algorithm=merged["algorithm"],
|
||||||
)
|
n_elem=int(merged.get("n_elem", DEFAULT_N_ELEM)),
|
||||||
return DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
|
n_cubes=n_cubes,
|
||||||
|
world_size=ws,
|
||||||
|
|
||||||
def worker(rank: int, world_size: int, torch) -> None:
|
|
||||||
"""Per-rank business logic. Mirrors a real PyTorch DDP worker.
|
|
||||||
|
|
||||||
In real PyTorch DDP, this function runs in N separate processes,
|
|
||||||
each with its own ``rank``. In kernbench (single-process multi-device)
|
|
||||||
it is invoked once with ``rank=0`` on the single host driver; the
|
|
||||||
actual per-PE parallelism is handled by ``torch.launch`` fanning out
|
|
||||||
the kernel across all participating PEs via the tensor's DPPolicy.
|
|
||||||
The ``rank`` parameter is therefore always 0 today, and is kept as
|
|
||||||
an explicit argument for parity with real DDP workers (``if rank ==
|
|
||||||
0`` logging guards, future multi-host extensions).
|
|
||||||
"""
|
|
||||||
cfg = resolve_algorithm_config(load_ccl_config())
|
|
||||||
algo_name = cfg["algorithm"]
|
|
||||||
n_elem = int(cfg.get("n_elem", DEFAULT_N_ELEM))
|
|
||||||
|
|
||||||
# Pick a DP that produces exactly ``world_size`` shards on this topology.
|
|
||||||
dp = _derive_dp(torch.spec, world_size)
|
|
||||||
tensor = torch.zeros(
|
|
||||||
(1, world_size * n_elem), dtype="f16", dp=dp, name="ccl_in",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize: CCL rank r's slice gets value (r + 1). Real PyTorch idiom:
|
|
||||||
# target.copy_(torch.from_numpy(source))
|
|
||||||
init = np.zeros((1, world_size * n_elem), dtype=np.float16)
|
|
||||||
for r in range(world_size):
|
|
||||||
init[0, r * n_elem : (r + 1) * n_elem] = float(r + 1)
|
|
||||||
tensor.copy_(torch.from_numpy(init))
|
|
||||||
|
|
||||||
# The main act: one all_reduce call — the backend installs IPCQ at
|
def _rank_dp(n_cubes: int) -> DPPolicy:
|
||||||
# init_process_group time and here only dispatches the kernel.
|
return DPPolicy(cube="row_wise", pe="replicate", num_cubes=n_cubes, num_pes=1)
|
||||||
|
|
||||||
|
|
||||||
|
def _allocate_rank_tensor(torch, rank: int, cfg: _BenchCfg):
|
||||||
|
"""Allocate this rank's ``(n_cubes, n_elem)`` tensor on its SIP."""
|
||||||
|
return torch.zeros(
|
||||||
|
(cfg.n_cubes, cfg.n_elem), dtype="f16",
|
||||||
|
dp=_rank_dp(cfg.n_cubes), name=f"ccl_in_r{rank}",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _init_with_rank_value(torch, tensor, rank: int, cfg: _BenchCfg) -> None:
|
||||||
|
"""Fill all cubes with the scalar ``rank + 1``."""
|
||||||
|
arr = np.full((cfg.n_cubes, cfg.n_elem), float(rank + 1), dtype=np.float16)
|
||||||
|
tensor.copy_(torch.from_numpy(arr))
|
||||||
|
|
||||||
|
|
||||||
|
def _report(result: np.ndarray, cfg: _BenchCfg) -> None:
|
||||||
|
"""Single-line pass/fail printer (rank 0 only)."""
|
||||||
|
expected = float(cfg.n_cubes * sum(range(1, cfg.world_size + 1)))
|
||||||
|
ok = True
|
||||||
|
for cube_id in range(cfg.n_cubes):
|
||||||
|
if not np.allclose(result[cube_id], expected, rtol=1e-1, atol=1e-1):
|
||||||
|
ok = False
|
||||||
|
break
|
||||||
|
if ok:
|
||||||
|
total = cfg.world_size * cfg.n_cubes
|
||||||
|
print(f" {cfg.algorithm} (ws={cfg.world_size}): {total} OK")
|
||||||
|
return
|
||||||
|
got = float(result.reshape(-1).mean())
|
||||||
|
print(
|
||||||
|
f" [FAIL] {cfg.algorithm} (ws={cfg.world_size}): "
|
||||||
|
f"got mean={got:.3f}, expected={expected:.3f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _worker(rank: int, cfg: _BenchCfg, torch) -> None:
|
||||||
|
torch.ahbm.set_device(rank)
|
||||||
|
tensor = _allocate_rank_tensor(torch, rank, cfg)
|
||||||
|
_init_with_rank_value(torch, tensor, rank, cfg)
|
||||||
torch.distributed.all_reduce(tensor, op="sum")
|
torch.distributed.all_reduce(tensor, op="sum")
|
||||||
|
|
||||||
# Verify: each shard should hold sum(1..world_size) after all-reduce.
|
|
||||||
result = tensor.numpy()
|
|
||||||
expected = float(sum(range(1, world_size + 1)))
|
|
||||||
all_ok = bool(np.allclose(result, expected, rtol=1e-1, atol=1e-1))
|
|
||||||
|
|
||||||
# Print only on rank 0 — real PyTorch DDP idiom for single-source logs.
|
|
||||||
if rank == 0:
|
if rank == 0:
|
||||||
if all_ok:
|
_report(tensor.numpy(), cfg)
|
||||||
print(f" {algo_name} (ws={world_size}): {world_size} OK")
|
|
||||||
else:
|
|
||||||
flat = result.reshape(-1)
|
|
||||||
n_fail = 0
|
|
||||||
for r in range(world_size):
|
|
||||||
slice_r = flat[r * n_elem : (r + 1) * n_elem]
|
|
||||||
if not np.allclose(slice_r, expected, rtol=1e-1, atol=1e-1):
|
|
||||||
n_fail += 1
|
|
||||||
if n_fail <= 5:
|
|
||||||
print(
|
|
||||||
f" [FAIL] rank {r} "
|
|
||||||
f"(ws={world_size}, algo={algo_name}): "
|
|
||||||
f"got mean={float(slice_r.mean()):.3f}, "
|
|
||||||
f"expected={expected:.3f}"
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
f" {algo_name} (ws={world_size}): "
|
|
||||||
f"{world_size - n_fail} OK / {n_fail} FAIL"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def run(torch) -> None:
|
def run(torch) -> None:
|
||||||
"""CLI entry point: initialize the process group, invoke worker."""
|
torch.distributed.init_process_group(backend="ahbm")
|
||||||
dist = torch.distributed
|
cfg = _resolve_cfg(torch)
|
||||||
dist.init_process_group(backend="ahbm")
|
torch.multiprocessing.spawn(
|
||||||
worker(
|
_worker, args=(cfg, torch), nprocs=cfg.world_size,
|
||||||
rank=dist.get_rank(),
|
|
||||||
world_size=dist.get_world_size(),
|
|
||||||
torch=torch,
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
Full host-to-PE pipeline:
|
Full host-to-PE pipeline:
|
||||||
Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
|
Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM
|
||||||
|
|
||||||
Single PE: num_sips=1, num_cubes=1, num_pes=1 via DPPolicy override.
|
Single PE: num_cubes=1, num_pes=1 via DPPolicy override.
|
||||||
Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
|
Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams
|
||||||
per-tile DMA internally.
|
per-tile DMA internally.
|
||||||
|
|
||||||
@@ -30,7 +30,7 @@ def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
|||||||
def run(torch):
|
def run(torch):
|
||||||
"""Run the single-PE GEMM benchmark."""
|
"""Run the single-PE GEMM benchmark."""
|
||||||
dp = DPPolicy(cube="replicate", pe="replicate",
|
dp = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=1, num_cubes=1, num_pes=1)
|
num_cubes=1, num_pes=1)
|
||||||
|
|
||||||
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
|
a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a")
|
||||||
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
|
b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b")
|
||||||
|
|||||||
+8
-4
@@ -72,12 +72,16 @@ def run(torch):
|
|||||||
K = GPT3_D_MODEL
|
K = GPT3_D_MODEL
|
||||||
N = COLS_PER_PE
|
N = COLS_PER_PE
|
||||||
|
|
||||||
# X: replicated across all PEs
|
# ADR-0026: DPPolicy is intra-device only. For multi-SIP execution the
|
||||||
|
# ADR-0024 launcher calls this bench once per SIP (each worker binds to its SIP via
|
||||||
|
# torch.ahbm.set_device(rank)); here the policy describes only the
|
||||||
|
# cube × PE layout within a single SIP.
|
||||||
|
# X: replicated across all PEs within the SIP
|
||||||
dp_replicate = DPPolicy(cube="replicate", pe="replicate",
|
dp_replicate = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
||||||
# W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs
|
# W_Q/K/V, out_Q/K/V: column-wise sharded across all PEs within the SIP
|
||||||
dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
|
dp_sharded = DPPolicy(cube="column_wise", pe="column_wise",
|
||||||
num_sips=N_SIPS, num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
num_cubes=N_CUBES, num_pes=N_PE_PER_CUBE)
|
||||||
|
|
||||||
x = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
|
x = torch.empty((M, K), dtype=DTYPE, dp=dp_replicate, name="x")
|
||||||
wq = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
|
wq = torch.empty((K, GPT3_D_MODEL), dtype=DTYPE, dp=dp_sharded, name="wq")
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
"""VA offset verification benchmark.
|
"""VA offset verification benchmark.
|
||||||
|
|
||||||
Verifies that Triton-style base_ptr + pid * stride addressing works correctly
|
Verifies that Triton-style base_ptr + pid * stride addressing works correctly
|
||||||
with full TP sharding (sip/cube/pe all column_wise). Each PE loads its own
|
with intra-SIP TP sharding (cube/pe column_wise). Each PE loads its own
|
||||||
block from a sharded tensor and stores it back.
|
block from a sharded tensor and stores it back.
|
||||||
|
|
||||||
The kernel uses standard Triton patterns:
|
The kernel uses standard Triton patterns:
|
||||||
@@ -28,7 +28,7 @@ def _copy_kernel(src_ptr, dst_ptr, M, K, tl, DTYPE="f16"):
|
|||||||
|
|
||||||
def run(torch):
|
def run(torch):
|
||||||
"""Run the VA offset verification benchmark with full TP sharding."""
|
"""Run the VA offset verification benchmark with intra-SIP TP sharding."""
|
||||||
dp = DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
|
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||||
src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
|
src = torch.zeros((M, K), dtype=DTYPE, dp=dp, name="src")
|
||||||
dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
|
dst = torch.empty((M, K), dtype=DTYPE, dp=dp, name="dst")
|
||||||
|
|
||||||
|
|||||||
@@ -6,12 +6,7 @@
|
|||||||
|
|
||||||
defaults:
|
defaults:
|
||||||
# Algorithm to run for this benchmark execution.
|
# Algorithm to run for this benchmark execution.
|
||||||
algorithm: ring_allreduce_tcm
|
algorithm: intercube_allreduce
|
||||||
|
|
||||||
# NOTE: world_size is not set here by default. AhbmCCLBackend derives it
|
|
||||||
# from the chosen algorithm's entry (if it sets ``world_size``) or from
|
|
||||||
# topology.yaml (``sips × cubes_per_sip × pes_per_cube``). This mirrors
|
|
||||||
# real PyTorch DDP where ranks/world_size come from env vars, not code.
|
|
||||||
|
|
||||||
# IPCQ ring buffer location.
|
# IPCQ ring buffer location.
|
||||||
# tcm — PE-local TCM (fast, small, conflicts with compute TCM access)
|
# tcm — PE-local TCM (fast, small, conflicts with compute TCM access)
|
||||||
@@ -30,59 +25,21 @@ defaults:
|
|||||||
# Slot size in bytes (must hold one tile worth of data).
|
# Slot size in bytes (must hold one tile worth of data).
|
||||||
slot_size: 4096
|
slot_size: 4096
|
||||||
|
|
||||||
# PE_DMA virtual channel chunk size (D8). First implementation does not
|
# PE_DMA virtual channel chunk size (D8).
|
||||||
# use chunk-level interleave; this is reserved for future precision.
|
|
||||||
vc_chunk_size: 256
|
vc_chunk_size: 256
|
||||||
|
|
||||||
# Credit return fast path message size (D9). Used by bottleneck-BW
|
# Credit return fast path message size (D9).
|
||||||
# latency calculation. 16-64 bytes typical.
|
|
||||||
ipcq_credit_size_bytes: 16
|
ipcq_credit_size_bytes: 16
|
||||||
|
|
||||||
algorithms:
|
algorithms:
|
||||||
# ── ring all-reduce, buffer in PE_TCM ──
|
# ── intercube all-reduce (pe0-only, cube mesh + inter-SIP) ──
|
||||||
# Defaults to topology-derived world_size (full system, 256 ranks).
|
# Reduces across the 4×4 cube mesh within each SIP, then inter-SIP
|
||||||
# Use a smaller tile size at high rank counts so f16 sums stay within
|
# exchange on root cube, then broadcast back. SIP topology is read
|
||||||
# the verification tolerance and op_log replay scales.
|
# from topology.yaml → system.sips.topology. Kernel auto-selects
|
||||||
ring_allreduce_tcm:
|
# ring / torus / mesh inter-SIP exchange pattern.
|
||||||
module: kernbench.ccl.algorithms.ring_allreduce
|
intercube_allreduce:
|
||||||
topology: ring_1d
|
module: kernbench.ccl.algorithms.intercube_allreduce
|
||||||
buffer_kind: tcm
|
|
||||||
n_elem: 8
|
|
||||||
|
|
||||||
# ── ring all-reduce, buffer in PE-local HBM ──
|
|
||||||
ring_allreduce_hbm:
|
|
||||||
module: kernbench.ccl.algorithms.ring_allreduce
|
|
||||||
topology: ring_1d
|
|
||||||
buffer_kind: hbm
|
|
||||||
n_elem: 8
|
|
||||||
|
|
||||||
# ── ring all-reduce, buffer in cube SRAM ──
|
|
||||||
ring_allreduce_sram:
|
|
||||||
module: kernbench.ccl.algorithms.ring_allreduce
|
|
||||||
topology: ring_1d
|
|
||||||
buffer_kind: sram
|
|
||||||
n_elem: 8
|
|
||||||
|
|
||||||
# ── 2D mesh all-reduce: perfect square only (2×2 = 4 PEs) ──
|
|
||||||
mesh_allreduce_4:
|
|
||||||
module: kernbench.ccl.algorithms.mesh_allreduce
|
|
||||||
topology: mesh_2d
|
|
||||||
buffer_kind: tcm
|
|
||||||
world_size: 4
|
|
||||||
n_elem: 16
|
|
||||||
|
|
||||||
# ── tree all-reduce (binary, 7 PEs) ──
|
|
||||||
tree_allreduce_7:
|
|
||||||
module: kernbench.ccl.algorithms.tree_allreduce
|
|
||||||
topology: tree_binary
|
|
||||||
buffer_kind: tcm
|
|
||||||
world_size: 7
|
|
||||||
n_elem: 16
|
|
||||||
|
|
||||||
# ── hierarchical all-reduce (3-level: intra-cube → inter-cube → inter-SIP) ──
|
|
||||||
# Uses bidirectional ring reduce + chain broadcast. ~25 rounds vs 255 flat.
|
|
||||||
hierarchical_allreduce:
|
|
||||||
module: kernbench.ccl.algorithms.hierarchical_allreduce
|
|
||||||
topology: none
|
topology: none
|
||||||
buffer_kind: tcm
|
buffer_kind: tcm
|
||||||
n_elem: 16
|
n_elem: 8
|
||||||
|
root_cube: 15
|
||||||
|
|||||||
@@ -0,0 +1,990 @@
|
|||||||
|
# ADR-0024: SIP-level TP Launcher — rank = SIP (host-driven dispatch)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed (Revision 8 — Hierarchical content split out to ADR-0029)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
`torch.distributed` collective 호출의 참여 단위(rank)를 **SIP**(device)
|
||||||
|
경계에 맞춘다. 실제 PyTorch DDP/TP 스크립트와 **호스트 레벨에서 구분 없이**
|
||||||
|
읽히는 bench 코드를 목표로 한다.
|
||||||
|
|
||||||
|
real PyTorch와 비교:
|
||||||
|
|
||||||
|
| 차원 | real PyTorch | KernBench (이 ADR 이후) |
|
||||||
|
|---|---|---|
|
||||||
|
| 프로세스 모델 | N개 프로세스, 각 1 GPU | 1 프로세스, N greenlet, 각 1 SIP |
|
||||||
|
| `get_rank()` | `RANK` env var | greenlet-local 레지스트리 |
|
||||||
|
| `get_world_size()` | `WORLD_SIZE` env var | topology의 SIP 수 |
|
||||||
|
| `torch.cuda.set_device(r)` (real) / `torch.ahbm.set_device(r)` (KernBench) | rank → GPU | rank → SIP |
|
||||||
|
| `mp.spawn` | OS 프로세스 fork | greenlet fan-out |
|
||||||
|
|
||||||
|
### 설계 원칙 — 공개 API의 추상화, 내부는 기존 path 활용
|
||||||
|
|
||||||
|
**공개 API (bench worker) 수준의 추상화**:
|
||||||
|
```
|
||||||
|
rank = SIP
|
||||||
|
DPPolicy = intra-device (cube × PE) 분산만
|
||||||
|
dist.all_reduce, torch.ahbm.set_device, mp.spawn 등 PyTorch-style 표면
|
||||||
|
```
|
||||||
|
|
||||||
|
**Framework 내부 구현**:
|
||||||
|
```
|
||||||
|
build_install_plans (host): topology + mapper + algorithm → SipInstallPlan
|
||||||
|
↓
|
||||||
|
backend (host): plan의 per-PE spec을 engine.submit으로 IpcqInitMsg 디스패치
|
||||||
|
↓
|
||||||
|
engine: 기존 PE-scoped routing (MmuMapMsg 등과 동일 경로)
|
||||||
|
↓
|
||||||
|
PE_IPCQ: 자체 message loop에서 IpcqInitMsg 처리 (기존 capability)
|
||||||
|
```
|
||||||
|
|
||||||
|
**핵심**: 새 message 타입이나 IO_CPU 확장 없음. 기존 engine routing과 기존
|
||||||
|
`IpcqInitMsg` 타입을 그대로 사용. 기존의 "sideband direct call" 우회만
|
||||||
|
제거하여 convention 일원화.
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
- `DistributedContext` facade 존재
|
||||||
|
- `init_process_group("ahbm")` → `AhbmCCLBackend`가 `ctx.install_ipcq` 호출
|
||||||
|
→ `ccl/install.py`가 **sideband direct call** (`pe_ipcq._install_neighbors`)로
|
||||||
|
PE_IPCQ에 neighbor table 설치
|
||||||
|
- `get_rank()` 항상 `0` (single-driver)
|
||||||
|
- `get_world_size()` fallback: 총 PE 수 (rank = PE)
|
||||||
|
- `benches/ccl_allreduce.py`: `worker(rank=0, world_size=total_PEs)` 1회 호출
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **공개 API에서 rank = SIP** — bench worker가 PE 개념을 알지 않도록.
|
||||||
|
2. **Multi-worker 실행** — N개 rank가 독립 worker 코드 실행. 1 프로세스 제약
|
||||||
|
하에서 greenlet + barrier 동기화.
|
||||||
|
3. **Cross-rank collective submit 동기화** — 첫 rank가 혼자 wait하면 peer 부재로
|
||||||
|
SimPy deadlock. 모든 rank submit 후 drain 보장.
|
||||||
|
4. **기존 sideband install 제거** — IpcqInitMsg를 engine.submit으로 일원화.
|
||||||
|
MmuMapMsg 등 다른 control-plane 메시지와 동일 패턴.
|
||||||
|
5. **Algorithm / mapper / validator 분리** — 알고리즘 모듈은 kernel 코드만
|
||||||
|
담고, topology / mapping / validation은 registry + 선언.
|
||||||
|
|
||||||
|
### Non-problem (이 ADR 밖)
|
||||||
|
|
||||||
|
- IPCQ direction addressing fix → **ADR-0025**
|
||||||
|
- `DPPolicy.sip`/`num_sips` 제거 → **ADR-0026**
|
||||||
|
- Megatron-style TP → **ADR-0027**
|
||||||
|
- DTensor → **ADR-0028 (future)**
|
||||||
|
- **IO_CPU를 SIP-level control-plane 단일 endpoint로 승격**: 이 ADR에서는
|
||||||
|
invariant으로 채택하지 않음. 현재 KernBench에 해당 원칙이 없고, 단독으로
|
||||||
|
도입하기엔 정당화가 약함. 미래에 control-plane latency 모델링 정밀도 요구가
|
||||||
|
생기면 별도 ADR.
|
||||||
|
|
||||||
|
### TODO (이 ADR 구현 이후)
|
||||||
|
|
||||||
|
- Tensor Parallelism (ADR-0027)
|
||||||
|
- Hierarchical all-reduce 알고리즘 설계 (ADR-0029) — 본 ADR의 mapper /
|
||||||
|
validator registry 인프라를 활용하는 첫 사례
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. rank = SIP (world_size 해석)
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _resolve_world_size(self) -> int:
|
||||||
|
if "world_size" in self._merged:
|
||||||
|
return int(self._merged["world_size"])
|
||||||
|
defaults = self._cfg_all.get("defaults", {})
|
||||||
|
if "world_size" in defaults:
|
||||||
|
return int(defaults["world_size"])
|
||||||
|
spec = self.ctx.spec or {}
|
||||||
|
return int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||||
|
```
|
||||||
|
|
||||||
|
우선순위: 알고리즘 override > defaults override > SIP count. `ccl.yaml`
|
||||||
|
override는 legacy "rank = PE" 테스트 경로로 유지.
|
||||||
|
|
||||||
|
### D2. Install 경로 — engine.submit 일원화
|
||||||
|
|
||||||
|
`ccl/install.py`의 sideband direct call을 제거하고, `IpcqInitMsg`를
|
||||||
|
`engine.submit`으로 보낸다. MmuMapMsg / MemoryWriteMsg 등이 이미 동일 패턴.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Backend (AhbmCCLBackend.__init__ 또는 init_process_group 시점)
|
||||||
|
from kernbench.ccl.install_plan import build_install_plans
|
||||||
|
|
||||||
|
plans = build_install_plans(
|
||||||
|
world_size=self._world_size,
|
||||||
|
algorithm=self._merged["algorithm"],
|
||||||
|
algorithm_config=self._merged,
|
||||||
|
spec=self.ctx.spec,
|
||||||
|
)
|
||||||
|
self._plans = plans
|
||||||
|
|
||||||
|
# Each PE_IPCQ가 자기 neighbor table을 받도록 engine 경유 submit
|
||||||
|
handles = []
|
||||||
|
for plan in plans:
|
||||||
|
for pe_install in plan.pe_installs:
|
||||||
|
h = self.ctx.submit(IpcqInitMsg(
|
||||||
|
correlation_id=self.ctx.correlation_id,
|
||||||
|
request_id=f"ipcq_init_s{plan.sip}c{pe_install.cube}p{pe_install.pe}",
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(pe_install.cube,),
|
||||||
|
target_pe=pe_install.pe,
|
||||||
|
entries=pe_install.neighbors,
|
||||||
|
buffer_kind=plan.buffer_kind,
|
||||||
|
n_slots=plan.n_slots,
|
||||||
|
slot_size=plan.slot_size,
|
||||||
|
# ... (기존 IpcqInitMsg 필드)
|
||||||
|
))
|
||||||
|
handles.append(h)
|
||||||
|
|
||||||
|
# Eager install — init_process_group이 반환하기 전에 완료 보장
|
||||||
|
for h in handles:
|
||||||
|
self.ctx.wait(h)
|
||||||
|
```
|
||||||
|
|
||||||
|
**PE_IPCQ 컴포넌트**는 이미 `IpcqInitMsg`를 main loop에서 처리 (`pe_ipcq.py`
|
||||||
|
라인 145-147). 변경 불필요. 유일한 차이는 "message가 sideband Python call이
|
||||||
|
아니라 engine queue를 거쳐 도착한다"는 점.
|
||||||
|
|
||||||
|
**Correctness invariant (equivalence)**: `init_process_group()`은 모든
|
||||||
|
install handle을 `wait()`한 후 반환하므로 launch-before-install 문제는
|
||||||
|
구조적으로 없다. 남는 correctness 질문은 단 하나:
|
||||||
|
|
||||||
|
> Engine-routed `IpcqInitMsg` 처리가 기존 sideband
|
||||||
|
> `pe_ipcq._install_neighbors(msg)` 호출과 **동일한 최종 PE_IPCQ 상태**를
|
||||||
|
> 생성하는가.
|
||||||
|
|
||||||
|
검증 포인트 (T3 참고):
|
||||||
|
|
||||||
|
1. **State equivalence**: `_install_neighbors()` 내부 상태 전이가 engine
|
||||||
|
dispatch path에서도 동일하게 일어나 최종 PE_IPCQ state
|
||||||
|
(`_queue_pairs`, `_installed`, `_credit_inbox` 등)가 일치.
|
||||||
|
|
||||||
|
2. **Sideband-only side effect 부재**: Sideband path에서만 있던 부수 효과가
|
||||||
|
없음 (예: engine.submit이 설정하는 request_id / correlation tracking 등이
|
||||||
|
install semantics를 왜곡하지 않음).
|
||||||
|
|
||||||
|
3. **Ordering independence**: 서로 다른 PE들의 install message가 engine
|
||||||
|
큐에서 임의 순서로 처리되어도 최종 상태가 동일. 즉 install은 **PE별
|
||||||
|
독립 연산**이어야 하고, cross-PE 순서 의존성이 있으면 안 됨.
|
||||||
|
|
||||||
|
4. **Idempotency**: 동일 PE에 대해 `IpcqInitMsg`가 두 번 도착하면? 현재
|
||||||
|
설계 전제는 "per-PE 단 한 번 install". 중복 install 시 동작은 정의되지
|
||||||
|
않음. 보수적 정책:
|
||||||
|
- 최초 install 시 `_installed = True`로 전이
|
||||||
|
- 이후 중복 install msg는 **에러** (raise) 또는 **silent idempotent**
|
||||||
|
(no-op) 둘 중 하나로 명시
|
||||||
|
- Recommendation: **raise** (명시적 에러 → 버그 조기 검출). T3에 duplicate
|
||||||
|
install 케이스 추가.
|
||||||
|
|
||||||
|
5. **Partial install visibility**: 일부 PE만 install 완료된 중간 상태가
|
||||||
|
외부에 observable한가? 현재 구조에서는 `init_process_group()`의 eager
|
||||||
|
wait-all이 barrier 역할을 하므로 partial state는 bench 코드에 노출되지
|
||||||
|
않음. 단, debugging / introspection API는 중간 상태를 볼 수 있음 (문제
|
||||||
|
아님, 문서화만).
|
||||||
|
|
||||||
|
**Timing 영향**: Engine-routed install은 `init_process_group()`이 SimPy 시간을
|
||||||
|
소비하게 만든다. 기존 sideband install은 사실상 zero-cost. ADR 계약:
|
||||||
|
|
||||||
|
> Benchmarks must not rely on zero-cost initialization.
|
||||||
|
> `init_process_group()` consumes simulated time proportional to the number
|
||||||
|
> of participating PEs × per-PE install latency. The first collective call
|
||||||
|
> starts at a well-defined but non-zero sim time.
|
||||||
|
|
||||||
|
### D3. Launch 경로 — non-CCL 커널과 동일 primitive
|
||||||
|
|
||||||
|
**CCL 커널은 non-CCL 커널과 동일한 `KernelLaunchMsg` submission path를 쓴다.**
|
||||||
|
Engine 내부의 IO_CPU/M_CPU transit 같은 것은 **기존 구현 세부이지 CCL-specific
|
||||||
|
장치가 아님**. Backend는 plan의 `participating_pes` 목록을 돌면서 `KernelLaunchMsg`를
|
||||||
|
submit할 뿐이다. 새 메시지 타입 없음, 새 라우팅 경로 없음.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# AhbmCCLBackend.all_reduce
|
||||||
|
def all_reduce(self, tensor, op="sum"):
|
||||||
|
if op != "sum":
|
||||||
|
raise NotImplementedError(...)
|
||||||
|
if tensor._handle is None or not tensor._handle.shards:
|
||||||
|
raise RuntimeError(...)
|
||||||
|
|
||||||
|
# Validator — global handle 기준 (D8)
|
||||||
|
validator_name = self._merged.get("validator")
|
||||||
|
if validator_name:
|
||||||
|
resolve_validator(validator_name)(tensor._handle, self._world_size, self.ctx.spec)
|
||||||
|
|
||||||
|
rank = self.ctx.distributed.get_rank()
|
||||||
|
plan = self._plans[rank]
|
||||||
|
tensor_view = _tensor_slice_for_sip(tensor._handle, plan.sip)
|
||||||
|
|
||||||
|
# Plan에서 kernel args 계산 (host-side)
|
||||||
|
import importlib
|
||||||
|
mod = importlib.import_module(plan.kernel_module)
|
||||||
|
n_elem = tensor_view.shards[0].nbytes // tensor.itemsize
|
||||||
|
kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
|
||||||
|
**plan.kernel_config)
|
||||||
|
|
||||||
|
def _submit():
|
||||||
|
out = []
|
||||||
|
for (cube, pe) in plan.participating_pes:
|
||||||
|
h = self.ctx.submit(KernelLaunchMsg(
|
||||||
|
correlation_id=self.ctx.correlation_id,
|
||||||
|
request_id=f"allreduce_r{rank}_c{cube}p{pe}",
|
||||||
|
kernel_ref=KernelRef(name=plan.algorithm_name, kind="builtin"),
|
||||||
|
args=(_tensor_arg_for_pe(tensor_view, cube, pe), *kargs),
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(cube,),
|
||||||
|
target_pe=pe,
|
||||||
|
))
|
||||||
|
out.append(h)
|
||||||
|
return out
|
||||||
|
|
||||||
|
self._barrier.submit_and_drain(self.ctx, rank, _submit)
|
||||||
|
```
|
||||||
|
|
||||||
|
### D4. Algorithm ABI — 얇게 + 명시적 arg 계약
|
||||||
|
|
||||||
|
각 알고리즘 모듈은 **kernel + kernel_args만 필수**.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/algorithms/ring_allreduce.py
|
||||||
|
def kernel(t_ptr, n_elem, world_size, tl):
|
||||||
|
"""PE-side kernel code.
|
||||||
|
|
||||||
|
Signature convention: first positional arg is the tensor pointer
|
||||||
|
(per-PE slice), subsequent positional args are whatever
|
||||||
|
kernel_args() returns. `tl` is injected by the TLContext runtime.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def kernel_args(*, n_elem: int, world_size: int, **kw) -> tuple:
|
||||||
|
"""Return the tuple of non-tensor positional args.
|
||||||
|
|
||||||
|
Signature contract:
|
||||||
|
- Called keyword-only with n_elem and world_size plus kernel_config.
|
||||||
|
- Returns a tuple (possibly empty) of scalar / metadata args.
|
||||||
|
- The backend constructs the final KernelLaunchMsg.args as:
|
||||||
|
(per_pe_tensor_arg, *kernel_args(...))
|
||||||
|
where per_pe_tensor_arg is a TensorArg containing only the shards
|
||||||
|
local to the receiving PE (derived from tensor_view).
|
||||||
|
"""
|
||||||
|
return (n_elem, world_size)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Arg assembly in backend (reference)**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# AhbmCCLBackend.all_reduce (D3에서 발췌)
|
||||||
|
kargs = mod.kernel_args(n_elem=n_elem, world_size=plan.world_size,
|
||||||
|
**plan.kernel_config)
|
||||||
|
for (cube, pe) in plan.participating_pes:
|
||||||
|
pe_tensor_arg = _tensor_arg_for_pe(tensor_view, cube, pe)
|
||||||
|
self.ctx.submit(KernelLaunchMsg(
|
||||||
|
args=(pe_tensor_arg, *kargs), # tensor first, then kernel_args return
|
||||||
|
target_sips=(plan.sip,),
|
||||||
|
target_cubes=(cube,),
|
||||||
|
target_pe=pe,
|
||||||
|
...
|
||||||
|
))
|
||||||
|
```
|
||||||
|
|
||||||
|
**ccl.yaml**에서 선언적 metadata:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
algorithms:
|
||||||
|
ring_allreduce_tcm:
|
||||||
|
module: kernbench.ccl.algorithms.ring_allreduce
|
||||||
|
topology: ring_1d # kernbench/ccl/topologies.py
|
||||||
|
mapper: leader_only # kernbench/ccl/mappers.py (신규)
|
||||||
|
validator: single_shard_per_rank # kernbench/ccl/validators.py (신규)
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 8
|
||||||
|
```
|
||||||
|
|
||||||
|
- `topology` (필수)
|
||||||
|
- `mapper` (선택, default `"leader_only"`)
|
||||||
|
- `validator` (선택)
|
||||||
|
|
||||||
|
알고리즘 모듈 자체에는 mapper/validator/participating_pes/neighbor
|
||||||
|
생성기가 **들어가지 않음**.
|
||||||
|
|
||||||
|
### D5. Mapper + validator — registry key **또는** import path
|
||||||
|
|
||||||
|
Host-side framework가 built-in registry 제공. 커스텀 확장은 dot-import path.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/mappers.py (new)
from typing import Callable

# A mapper receives (topology spec, rank) and returns the (cube, pe) pairs
# that take part in the collective on that rank's SIP.
Mapper = Callable[[dict, int], list[tuple[int, int]]]


def leader_only(spec, rank):
    """Return the single leader PE ``(cube 0, pe 0)``; for ring/tree/mesh.

    ``spec`` and ``rank`` are accepted only to satisfy the ``Mapper``
    signature; neither is consulted.
    """
    return [(0, 0)]


def all_pes(spec, rank):
    """Return every ``(cube, pe)`` pair of one SIP, in cube-major order.

    Use when the algorithm involves all intra-SIP PEs (e.g. intra-SIP
    reduction, intra-SIP broadcast, the lower level of a hierarchical
    collective).

    The cube count is ``cube_mesh.w * cube_mesh.h`` and the per-cube PE count
    is ``pe_per_corner * len(corners)``, both read from ``spec``. ``rank`` is
    not consulted.

    Raises:
        KeyError: if ``spec`` lacks the ``sip.cube_mesh`` / ``cube.pe_layout``
            entries read here.
    """
    cm = spec["sip"]["cube_mesh"]
    pl = spec["cube"]["pe_layout"]
    n_cubes = cm["w"] * cm["h"]
    n_pes = pl["pe_per_corner"] * len(pl["corners"])
    return [(c, p) for c in range(n_cubes) for p in range(n_pes)]


MAPPER_REGISTRY = {"leader_only": leader_only, "all_pes": all_pes}


def resolve_mapper(key_or_path: str) -> Mapper:
    """Resolve a built-in registry key or a dotted import path to a mapper.

    Registry keys win over import paths. A value containing a dot is treated
    as ``"pkg.mod.fn"`` and imported.

    Raises:
        ValueError: for anything that does not resolve to a callable — an
            unknown key, a module that cannot be imported, a missing
            attribute, or a non-callable attribute. The underlying import
            error, if any, is chained as ``__cause__``.
    """
    if key_or_path in MAPPER_REGISTRY:
        return MAPPER_REGISTRY[key_or_path]
    if "." in key_or_path:
        import importlib

        mod_path, fn_name = key_or_path.rsplit(".", 1)
        try:
            mapper = getattr(importlib.import_module(mod_path), fn_name)
        except (ImportError, AttributeError, ValueError) as err:
            # Keep one exception type for every "cannot resolve" outcome so
            # callers do not have to special-case dotted paths.
            raise ValueError(f"unknown mapper: {key_or_path!r}") from err
        if not callable(mapper):
            raise ValueError(
                f"unknown mapper: {key_or_path!r} (resolved object is not callable)"
            )
        return mapper
    raise ValueError(f"unknown mapper: {key_or_path!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Validator도 동일 패턴 (`src/kernbench/ccl/validators.py`). 입력은 **global
|
||||||
|
TensorHandle** (D8 참고).
|
||||||
|
|
||||||
|
### D6. Host-side install plan builder
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/install_plan.py (new; 기존 install.py의 재구성)
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Mapping
|
||||||
|
|
||||||
|
# One neighbor-table row for a PE: the local direction plus the peer it points at.
@dataclass(frozen=True)
|
||||||
|
class NeighborTableEntry:
|
||||||
|
direction: str # local direction label of this entry
|
||||||
|
peer_direction: str # ADR-0025 — NOTE(review): ADR-0025 Revision 2 states the peer_direction field was dropped; confirm this field should remain
|
||||||
|
peer_sip: int # peer coordinate: SIP
|
||||||
|
peer_cube: int # peer coordinate: cube
|
||||||
|
peer_pe: int # peer coordinate: PE
|
||||||
|
rx_base_pa: int # presumably the peer's rx buffer base address for this link — TODO confirm
|
||||||
|
# ... other IPCQ settings ...
|
||||||
|
|
||||||
|
# Install payload for one PE: its (cube, pe) coordinate and its neighbor table.
@dataclass(frozen=True)
|
||||||
|
class PeInstallSpec:
|
||||||
|
cube: int # cube coordinate; install_ipcq passes it as target_cubes
|
||||||
|
pe: int # PE coordinate; install_ipcq passes it as target_pe
|
||||||
|
neighbors: tuple[NeighborTableEntry, ...] # sent as IpcqInitMsg.entries by install_ipcq
|
||||||
|
|
||||||
|
# Host-side install plan for one rank/SIP, produced by build_install_plans.
@dataclass(frozen=True)
|
||||||
|
class SipInstallPlan:
|
||||||
|
algorithm_name: str # human-readable ("ring_allreduce_tcm")
|
||||||
|
sip: int # target SIP; build_install_plans currently sets sip = rank
|
||||||
|
rank: int # rank this plan belongs to
|
||||||
|
world_size: int # total number of ranks in the collective
|
||||||
|
pe_installs: tuple[PeInstallSpec, ...] # per-PE neighbor tables
|
||||||
|
buffer_kind: str # taken from algorithm_config["buffer_kind"]
|
||||||
|
n_slots: int # taken from algorithm_config["n_slots"]
|
||||||
|
slot_size: int # taken from algorithm_config["slot_size"]
|
||||||
|
kernel_module: str # taken from algorithm_config["module"]
|
||||||
|
participating_pes: tuple[tuple[int, int], ...] # (cube, pe) pairs returned by the mapper
|
||||||
|
kernel_config: Mapping[str, Any] # algorithm-specific params (n_elem, reduce_op, chunk_size, kernel_*)
|
||||||
|
|
||||||
|
|
||||||
|
def build_install_plans(
    world_size: int,
    algorithm: str,
    algorithm_config: dict,
    spec: dict,
) -> list[SipInstallPlan]:
    """Compose topology + mapper + algorithm config into one plan per rank.

    Args:
        world_size: number of ranks; one ``SipInstallPlan`` is built per rank.
        algorithm: human-readable algorithm name stored on every plan.
        algorithm_config: the algorithm's config entry. ``topology`` is
            required; ``mapper`` defaults to ``"leader_only"``;
            ``buffer_kind``, ``n_slots``, ``slot_size`` and ``module`` are
            read for every rank.
        spec: topology spec handed to the mapper and the PE-install builder.

    Returns:
        Plans ordered by rank. Each plan gets its own ``kernel_config`` dict
        so that mutating one plan's config cannot leak into the others.

    Raises:
        KeyError: if a required ``algorithm_config`` key is missing.
        ValueError: if the mapper key/path cannot be resolved.
    """
    topo_fn = _resolve_topology(algorithm_config["topology"])
    mapper = resolve_mapper(algorithm_config.get("mapper", "leader_only"))

    # kernel_config: algorithm-specific params passed to kernel_args at launch.
    kernel_config = {
        k: v for k, v in algorithm_config.items()
        if k in {"n_elem", "reduce_op", "chunk_size"} or k.startswith("kernel_")
    }

    plans = []
    for rank in range(world_size):
        sip = rank  # identity mapping (non-identity is an open question)
        pes = mapper(spec, rank)
        pe_installs = _build_pe_installs(
            rank=rank, world_size=world_size, sip=sip,
            pes=pes, topo_fn=topo_fn, algorithm_config=algorithm_config, spec=spec,
        )
        plans.append(SipInstallPlan(
            algorithm_name=algorithm,
            sip=sip, rank=rank, world_size=world_size,
            pe_installs=pe_installs,
            buffer_kind=algorithm_config["buffer_kind"],
            n_slots=algorithm_config["n_slots"],
            slot_size=algorithm_config["slot_size"],
            kernel_module=algorithm_config["module"],
            participating_pes=tuple(pes),
            # Copy per plan: the plan is frozen, but a shared dict would let a
            # write through one rank's plan change every other rank's config.
            kernel_config=dict(kernel_config),
        ))
    return plans
|
||||||
|
```
|
||||||
|
|
||||||
|
`_build_pe_installs`는 기존 `ccl/install.py`의 neighbor 계산 로직을 재활용
|
||||||
|
(ADR-0025의 `reverse_direction` 개선 반영).
|
||||||
|
|
||||||
|
**Multi-PE 매퍼와 neighbor 생성 책임**: mapper가 SIP 내 여러 PE를 반환하는
|
||||||
|
경우 (`all_pes` 등), PE-level neighbor 그래프는 `_build_pe_installs` 내부에
|
||||||
|
형성된다. 즉 topology 모듈은 rank-level 관계만 제공하고, PE-level 연결은
|
||||||
|
builder에서 풀어낸다. 복잡한 multi-level 패턴을 쓰는 알고리즘은 이 책임
|
||||||
|
분산이 관리 부담이 될 수 있음 — 관련 논의는 ADR-0029 참고.
|
||||||
|
|
||||||
|
### D7. Epoch-based collective barrier
|
||||||
|
|
||||||
|
Cross-rank submit 동기화. 각 collective 호출은 독립 epoch. 같은 rank의
|
||||||
|
중복 join은 즉시 에러.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/runtime_api/distributed.py
|
||||||
|
@dataclass
class _EpochState:
    """Bookkeeping for a single collective epoch."""

    participants: set[int] = field(default_factory=set)  # ranks that joined
    pending: list = field(default_factory=list)  # handles awaiting the drain
    drained: bool = False  # set once the last arriver waited on every handle
    returned: int = 0  # ranks that have left submit_and_drain


class _CollectiveBarrier:
    """Epoch-based barrier for cross-rank submit synchronization.

    Contract:
    - A call joins the earliest epoch that has not been drained yet.
    - A rank may join a given epoch only once; a second join raises.
    - The last arriver (participant count reaches world_size) waits on every
      pending handle, marks the epoch drained and advances ``_next_epoch``.
      Earlier arrivers yield to their parent greenlet and re-check
      ``drained`` each time they are resumed.
    - An epoch's state is dropped once world_size ranks have returned
      (success path).
    - On failure paths leftover state is acceptable; ``reset()`` clears it.
    """

    def __init__(self, world_size: int):
        self._world_size = world_size
        self._next_epoch = 0
        self._state: dict[int, _EpochState] = {}

    def submit_and_drain(self, ctx, rank: int, submit_fn) -> None:
        """Join the current epoch, submit this rank's work, wait for the drain.

        ``submit_fn()`` must return an iterable of handles; the last arriver
        passes each one to ``ctx.wait``.

        Raises:
            RuntimeError: on a duplicate join by ``rank``, or when an early
                arriver has no parent greenlet to yield to.
        """
        epoch = self._next_epoch
        state = self._state.get(epoch)
        if state is None:
            state = self._state[epoch] = _EpochState()

        if rank in state.participants:
            raise RuntimeError(
                f"rank {rank} attempted duplicate join to epoch {epoch}"
            )
        state.participants.add(rank)
        state.pending.extend(submit_fn())

        if len(state.participants) < self._world_size:
            # Early arriver: hand control back to the scheduler until the
            # last arriver has drained the epoch.
            from greenlet import getcurrent

            me = getcurrent()
            if me.parent is None:
                raise RuntimeError("barrier requires a bound worker greenlet")
            while not state.drained:
                me.parent.switch()
        else:
            # Last arriver: drain everything submitted for this epoch.
            for handle in state.pending:
                ctx.wait(handle)
            state.drained = True
            self._next_epoch = epoch + 1

        state.returned += 1
        if state.returned >= self._world_size:
            self._state.pop(epoch, None)

    def reset(self) -> None:
        """Drop all epoch state; explicit cleanup when spawn unwinds on error."""
        self._state.clear()
        self._next_epoch = 0
|
||||||
|
```
|
||||||
|
|
||||||
|
### D8. Per-rank tensor view + validator contract
|
||||||
|
|
||||||
|
**Validator** (host-side, pre-slice, global handle 기준):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/validators.py
|
||||||
|
Validator = Callable[[TensorHandle, int, dict], None]
|
||||||
|
|
||||||
|
def single_shard_per_rank(handle, world_size, spec):
    """Validate the ring-family layout: exactly ``world_size`` shards, one per SIP.

    Args:
        handle: global tensor handle; only ``handle.shards`` and each shard's
            ``sip`` attribute are read.
        world_size: expected number of shards.
        spec: topology spec; unused here, accepted for the validator signature.

    Raises:
        ValueError: if the shard count differs from ``world_size`` or any SIP
            holds more than one shard.
    """
    from collections import Counter

    n_shards = len(handle.shards)
    if n_shards != world_size:
        raise ValueError(
            f"single_shard_per_rank: expected {world_size} shards "
            f"(one per rank), got {n_shards}"
        )
    per_sip = Counter(s.sip for s in handle.shards)
    crowded = {sip: count for sip, count in per_sip.items() if count != 1}
    if crowded:
        raise ValueError(
            "single_shard_per_rank: expected exactly one shard per SIP, "
            f"got shard counts {crowded}"
        )
|
||||||
|
|
||||||
|
def multi_pe_sip_local(handle, world_size, spec):
    """Validate a multi-PE-per-SIP layout: every SIP holds one shard per PE.

    Used by algorithms that involve all intra-SIP PEs. The expected per-SIP
    shard count is ``cube_mesh.w * cube_mesh.h * pe_per_corner * len(corners)``
    read from ``spec``.

    Both the world total and the per-SIP distribution are checked: a layout
    with the right total but shards skewed across SIPs is rejected.

    Raises:
        ValueError: if the total shard count or any SIP's shard count is wrong.
        KeyError: if ``spec`` lacks the entries read here.
    """
    from collections import Counter

    cm = spec["sip"]["cube_mesh"]
    pl = spec["cube"]["pe_layout"]
    per_sip = cm["w"] * cm["h"] * pl["pe_per_corner"] * len(pl["corners"])

    expected_total = world_size * per_sip
    n_shards = len(handle.shards)
    if n_shards != expected_total:
        raise ValueError(
            f"multi_pe_sip_local: expected {expected_total} shards "
            f"({world_size} SIPs x {per_sip} PEs), got {n_shards}"
        )

    # The total alone cannot detect shards piled onto the wrong SIP.
    counts = Counter(s.sip for s in handle.shards)
    uneven = {sip: count for sip, count in counts.items() if count != per_sip}
    if uneven:
        raise ValueError(
            f"multi_pe_sip_local: expected {per_sip} shards on every SIP, "
            f"got shard counts {uneven}"
        )
|
||||||
|
|
||||||
|
VALIDATOR_REGISTRY = {...}
|
||||||
|
def resolve_validator(key_or_path): ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Validator는 world 전체의 shard layout 불변량을 본다. Per-rank view는
|
||||||
|
backend가 validator 호출 **후** `_tensor_slice_for_sip`로 생성.
|
||||||
|
|
||||||
|
**Per-rank tensor view** — SIP-local slice:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _tensor_slice_for_sip(handle, sip) -> TensorArg:
    """Build the SIP-local view of ``handle``: only the shards placed on ``sip``.

    Raises:
        RuntimeError: if ``handle`` has no shard on ``sip``.
    """
    local_shards = [shard for shard in handle.shards if shard.sip == sip]
    if not local_shards:
        raise RuntimeError(f"tensor has no shards on SIP {sip}")

    # Deterministic ordering contract: (cube, pe, offset_bytes) ascending.
    # Multi-PE mappers (e.g. hierarchical) rely on this ordering to align
    # per-PE tensor arg construction with participating_pes enumeration.
    sip_shards = sorted(
        local_shards, key=lambda s: (s.cube, s.pe, s.offset_bytes)
    )

    min_offset = min(s.offset_bytes for s in sip_shards)
    # A falsy va_base (None or 0) yields a local base of 0; otherwise the
    # base is shifted by the smallest shard offset on this SIP.
    if handle.va_base:
        local_va_base = handle.va_base + min_offset
    else:
        local_va_base = 0

    return TensorArg(
        shards=tuple(TensorArgShard(...) for s in sip_shards),
        va_base=local_va_base,
    )
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ordering invariant**: slice의 shard는 `(cube, pe, offset_bytes)` 오름차순.
|
||||||
|
Backend가 `participating_pes`를 iterate하며 `_tensor_arg_for_pe(view, cube, pe)`를
|
||||||
|
구성할 때, 결정론적 ordering을 전제할 수 있다. 특히 `all_pes` mapper +
|
||||||
|
hierarchical 알고리즘이 per-PE slice 조합을 순서 의존적으로 해석하는 경우에
|
||||||
|
중요.
|
||||||
|
|
||||||
|
### D9. Greenlet-local rank registry (+ debug warning)
|
||||||
|
|
||||||
|
```python
|
||||||
|
class DistributedContext:
    """Distributed state with a greenlet-local rank registry.

    Only the rank-registry part is shown in this excerpt;
    ``_ensure_initialized`` is defined outside it.
    """

    def __init__(self):
        self._backend = None
        # greenlet object -> rank bound to it by the launcher
        self._rank_by_greenlet: dict = {}

    def _bind_rank(self, g, rank: int) -> None:
        """Associate greenlet ``g`` with ``rank`` (coerced to ``int``)."""
        self._rank_by_greenlet[g] = int(rank)

    def get_rank(self) -> int:
        """Return the rank bound to the calling greenlet, or 0 if unbound.

        The unbound case is the single-driver fallback. When the
        ``KERNBENCH_DEBUG`` environment variable is set to a non-empty value,
        a warning is emitted for it, attributed to the caller of
        ``get_rank`` so the offending call site is visible.
        """
        self._ensure_initialized()
        from greenlet import getcurrent

        g = getcurrent()
        if g not in self._rank_by_greenlet:
            if os.environ.get("KERNBENCH_DEBUG"):
                warnings.warn(
                    "get_rank() called outside a bound greenlet — returning 0. "
                    "Likely a bug unless running single-driver.",
                    stacklevel=2,  # point at the caller, not at this method
                )
            return 0
        return int(self._rank_by_greenlet[g])
|
||||||
|
```
|
||||||
|
|
||||||
|
### D10. `torch.ahbm.set_device(rank)` — SIP 바인딩
|
||||||
|
|
||||||
|
KernBench 백엔드 이름은 `ahbm` (ADR-0023 D10). Real PyTorch는
|
||||||
|
`torch.cuda.set_device(r)`이지만 우리는 CUDA가 아니므로 honestly-named
|
||||||
|
namespace를 사용한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class _AhbmNamespace:
    """``torch.ahbm`` — binds a SIP device index to the calling greenlet.

    Mirrors the real-PyTorch idiom ``torch.cuda.set_device(rank)``. Because
    KernBench's backend is 'ahbm' rather than CUDA, the equivalent API lives
    under ``torch.ahbm`` instead of posing as a CUDA runtime.
    """

    def __init__(self):
        # greenlet object -> device index chosen via set_device
        self._device_by_greenlet: dict = {}

    def set_device(self, device: int) -> None:
        """Record ``device`` (coerced to ``int``) for the current greenlet."""
        from greenlet import getcurrent

        index = int(device)
        self._device_by_greenlet[getcurrent()] = index

    def current_device(self) -> int | None:
        """Return the current greenlet's device, or ``None`` if never set."""
        from greenlet import getcurrent

        caller = getcurrent()
        return self._device_by_greenlet.get(caller)
|
||||||
|
|
||||||
|
# Attached to RuntimeContext as `self.ahbm = _AhbmNamespace()`.
|
||||||
|
# Bench code: `torch.ahbm.set_device(rank)` mirrors `torch.cuda.set_device`.
|
||||||
|
```
|
||||||
|
|
||||||
|
**PyTorch 2.x style 병행 지원**: 최신 PyTorch는 device-agnostic한
|
||||||
|
`torch.accelerator` 네임스페이스를 지향 (`torch.accelerator.set_device_index(r)`,
|
||||||
|
`torch.accelerator.current_device_index()`). Device vendor에 종속되지 않는
|
||||||
|
코드를 쓰려는 사용자를 위해 KernBench도 이 표면을 병행 지원한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
class _AcceleratorNamespace:
    """``torch.accelerator`` — device-agnostic facade (PyTorch 2.x style).

    Forwards to the ``torch.ahbm`` namespace it wraps, for bench code that
    prefers the device-neutral spelling:
        torch.accelerator.set_device_index(rank)
        torch.accelerator.current_device_index()
    """

    def __init__(self, ahbm: _AhbmNamespace):
        self._ahbm = ahbm

    def set_device_index(self, device: int) -> None:
        """Forward to the wrapped namespace's ``set_device``."""
        backend = self._ahbm
        backend.set_device(device)

    def current_device_index(self) -> int | None:
        """Return whatever the wrapped namespace's ``current_device`` reports."""
        backend = self._ahbm
        return backend.current_device()
|
||||||
|
|
||||||
|
# RuntimeContext
|
||||||
|
self.ahbm = _AhbmNamespace()
|
||||||
|
self.accelerator = _AcceleratorNamespace(self.ahbm) # alias
|
||||||
|
```
|
||||||
|
|
||||||
|
Bench 작성자는 다음 중 하나를 선택 — 둘 다 내부적으로 같은 per-greenlet device 레지스트리(`_AhbmNamespace._device_by_greenlet`)를 공유:
|
||||||
|
|
||||||
|
```python
|
||||||
|
torch.ahbm.set_device(rank) # KernBench-native, explicit backend
|
||||||
|
torch.accelerator.set_device_index(rank) # PyTorch 2.x device-agnostic
|
||||||
|
```
|
||||||
|
|
||||||
|
### D11. Tensor placement = structural (sip, cube, pe) 좌표
|
||||||
|
|
||||||
|
`resolve_dp_policy`가 `target_sip`을 직접 받아 구조적 좌표로 placement 생성.
|
||||||
|
세부는 ADR-0026.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# RuntimeContext._create_tensor
|
||||||
|
current_sip = self.ahbm.current_device() # (D10 naming)
|
||||||
|
if current_sip is None:
|
||||||
|
current_sip = 0 # single-driver fallback (D9와 일관)
|
||||||
|
placement = resolve_dp_policy(
|
||||||
|
dp, shape=shape_2d, itemsize=itemsize,
|
||||||
|
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
||||||
|
target_sip=current_sip,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Post-hoc `pe_index` shifting 제거 — ShardSpec이 `(sip, cube, pe)` 구조적
|
||||||
|
좌표 보유.
|
||||||
|
|
||||||
|
### D12. `torch.multiprocessing.spawn`-compat surface
|
||||||
|
|
||||||
|
Bench 작성자 표면은 real PyTorch `mp.spawn`과 동일:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/runtime_api/multiprocessing.py (new)
|
||||||
|
def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method="spawn"):
|
||||||
|
"""Drop-in for torch.multiprocessing.spawn.
|
||||||
|
Internal: greenlet fan-out + epoch-barrier sync + exception propagation.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
# torch namespace에 부착
|
||||||
|
torch.multiprocessing = SimpleNamespace(spawn=spawn)
|
||||||
|
```
|
||||||
|
|
||||||
|
Bench:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch.multiprocessing as mp
|
||||||
|
mp.spawn(worker, nprocs=world_size, args=(world_size, torch))
|
||||||
|
```
|
||||||
|
|
||||||
|
### D13. Scheduler + exception handling
|
||||||
|
|
||||||
|
```python
|
||||||
|
def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method="spawn"):
    """Run ``fn(rank, *args)`` for ``nprocs`` ranks as cooperatively scheduled greenlets.

    Scheduling is deterministic round-robin over insertion order
    (rank 0, 1, ..., N-1). ``join``, ``daemon`` and ``start_method`` are
    accepted for signature parity with ``torch.multiprocessing.spawn`` and
    are not used in this body.

    Raises:
        SpawnException: when any worker raises. It carries the per-rank
            errors collected so far and chains the exception that reached
            the scheduler. Before it is raised, every surviving greenlet is
            killed and the backend's epoch barrier (if any) is reset.
    """
    dist = torch.distributed
    gs: list[greenlet] = []
    errors: dict[int, Exception] = {}

    for rank in range(nprocs):
        # r=rank binds the loop value now; a plain closure would see the
        # final rank in every worker.
        def _entry(r=rank):
            try:
                fn(r, *args)
            except Exception as e:
                errors[r] = e
                raise

        g = greenlet(_entry)
        dist._bind_rank(g, rank)
        gs.append(g)

    try:
        while True:
            alive = [g for g in gs if not g.dead]
            if not alive:
                break
            for g in alive:
                # A worker may have died earlier in this same pass.
                if not g.dead:
                    g.switch()
    except Exception as outer:
        for other in gs:
            if not other.dead:
                try:
                    # throw() with no argument raises GreenletExit, which
                    # kills the greenlet without propagating to the parent.
                    # SystemExit would propagate here as a BaseException,
                    # escape the handler below, and skip the barrier reset
                    # and the SpawnException.
                    other.throw()
                except Exception:
                    # Best effort: an error raised while a survivor unwinds
                    # must not mask the original failure.
                    pass
        # Explicit cleanup of epoch barrier state.
        backend = getattr(dist, "_backend", None)
        if backend is not None and hasattr(backend, "_barrier"):
            backend._barrier.reset()
        raise SpawnException(errors) from outer
|
||||||
|
```
|
||||||
|
|
||||||
|
**Scheduler contract**:
|
||||||
|
- Deterministic round-robin over insertion order (rank 0, 1, ..., N-1).
|
||||||
|
- 동기화 지점은 epoch barrier (D7)만. Scheduler 순서에 의존하는 correctness 없음.
|
||||||
|
- 예외 발생 시 다른 greenlet 강제 종료 + `SpawnException` 전파.
|
||||||
|
|
||||||
|
**Starvation guideline**:
|
||||||
|
- 일반적으로 collective barrier가 workers를 동기화. 큰 편차 없음.
|
||||||
|
- 극단적 non-collective 루프 대비 cooperative yield 제공:
|
||||||
|
`torch.distributed.cooperative_yield()`.
|
||||||
|
|
||||||
|
### D14. Backward compatibility
|
||||||
|
|
||||||
|
1. **Single-driver 호출**: `get_rank()` 0 반환 (D9).
|
||||||
|
2. **`ccl.yaml` world_size override**: D1 fallback 우회 — legacy "rank = PE"
|
||||||
|
테스트 경로로 사용 가능.
|
||||||
|
3. **`DPPolicy.sip="column_wise"` 명시**: ADR-0026 scope.
|
||||||
|
4. **`install_ipcq()` compatibility wrapper**:
|
||||||
|
|
||||||
|
기존 `ccl/install.py`의 `install_ipcq()` API는 곧바로 제거하지 않는다.
|
||||||
|
Thin compatibility wrapper로 남겨 기존 직접 호출자가 점진적으로 migration할
|
||||||
|
수 있게 한다.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/ccl/install.py (after this ADR)
|
||||||
|
def install_ipcq(engine, spec, merged, *, algo_module=None, rank_to_pe=None):
    """DEPRECATED: legacy host-side PE installer.

    Thin wrapper that builds install plans with ``build_install_plans`` and
    submits one engine-routed ``IpcqInitMsg`` per PE, then waits for all of
    them. Prefer ``dist.init_process_group()``.

    ``algo_module`` and ``rank_to_pe`` are accepted for call compatibility
    and are not used by this body.

    Returns:
        ``{"world_size": ..., "plans": ...}`` where ``world_size`` is
        ``merged["world_size"]`` (default 1).
    """
    from kernbench.ccl.install_plan import build_install_plans
    import warnings

    warnings.warn(
        "install_ipcq() is deprecated; use dist.init_process_group()",
        DeprecationWarning, stacklevel=2,
    )

    plans = build_install_plans(
        world_size=merged.get("world_size", 1),
        algorithm=merged["algorithm"],
        algorithm_config=merged,
        spec=spec,
    )

    # Submit every per-PE install first, then wait on all of them.
    handles = [
        engine.submit(IpcqInitMsg(
            target_sips=(plan.sip,),
            target_cubes=(pe_install.cube,),
            target_pe=pe_install.pe,
            entries=pe_install.neighbors,
            buffer_kind=plan.buffer_kind,
            n_slots=plan.n_slots,
            slot_size=plan.slot_size,
        ))
        for plan in plans
        for pe_install in plan.pe_installs
    ]
    for pending in handles:
        engine.wait(pending)

    return {"world_size": merged.get("world_size", 1), "plans": plans}
|
||||||
|
```
|
||||||
|
|
||||||
|
Migration 스케줄:
|
||||||
|
- Phase 1: wrapper로 유지 + DeprecationWarning
|
||||||
|
- Phase 2: 직접 호출자 grep-audit → 각각 `dist.init_process_group()` 또는
|
||||||
|
`build_install_plans()` 직접 사용으로 이관
|
||||||
|
- Phase 3: wrapper 제거 (별도 cleanup ADR 또는 PR)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0023** (IPCQ): `IpcqInitMsg` 메시지 타입과 PE_IPCQ 핸들링을 그대로
|
||||||
|
활용. Engine-routed submit으로 전환하는 것이 유일한 변경.
|
||||||
|
- **ADR-0025** (IPCQ direction fix): `_build_pe_installs`의 neighbor 계산이
|
||||||
|
2-rank ring 등에서 정확히 동작하려면 필요.
|
||||||
|
- **ADR-0003 / 0016** (IO_CPU): IO_CPU는 기존 transit 역할 그대로. 본 ADR에서
|
||||||
|
IO_CPU 역할 변경 없음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **IPCQ protocol 수정**: ADR-0023 유지.
|
||||||
|
- **DPPolicy 필드 정리**: ADR-0026.
|
||||||
|
- **Megatron-style TP**: ADR-0027.
|
||||||
|
- **Multi-node (프로세스 간)**: 단일 프로세스.
|
||||||
|
- **IO_CPU SIP control-plane 단일 endpoint 원칙 채택**: 본 ADR 범위 밖. 현재
|
||||||
|
KernBench에 이 원칙이 없고, 도입은 별도 ADR.
|
||||||
|
- **Hierarchical all-reduce 알고리즘 설계**: ADR-0029. 본 ADR은 그 알고리즘이
|
||||||
|
쓸 framework 인프라 (`all_pes` mapper, `multi_pe_sip_local` validator,
|
||||||
|
registry 확장점)만 제공.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🔴 Critical — 구현 blocker 가능성 (integration 전 반드시 검증)
|
||||||
|
|
||||||
|
- **`IpcqInitMsg`의 engine routing — primary implementation risk**: 현재
|
||||||
|
sideband만 쓰여서 engine routing path가 실사용 검증되지 않은 상태. **본
|
||||||
|
ADR 전체가 "engine routing이 동작한다"는 가정 위에 서 있다**. 이것이
|
||||||
|
실제로 안 되면 D2, D14, T3 등이 전부 영향 받음. 반드시 **ADR 구현 착수
|
||||||
|
전 스파이크 검증**:
|
||||||
|
- `engine.submit(IpcqInitMsg(target_sips=..., target_cubes=..., target_pe=...))`
|
||||||
|
가 PE_IPCQ로 정확히 배달되는지 (기존 `MmuMapMsg` / `MemoryWriteMsg` 라우팅
|
||||||
|
패턴과 비교)
|
||||||
|
- 미지원 시 minor hook: engine의 message-type → component-kind 매핑 테이블에
|
||||||
|
`IpcqInitMsg → "pe_ipcq"` 등록 (localized change, topology builder /
|
||||||
|
message schema 영향 없음)
|
||||||
|
- 결과에 따라 D2 채택 여부가 달라질 수 있음 — 만약 routing 불가 시 sideband
|
||||||
|
path 유지로 fallback 후 본 ADR 범위 재조정
|
||||||
|
|
||||||
|
- **Engine-routed install vs sideband equivalence** (D2 검증점 1-5): T3의
|
||||||
|
equivalence test가 실제 동작하는지 스파이크. 특히 ordering independence와
|
||||||
|
idempotency는 기존 테스트에 없는 속성이라 신규 검증 필요.
|
||||||
|
|
||||||
|
- **`install_ipcq()` 직접 호출자 audit** (구현 전 필수): deprecated wrapper
|
||||||
|
전략은 적절하지만 실제 migration 리스크는 호출자 목록에 따라 다름. 착수 전
|
||||||
|
grep audit:
|
||||||
|
- Pattern: `install_ipcq(` (cwd 전체)
|
||||||
|
- Scope: `src/`, `tests/`, `benches/`, `scripts/`, `src/kernbench/cli/`
|
||||||
|
- 각 호출자의 예상 migration path (→ `dist.init_process_group` vs
|
||||||
|
`build_install_plans` 직접)를 정리한 후 wrapper 도입
|
||||||
|
|
||||||
|
### 🟡 Nice-to-have — scope 경계 관련
|
||||||
|
|
||||||
|
- **Install timing 허용치**: SimPy 시간 상 install이 몇 ns~us 소모. 기존
|
||||||
|
sideband는 0ns. 기존 테스트가 t=0 시작을 전제로 하는지 확인 (audit 결과에
|
||||||
|
따라 테스트 교정 필요).
|
||||||
|
|
||||||
|
- **`IpcqInitMsg` 배치 가능성**: MmuMapMsg처럼 `target_pe="all"` 브로드캐스트
|
||||||
|
는 IPCQ에서는 부적합 (PE마다 neighbor가 다름). 현재는 per-PE 개별 submit.
|
||||||
|
Per-PE payload를 담는 batched IpcqInitMsg 타입은 future optimization.
|
||||||
|
|
||||||
|
- **`_rank_to_sip` 매핑**: 현재 identity. Non-trivial mapping 요구 시 별도.
|
||||||
|
|
||||||
|
- **Cooperative yield API 위치**: `torch.distributed.cooperative_yield()`로
|
||||||
|
노출 예정. 실제 필요성은 Phase 2 이후 벤치 추가 시 판단.
|
||||||
|
|
||||||
|
(PE-level topology 일원화 관련 중장기 방향은 **ADR-0029** 참고 — 복잡한
|
||||||
|
multi-level 알고리즘이 driving force가 되는 framework 진화 방향.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. Launcher infrastructure
|
||||||
|
|
||||||
|
`tests/test_ccl_ddp_launcher.py`:
|
||||||
|
- `test_world_size_equals_sip_count` — D1
|
||||||
|
- `test_ahbm_set_device_binds_tensor_to_single_sip` — D10/D11
|
||||||
|
- `test_get_rank_is_greenlet_local` — D9
|
||||||
|
- `test_run_spawns_one_worker_per_rank` — D12/D13
|
||||||
|
- `test_get_rank_debug_warning` — D9 warning path
|
||||||
|
|
||||||
|
### T2. Install plan builder
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (new):
|
||||||
|
- `build_install_plans` — ring_1d × leader_only 조합 (단일 PE per rank)
|
||||||
|
- `build_install_plans` — ring_1d × all_pes 조합 (multi-PE per rank; mapper
|
||||||
|
framework 동작 확인, 알고리즘-무관)
|
||||||
|
- Mapper / validator registry resolution (built-in key vs import path vs
|
||||||
|
unknown)
|
||||||
|
- Import path fallback (`"pkg.mod.fn"` 형식) 동작 검증
|
||||||
|
|
||||||
|
### T3. Engine-routed IpcqInitMsg (equivalence — 핵심 검증)
|
||||||
|
|
||||||
|
`tests/test_ipcq_init_routing.py` (new):
|
||||||
|
- **Routing**: `engine.submit(IpcqInitMsg)` → 지정 PE_IPCQ가 실제 설치 수행
|
||||||
|
- **Equivalence**: 동일한 IpcqInitMsg를 (a) sideband `_install_neighbors`
|
||||||
|
직접 호출, (b) engine.submit 두 경로로 보낸 뒤 PE_IPCQ 최종 state
|
||||||
|
(`_queue_pairs`, `_installed` 등) 동일성 비교
|
||||||
|
- **Ordering independence**: 서로 다른 PE의 install msg를 engine 큐에 임의
|
||||||
|
순서로 넣어도 최종 state가 동일
|
||||||
|
- **Idempotency (duplicate install)**: 동일 PE에 두 번 install msg → 두
|
||||||
|
번째는 에러 raise (policy: explicit error; D2 검증점 4 참고)
|
||||||
|
- **Multi-PE 병렬 install**: per-PE submit이 interference 없이 완료
|
||||||
|
- **Install 후 send 성공**: 설치 직후 `IpcqSendCmd` 실행해서 neighbor table
|
||||||
|
state가 실제로 유효한지 확인
|
||||||
|
|
||||||
|
### T4. Barrier correctness
|
||||||
|
|
||||||
|
`tests/test_collective_barrier.py` (new):
|
||||||
|
- Single collective 정상
|
||||||
|
- 다중 collective 연속 호출 (epoch 격리)
|
||||||
|
- 동일 rank의 duplicate join → RuntimeError
|
||||||
|
- Rank 1이 all_reduce 전 종료 → SpawnException + barrier.reset()
|
||||||
|
- Conditional branch 시 모든 rank 도달하면 정상
|
||||||
|
|
||||||
|
### T5. E2E
|
||||||
|
|
||||||
|
`tests/test_ccl_allreduce_matrix.py`:
|
||||||
|
- `ring_tcm` / `ring_hbm` / `ring_sram` @ ws=SIP_count
|
||||||
|
|
||||||
|
### T6. 회귀
|
||||||
|
|
||||||
|
기존 `test_ccl_framework`, `test_ccl_install`, `test_ccl_topologies`,
|
||||||
|
`test_ccl_mock_runtime`, `test_pe_ipcq`, `test_ipcq_e2e`, 기타 non-CCL
|
||||||
|
모두 통과.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **새 message 타입 0개**: 기존 `IpcqInitMsg` + `KernelLaunchMsg`만으로 구현.
|
||||||
|
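- **IO_CPU 변경 없음, engine은 기존 routing 재사용**: 단, engine이 `IpcqInitMsg`를 PE_IPCQ로 라우팅하지 못하는 경우에는 minor hook이 필요할 수 있음 (Negative / Open questions 참고).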
|
||||||
|
- **Sideband install convention 제거**: MmuMapMsg 등과 동일 패턴으로 일원화.
|
||||||
|
- **Plan state stale 문제 소멸**: Plan은 host 단일 소유.
|
||||||
|
- **Bench = real PyTorch DDP** (공개 API 관점).
|
||||||
|
- **Algorithm ABI 경량**: `kernel` + `kernel_args`만 필수.
|
||||||
|
- **Epoch-based barrier**: interleaved collective 안전.
|
||||||
|
- **Control/data plane 분리**: data plane(PE_IPCQ)은 ADR-0023 유지, control
|
||||||
|
plane은 host-driven.
|
||||||
|
- 장기 확장성: Megatron TP, DTensor 기반.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- 신규 모듈: `install_plan.py`, `mappers.py`, `validators.py`,
|
||||||
|
`multiprocessing.py`.
|
||||||
|
- Engine이 `IpcqInitMsg`를 엔진-path로 라우팅할 수 있는지 구현 시 확인 필요
|
||||||
|
(minor hook 가능성).
|
||||||
|
- Install이 SimPy 시간을 소모 (positive로도 볼 수 있으나, 기존 sideband 시점
|
||||||
|
0ns 전제인 테스트가 있으면 교정 필요).
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- IPCQ PE-level protocol (ADR-0023) 불변.
|
||||||
|
- `DPPolicy` 필드 변경은 ADR-0026.
|
||||||
|
- IO_CPU 역할 불변 (기존 transit 그대로).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/runtime_api/distributed.py` | D1/D2/D7/D9: world_size fallback, rank_to_sip, plan 소유, engine-routed install/launch, epoch barrier |
|
||||||
|
| `src/kernbench/runtime_api/context.py` | D10/D11: `_AhbmNamespace`, `ctx.ahbm`, `_create_tensor`가 `target_sip` 전달 |
|
||||||
|
| `src/kernbench/runtime_api/multiprocessing.py` (new) | D12/D13: `spawn` + scheduler + exception |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` (new) | D6: `build_install_plans`, `SipInstallPlan`, `PeInstallSpec`, `NeighborTableEntry` |
|
||||||
|
| `src/kernbench/ccl/mappers.py` (new) | D5: `leader_only`, `all_pes`, registry + resolver |
|
||||||
|
| `src/kernbench/ccl/validators.py` (new) | D5: validator registry + resolver |
|
||||||
|
| `src/kernbench/ccl/install.py` | Thin deprecated compat wrapper (D14) |
|
||||||
|
| `src/kernbench/ccl/algorithms/ring_allreduce.py` | D4: `kernel` + `kernel_args` 유지 (큰 변화 없음) |
|
||||||
|
| `src/kernbench/ccl/algorithms/mesh_allreduce.py` | D4 동일 |
|
||||||
|
| `src/kernbench/ccl/algorithms/tree_allreduce.py` | D4 동일 |
|
||||||
|
| `ccl.yaml` | 각 알고리즘에 `mapper` / `validator` 선언 추가 |
|
||||||
|
| `src/kernbench/sim_engine/engine.py` | (If needed) `IpcqInitMsg` → PE_IPCQ 라우팅 확인 hook |
|
||||||
|
| `benches/ccl_allreduce.py` | 새 launcher 기반 rewrite |
|
||||||
|
| `tests/test_ccl_ddp_launcher.py` (new) | T1 |
|
||||||
|
| `tests/test_ccl_install_plan.py` (new) | T2 |
|
||||||
|
| `tests/test_ipcq_init_routing.py` (new) | T3 |
|
||||||
|
| `tests/test_collective_barrier.py` (new) | T4 |
|
||||||
|
| `tests/test_ccl_allreduce_matrix.py` | T5: ws=SIP_count 단순화 |
|
||||||
@@ -0,0 +1,365 @@
|
|||||||
|
# ADR-0025: IPCQ Direction Addressing — address-based matching
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed (Revision 2 — Address-based matching; peer_direction field dropped)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
ADR-0023의 IPCQ protocol에서 **"어느 direction pair를 통한 전송인가"의 식별**을
|
||||||
|
topology / dict-order에 의존하지 않고 **주소 기반**으로 일관되게 한다.
|
||||||
|
2-rank bidirectional ring (또는 여러 direction이 동일 peer를 가리키는
|
||||||
|
topology 일반)에서 정확히 동작하도록 한다.
|
||||||
|
|
||||||
|
### 현재 상태 (ADR-0023 D9 구현)
|
||||||
|
|
||||||
|
`src/kernbench/components/builtin/pe_ipcq.py` — `_handle_meta_arrival`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Current (pre-ADR-0025) behaviour: the receive direction is chosen by sender coordinates alone.
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
|
||||||
|
token = msg.token
|
||||||
|
sender_key = (token.src_sip, token.src_cube, token.src_pe) # sender coordinates only; carries no direction
|
||||||
|
for d, qp in self._queue_pairs.items(): # directions are scanned in dict order
|
||||||
|
p = qp["peer"]
|
||||||
|
if (p.sip, p.cube, p.pe) == sender_key: # ambiguous when several directions share one peer
|
||||||
|
qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
|
||||||
|
# ... wake recv waiters ...
|
||||||
|
return # first matching direction wins; later matches are never updated
|
||||||
|
```
|
||||||
|
|
||||||
|
`_credit_worker`도 동일한 "sender-coord-first-match" 패턴.
|
||||||
|
|
||||||
|
`src/kernbench/ccl/install.py` — `reverse_direction`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Current (pre-ADR-0025) behaviour: returns the first of the peer's directions that targets my_rank.
def reverse_direction(my_rank: int, peer_rank: int) -> str | None:
|
||||||
|
for d, target in neighbor_table[peer_rank].items(): # dict iteration order decides ties
|
||||||
|
if target == my_rank:
|
||||||
|
return d # first match wins, even when several directions point back at my_rank
|
||||||
|
return None # peer has no direction pointing back at my_rank
|
||||||
|
```
|
||||||
|
|
||||||
|
### 드러난 버그 — 2-rank bidirectional ring
|
||||||
|
|
||||||
|
`ring_1d(rank, world_size=2)` → `{"E": 1, "W": 1}` (rank 0). 양쪽 방향이 같은 peer.
|
||||||
|
|
||||||
|
**버그 1 (install)**:
|
||||||
|
- `reverse_direction(0, 1)` → dict order로 "E" 반환 (틀림, "W"가 맞음 — opposite
|
||||||
|
direction convention)
|
||||||
|
- rank 0의 E entry가 `peer.rx_base_pa = rx_base(sip1, cube0, pe0, d="E")`로 설정
|
||||||
|
- tl.send(E) → data가 sip1의 E-rx buffer로 landing (should be W-rx)
|
||||||
|
|
||||||
|
**버그 2 (runtime)**:
|
||||||
|
- 설령 install이 올바른 주소로 설정해도, receiver의 `_handle_meta_arrival`이
|
||||||
|
sender 좌표만으로 direction 매칭 → 첫 direction (E) 승
|
||||||
|
- peer_head_cache[E] 증가, peer_head_cache[W]는 불변
|
||||||
|
- Kernel의 tl.recv(W)는 peer_head_cache[W] 대기 → 영원히 블록 → IpcqDeadlock
|
||||||
|
|
||||||
|
### 근본 원인
|
||||||
|
|
||||||
|
두 축에서 동일 문제:
|
||||||
|
1. **Install-time pairing**: "내 direction과 peer의 어느 direction이 짝인가"
|
||||||
|
결정이 dict-iteration-order에 의존 → 여러 direction이 같은 peer를 가리킬 때
|
||||||
|
fragile
|
||||||
|
2. **Runtime identification**: "어느 qp를 업데이트해야 하는가" 결정이 sender
|
||||||
|
좌표만으로 이루어짐 → direction 중복 시 ambiguous
|
||||||
|
|
||||||
|
### 해결 방향 — address-based matching
|
||||||
|
|
||||||
|
각 PE의 rx buffer는 **direction별로 고유한 주소 range**에 위치 (rx_base_pa +
|
||||||
|
direction_idx × bytes_per_direction). 따라서:
|
||||||
|
|
||||||
|
- **Runtime**: sender coord 대신 **dst_addr 범위**로 매칭 → unambiguous
|
||||||
|
- **Install**: opposite-direction 우선 선택 heuristic (ring / mesh의 자연스러운
|
||||||
|
대칭성)
|
||||||
|
- `peer_direction` 같은 이중 메타데이터 불필요 — **주소가 single source of
|
||||||
|
truth**
|
||||||
|
|
||||||
|
이 설계는 **PhysAddr 전환 (ADR-0030)과 독립적**으로 작동. 현재 synthetic
|
||||||
|
주소든 PhysAddr든 direction별 range 유일성만 지켜지면 동일하게 적용 가능.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Install — `reverse_direction` opposite-preference
|
||||||
|
|
||||||
|
`src/kernbench/ccl/install.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_OPPOSITE_DIR = {"E": "W", "W": "E", "N": "S", "S": "N"}


def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
    """Return the peer-side direction that pairs with ``my_dir`` toward ``peer_rank``.

    The OPPOSITE direction (E↔W, N↔S) is preferred whenever the peer has it
    pointing back at ``my_rank``. This matters in a 2-rank bidirectional
    ring, where E and W on one side both target the same peer: plain
    first-match iteration would route data into the wrong rx slot.

    If no opposite direction applies (e.g. tree_binary's parent/child), the
    first of the peer's directions that points back is used. ``None`` means
    the peer has no direction pointing back at ``my_rank``.
    """
    peer_dirs = neighbor_table[peer_rank]

    preferred = _OPPOSITE_DIR.get(my_dir)
    if preferred is not None and peer_dirs.get(preferred) == my_rank:
        return preferred

    # Fallback: first direction (dict order) on the peer that targets us.
    return next(
        (name for name, target in peer_dirs.items() if target == my_rank),
        None,
    )
|
||||||
|
```
|
||||||
|
|
||||||
|
호출부:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for d, peer_rank in nbrs.items():
|
||||||
|
peer_dir = reverse_direction(r, peer_rank, d) # my_dir 전달
|
||||||
|
if peer_dir is None:
|
||||||
|
continue
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### D2. Runtime — `_handle_meta_arrival` dst_addr 매칭
|
||||||
|
|
||||||
|
`src/kernbench/components/builtin/pe_ipcq.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
|
||||||
|
"""Match incoming token to the receiver-side direction by dst_addr range.
|
||||||
|
|
||||||
|
Each direction has a unique rx buffer address range
|
||||||
|
(my_rx_base_pa + n_slots * slot_size). The token's dst_addr (set by
|
||||||
|
the sender's IPCQ when computing peer's slot address) falls within
|
||||||
|
exactly one such range. This address-based matching is unambiguous
|
||||||
|
even when multiple directions have the same peer (2-rank ring).
|
||||||
|
"""
|
||||||
|
token = msg.token
|
||||||
|
dst_addr = token.dst_addr
|
||||||
|
for d, qp in self._queue_pairs.items():
|
||||||
|
base = qp["my_rx_base_pa"]
|
||||||
|
size = qp["n_slots"] * qp["slot_size"]
|
||||||
|
if base <= dst_addr < base + size:
|
||||||
|
qp["peer_head_cache"] = max(qp["peer_head_cache"],
|
||||||
|
token.sender_seq + 1)
|
||||||
|
self._arrived_tokens.setdefault(d, []).append(token)
|
||||||
|
waiters = self._recv_waiters.get(d, [])
|
||||||
|
self._recv_waiters[d] = []
|
||||||
|
for ev in waiters:
|
||||||
|
if not ev.triggered:
|
||||||
|
ev.succeed()
|
||||||
|
any_waiters = self._any_recv_waiters
|
||||||
|
self._any_recv_waiters = []
|
||||||
|
for ev in any_waiters:
|
||||||
|
if not ev.triggered:
|
||||||
|
ev.succeed()
|
||||||
|
return
|
||||||
|
# Unknown dst_addr — diagnostic log (should not happen under correct install)
|
||||||
|
```
|
||||||
|
|
||||||
|
Sender 좌표 검사는 **제거**. `dst_addr`가 이미 direction을 결정.
|
||||||
|
|
||||||
|
### D3. Credit — `dst_rx_base_pa` 필드 추가
|
||||||
|
|
||||||
|
`src/kernbench/common/ipcq_types.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class IpcqCreditMetadata:
|
||||||
|
consumer_seq: int
|
||||||
|
dst_rx_base_pa: int # NEW: 원 sender의 peer.rx_base_pa와 매칭용
|
||||||
|
# 기존 필드 (diagnostic / log 용도로 유지)
|
||||||
|
src_sip: int
|
||||||
|
src_cube: int
|
||||||
|
src_pe: int
|
||||||
|
src_direction: str
|
||||||
|
```
|
||||||
|
|
||||||
|
Credit 생성 시 (`_delayed_credit_send`): 자기 direction의 `my_rx_base_pa`를
|
||||||
|
`dst_rx_base_pa`로 실어 보냄 (이 값은 상대방이 송신 시 사용한 `peer.rx_base_pa`와 동일).
|
||||||
|
|
||||||
|
수신 측 (`_credit_worker`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _credit_worker(self, env):
|
||||||
|
while True:
|
||||||
|
credit = yield self._credit_inbox.get()
|
||||||
|
for d, qp in self._queue_pairs.items():
|
||||||
|
# peer의 rx_base_pa와 credit의 dst_rx_base_pa가 일치하는 qp 찾기
|
||||||
|
if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
|
||||||
|
qp["peer_tail_cache"] = max(qp["peer_tail_cache"],
|
||||||
|
credit.consumer_seq)
|
||||||
|
waiters = self._send_waiters.get(d, [])
|
||||||
|
self._send_waiters[d] = []
|
||||||
|
for ev in waiters:
|
||||||
|
if not ev.triggered:
|
||||||
|
ev.succeed()
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
Sender 좌표 검사 제거. `dst_rx_base_pa` 매칭으로 unambiguous.
|
||||||
|
|
||||||
|
### D4. `IpcqInitEntry`에 `peer_direction` 필드를 **추가하지 않음**
|
||||||
|
|
||||||
|
ADR-0025 rev 1에서 제안했던 `IpcqInitEntry.peer_direction`은 **불필요**.
|
||||||
|
이유:
|
||||||
|
- Meta arrival은 dst_addr로 매칭 (D2)
|
||||||
|
- Credit은 dst_rx_base_pa로 매칭 (D3)
|
||||||
|
- qp에 peer_direction 저장 필요 없음
|
||||||
|
- Install은 rx_base_pa 계산 시 내부적으로만 peer_dir 사용 (`reverse_direction`)
|
||||||
|
|
||||||
|
IpcqInitEntry schema 변경 없음. Rev 1 대비 **단순화**.
|
||||||
|
|
||||||
|
### D5. `IpcqDmaToken.src_direction` 유지 (diagnostic only)
|
||||||
|
|
||||||
|
기존 `src_direction` 필드는 제거하지 않는다. 다음 용도로 유지:
|
||||||
|
- Logging / trace: `KERNBENCH_CCL_TRACE=1` 출력의 `(rank, t, dir, nbytes)`
|
||||||
|
- Diagnostics: pointer_dump 등에서 direction 표시
|
||||||
|
- 미래 확장 여지
|
||||||
|
|
||||||
|
Runtime matching은 `dst_addr`만 사용.
|
||||||
|
|
||||||
|
### D6. Invariants (ADR-0023 I3 강화)
|
||||||
|
|
||||||
|
**I3 (엄격)**: 각 방향 pair `(my_direction, peer_direction)`에 대해 my
|
||||||
|
rx_base와 peer rx_base는 **별개의 direction slot**을 가리켜야 함. Install은
|
||||||
|
이를 보장해야 한다 (reverse_direction opposite-preference).
|
||||||
|
|
||||||
|
**I3.1 (신규)**: 모든 qp에 대해 `qp["my_rx_base_pa"]`와 `qp["peer"].rx_base_pa`는
|
||||||
|
서로 disjoint한 주소 range를 점유한다 (다른 direction의 buffer는 절대 겹치지
|
||||||
|
않음). 이것이 D2/D3의 주소-기반 매칭의 전제.
|
||||||
|
|
||||||
|
Install time에 검증 가능:
|
||||||
|
```python
|
||||||
|
# ccl/install_plan.py: build_install_plans 끝에 assertion
|
||||||
|
all_rx_ranges = set()
|
||||||
|
for plan in plans:
|
||||||
|
for pe_install in plan.pe_installs:
|
||||||
|
for entry in pe_install.neighbors:
|
||||||
|
r = (entry.my_rx_base_pa,
|
||||||
|
entry.my_rx_base_pa + plan.n_slots * plan.slot_size)
|
||||||
|
overlap = any(_ranges_overlap(r, e) for e in all_rx_ranges)
|
||||||
|
assert not overlap
|
||||||
|
all_rx_ranges.add(r)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023의 runtime 매칭 로직 수정
|
||||||
|
(D2, D3) + install heuristic 개선 (D1). IPCQ 프로토콜의 semantic layer
|
||||||
|
변경은 없음.
|
||||||
|
- **ADR-0024** (launcher): 2-rank bidirectional ring이 실제 쓰이는 경우가
|
||||||
|
ADR-0024의 ws=SIP_count 모델. 본 ADR이 그 케이스를 작동시킴.
|
||||||
|
- **ADR-0030** (PhysAddr transition, stub): **독립적** — ADR-0025의
|
||||||
|
주소-기반 매칭은 현재 synthetic 주소든 PhysAddr이든 동일하게 작동.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **IPCQ 주소 체계를 PhysAddr로 전환**: ADR-0030 scope. 본 ADR은 주소가 어떻게
|
||||||
|
인코딩되는가와 무관.
|
||||||
|
- **Multi-hop routing**: ADR-0023 D5의 single-hop DMA write 전제 유지.
|
||||||
|
- **Unidir ring 특수화**: `ring_1d_unidir`는 direction 하나만 있으므로 본 버그
|
||||||
|
무관.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **주소 매칭 성능**: `_handle_meta_arrival`과 `_credit_worker`가 qp를 선형
|
||||||
|
순회 (max 4 direction). 성능 영향 무시 가능 수준. 문제 시 dict lookup으로
|
||||||
|
전환 가능 (`_qp_by_rx_base`).
|
||||||
|
- **`IpcqDmaToken.src_direction` 필요성 재평가**: diagnostic 용도로만 남긴
|
||||||
|
필드를 계속 유지할지, 또는 logging 외부로 분리할지. 현재는 유지.
|
||||||
|
- **Install-time invariant 검증 cost**: D6의 I3.1 검증은 O((N_PE × N_direction)^2).
|
||||||
|
대형 topology에서 느려질 수 있음 → interval tree 등 자료구조로 개선 가능.
|
||||||
|
단순 구현 먼저.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. Unit — `reverse_direction` opposite-preference
|
||||||
|
|
||||||
|
`tests/test_ccl_install.py` (확장):
|
||||||
|
- Ring ws=2: `reverse_direction(0, 1, "E")` → "W", `reverse_direction(0, 1, "W")` → "E"
|
||||||
|
- Ring ws=4: `reverse_direction(0, 1, "E")` → "W" (자연스러운 opposite)
|
||||||
|
- Mesh 2×2: `reverse_direction(r, peer, "N")` → "S", "E" ↔ "W"
|
||||||
|
- Tree binary: opposite 없는 direction (parent) → fallback 경로
|
||||||
|
- Non-symmetric topology: opposite가 peer에 없고 다른 direction만 있는 경우
|
||||||
|
|
||||||
|
### T2. Runtime — `_handle_meta_arrival` dst_addr 매칭
|
||||||
|
|
||||||
|
`tests/test_pe_ipcq.py` (확장):
|
||||||
|
- 2-rank pair install 후, E direction dst_addr로 meta arrival → E의 `peer_head_cache`
|
||||||
|
증가 (W는 불변)
|
||||||
|
- W direction dst_addr로 meta arrival → W의 `peer_head_cache` 증가
|
||||||
|
- 잘못된 dst_addr (어느 rx range에도 속하지 않음) → 에러 또는 silent drop
|
||||||
|
(결정 후 명시)
|
||||||
|
|
||||||
|
### T3. Credit — `dst_rx_base_pa` 매칭
|
||||||
|
|
||||||
|
`tests/test_pe_ipcq.py` (확장):
|
||||||
|
- E direction send 후 peer가 consume → credit에 자기 W의 `my_rx_base_pa`
|
||||||
|
담아 송신 → sender의 E direction `peer_tail_cache` 증가
|
||||||
|
- W direction도 동일
|
||||||
|
|
||||||
|
### T4. E2E — 2-rank bidirectional ring
|
||||||
|
|
||||||
|
`tests/test_ipcq_e2e.py`:
|
||||||
|
- 2-rank ring_1d로 tl.send(E) + tl.recv(W) pattern이 양방향으로 작동
|
||||||
|
- ADR-0024의 `test_ccl_allreduce_matrix.py`에서 ring at ws=2가 통과
|
||||||
|
|
||||||
|
### T5. Install invariant — rx_base range disjointness
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (확장):
|
||||||
|
- I3.1 검증: `build_install_plans` 결과에서 모든 qp의 rx_base range가 disjoint
|
||||||
|
|
||||||
|
### T6. 회귀
|
||||||
|
|
||||||
|
- 기존 ws≥3 ring / mesh / tree 테스트 그대로 통과
|
||||||
|
- `test_pe_ipcq`, `test_ipcq_e2e` 기존 케이스 회귀
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **단순함**: `peer_direction` 이중 메타데이터 제거. 주소가 single source of truth.
|
||||||
|
- **Unambiguous matching**: 모든 topology (direction 중복 포함)에서 동작.
|
||||||
|
- **Schema 변경 최소**: `IpcqInitEntry` 불변, `IpcqCreditMetadata`에 1 필드 추가.
|
||||||
|
- **PhysAddr 전환 (ADR-0030) 독립**: 주소-기반 매칭은 주소 인코딩 방식과 무관.
|
||||||
|
- **Diagnostic 유지**: `IpcqDmaToken.src_direction`은 로깅 용도로 존치.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- Runtime 매칭이 주소 비교로 바뀌어서 디버깅 시 "왜 peer_head_cache[E]가 아닌
|
||||||
|
W가 업데이트됐나" 같은 질문에 address range를 추적해야 함 (기존엔 direction
|
||||||
|
이름으로 충분). 해결: pointer_dump에 "direction ↔ rx_base_pa" 매핑 포함.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- IPCQ protocol의 semantic layer (sender가 dst_addr 계산, receiver가 수신)는
|
||||||
|
불변.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/ccl/install.py` | D1: `reverse_direction`에 `my_dir` 인자 추가, opposite-preference |
|
||||||
|
| `src/kernbench/components/builtin/pe_ipcq.py` | D2: `_handle_meta_arrival` dst_addr 매칭 / D3: `_credit_worker` dst_rx_base_pa 매칭 / `_delayed_credit_send`가 `dst_rx_base_pa` 필드 채움 |
|
||||||
|
| `src/kernbench/common/ipcq_types.py` | D3: `IpcqCreditMetadata`에 `dst_rx_base_pa` 필드 추가 |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` (ADR-0024 신규) | D6: I3.1 invariant 검증 (optional) |
|
||||||
|
| `docs/adr/ADR-0023-ipcq-pe-collective.md` | Reference note: runtime 매칭 방식이 ADR-0025에서 바뀜 |
|
||||||
|
| `tests/test_ccl_install.py` | T1 |
|
||||||
|
| `tests/test_pe_ipcq.py` | T2, T3 |
|
||||||
|
| `tests/test_ipcq_e2e.py` | T4 |
|
||||||
|
| `tests/test_ccl_install_plan.py` | T5 |
|
||||||
@@ -0,0 +1,476 @@
|
|||||||
|
# ADR-0026: DPPolicy = Intra-Device Only — sip/num_sips 필드 제거
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted (Revision 5 — Phase 2 landed 2026-04-14, 523 passed + 1 strict xfail)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
`DPPolicy`를 **한 device(SIP) 내부의 cube × PE 분산**만 표현하는 순수한
|
||||||
|
intra-device 추상화로 명확화한다. SIP 간 분산(TP)은 별도 레이어로 분리
|
||||||
|
(ADR-0024의 `torch.ahbm.set_device(rank)` 또는 ADR-0027의 Megatron parallel
|
||||||
|
layers가 담당).
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
`src/kernbench/policy/placement/dp.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class DPPolicy:
|
||||||
|
sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
num_pes: int | None = None
|
||||||
|
num_cubes: int | None = None
|
||||||
|
num_sips: int | None = None # ← 제거 대상
|
||||||
|
```
|
||||||
|
|
||||||
|
`sip` / `num_sips` 필드는 텐서를 SIP 경계 **너머**로 분산하는 경로를 제공함.
|
||||||
|
이는:
|
||||||
|
|
||||||
|
- **ADR-0024의 launcher 모델과 충돌**: ADR-0024는 "rank = SIP = 1 worker per SIP"
|
||||||
|
모델. 각 worker가 자기 SIP에 텐서를 생성. 텐서가 여러 SIP에 걸치는 경우는
|
||||||
|
Megatron-style TP가 개별 primitive로 처리해야 함.
|
||||||
|
- **사용자 의도와 불일치**: "DPPolicy는 한 디바이스 내에서 PE들로 분산하는 방법"
|
||||||
|
(사용자 진술).
|
||||||
|
- **개념 혼동**: `DPPolicy.sip="column_wise"`는 실제로 **TP**. 이름이 DP인데
|
||||||
|
하는 일은 TP → 신규 사용자에게 혼란.
|
||||||
|
|
||||||
|
### 영향받는 call site (rollback 시점 grep 결과)
|
||||||
|
|
||||||
|
**생성 사이트** (`DPPolicy(sip=...` 또는 `num_sips=...`):
|
||||||
|
- `tests/test_runtime_api_tensor.py`
|
||||||
|
- `benches/ccl_allreduce.py` (ADR-0024 scope 내에서 이미 개편됨)
|
||||||
|
- `tests/test_va_offset.py`
|
||||||
|
- `benches/va_offset_verify.py`
|
||||||
|
- `tests/test_sip_parallel.py`
|
||||||
|
|
||||||
|
**참조 사이트** (`dp.sip`, `policy.sip`, `num_sips` 등):
|
||||||
|
- `src/kernbench/runtime_api/context.py` (`_create_tensor`, `launch`)
|
||||||
|
- `src/kernbench/components/builtin/pe_cpu.py`
|
||||||
|
- `src/kernbench/components/legacy/builtin/pe_cpu.py`
|
||||||
|
- `src/kernbench/policy/placement/dp.py` (구현 자체)
|
||||||
|
- `tests/test_tensor.py`, `tests/test_ipcq_types.py`
|
||||||
|
|
||||||
|
**핵심 테스트**: `test_sip_parallel.py`는 이름 그대로 "SIP 병렬성을 DPPolicy로
|
||||||
|
표현하는" 테스트. 이 ADR 이후 **새 launcher 모델로 재작성** 필요.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. `DPPolicy`에서 `sip` + `num_sips` 필드 제거
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class DPPolicy:
|
||||||
|
"""Intra-device (cube × PE) data-parallel policy.
|
||||||
|
|
||||||
|
SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
|
||||||
|
(ADR-0024 D10) and, for model-level TP, by Megatron-style parallel
|
||||||
|
layers (ADR-0027). DPPolicy does not cross SIP boundaries.
|
||||||
|
"""
|
||||||
|
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
num_pes: int | None = None
|
||||||
|
num_cubes: int | None = None
|
||||||
|
```
|
||||||
|
|
||||||
|
제거되는 필드: `sip`, `num_sips`.
|
||||||
|
|
||||||
|
### D2. `ShardSpec` — structural (sip, cube, pe) 좌표, `pe_index` 완전 제거
|
||||||
|
|
||||||
|
현재 `ShardSpec.pe_index`는 **global flat index** (`sip × cubes × pes + cube ×
|
||||||
|
pes + pe`). 이는 ADR-0024 D11이 "abstraction leakage"로 지적한 형태.
|
||||||
|
|
||||||
|
본 ADR에서 ShardSpec을 **structural 좌표로 재정의**하고, `pe_index`는
|
||||||
|
property로도 **남기지 않는다**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/policy/placement/dp.py (after)
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ShardSpec:
|
||||||
|
"""Structural shard placement — intra-SIP (cube × PE) coord.
|
||||||
|
|
||||||
|
Global-flat `pe_index` was removed in ADR-0026. Callers must use
|
||||||
|
structural coords (sip, cube, pe) directly. If a flat integer key is
|
||||||
|
needed (e.g. dict lookup), compute it explicitly at the call site.
|
||||||
|
"""
|
||||||
|
sip: int # structural — which SIP this shard lives on
|
||||||
|
cube: int # local within SIP
|
||||||
|
pe: int # local within cube
|
||||||
|
offset_bytes: int
|
||||||
|
nbytes: int
|
||||||
|
```
|
||||||
|
|
||||||
|
**핵심 원칙**:
|
||||||
|
- ShardSpec의 정체성은 `(sip, cube, pe)` 3튜플.
|
||||||
|
- **`pe_index` property도 없음** — silent semantics drift 차단.
|
||||||
|
- Global flat을 기대한 기존 호출자는 `.pe_index` 접근 시 **즉시
|
||||||
|
`AttributeError`** → 반드시 구조적 좌표로 migration.
|
||||||
|
- Flat integer key가 필요한 국소 문맥 (예: 내부 dict lookup)은 호출자가
|
||||||
|
명시적으로 `spec.sip * N_CUBES * N_PE + spec.cube * N_PE + spec.pe`를 계산.
|
||||||
|
|
||||||
|
**Property 제거 정당화**: KernBench는 사내 프로젝트로 call site가 한정되어
|
||||||
|
있음. Silent drift 위험 (의미만 바뀌고 타입은 같은 int) 대비 explicit breakage
|
||||||
|
(AttributeError)가 훨씬 안전.
|
||||||
|
|
||||||
|
### D3. `resolve_dp_policy`가 `target_sip`을 받아 structural 좌표 생성
|
||||||
|
|
||||||
|
ADR-0024 D11의 계약 구현. Post-hoc shifting 없음.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/policy/placement/dp.py (after)
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _LocalPeShard:
|
||||||
|
"""Internal — PE resolver의 반환. Cube 내 local PE 식별자 + payload."""
|
||||||
|
local_pe: int # cube-local PE index (0..num_pe-1)
|
||||||
|
offset_bytes: int
|
||||||
|
nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_dp_policy(
|
||||||
|
policy: DPPolicy,
|
||||||
|
*,
|
||||||
|
shape: tuple[int, int],
|
||||||
|
itemsize: int,
|
||||||
|
num_pe: int,
|
||||||
|
num_cubes: int = 1,
|
||||||
|
target_sip: int, # NEW — 어느 SIP에 배치할지 명시
|
||||||
|
) -> list[ShardSpec]:
|
||||||
|
"""2-level resolution (cube × PE) on a specified SIP.
|
||||||
|
|
||||||
|
Returns ShardSpecs with structural coords (sip=target_sip, cube, pe).
|
||||||
|
No SIP-level split — DPPolicy is intra-device only.
|
||||||
|
"""
|
||||||
|
resolver = _PE_RESOLVERS[policy.pe]
|
||||||
|
all_shards: list[ShardSpec] = []
|
||||||
|
|
||||||
|
# Level 1: cube within SIP
|
||||||
|
cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
|
||||||
|
|
||||||
|
for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
|
||||||
|
# Level 2: PE within cube — resolver returns _LocalPeShard (local_pe)
|
||||||
|
local_shards = resolver(shape=cube_shape, itemsize=itemsize,
|
||||||
|
num_pe=num_pe)
|
||||||
|
|
||||||
|
for ls in local_shards:
|
||||||
|
all_shards.append(ShardSpec(
|
||||||
|
sip=target_sip, # from caller (current_device)
|
||||||
|
cube=cube_id, # local within SIP
|
||||||
|
pe=ls.local_pe, # local within cube (explicit name)
|
||||||
|
offset_bytes=cube_offset + ls.offset_bytes,
|
||||||
|
nbytes=ls.nbytes,
|
||||||
|
))
|
||||||
|
|
||||||
|
return all_shards
|
||||||
|
```
|
||||||
|
|
||||||
|
**내부 resolver** (`column_wise`, `row_wise`, `replicate`)는 `_LocalPeShard`
|
||||||
|
리스트 반환 — `local_pe` 필드명으로 **"cube-local PE identifier"임이 명시적**.
|
||||||
|
과거 `ShardSpec.pe_index`와 이름이 혼동되던 문제 해소.
|
||||||
|
|
||||||
|
**이름 규약 정리** (전체 ADR):
|
||||||
|
- `ShardSpec.pe`: 최종 외부 API — cube-local PE (structural coord)
|
||||||
|
- `_LocalPeShard.local_pe`: 내부 resolver 단계의 동일 의미
|
||||||
|
- `pe_index`: **제거**. 외부/내부 어디에도 남기지 않는다 (silent drift 차단의
|
||||||
|
부가 효과: 이름 재등장 없음).
|
||||||
|
|
||||||
|
### D4. `_create_tensor` — 구조적 좌표로 직접 placement
|
||||||
|
|
||||||
|
ADR-0024 D11 연속선. Post-hoc shifting 제거, 구조적 좌표를 `resolve_dp_policy`
|
||||||
|
호출 시점에 직접 지정.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# context.py _create_tensor (after)
|
||||||
|
current_sip = self.ahbm.current_device()
|
||||||
|
if current_sip is None:
|
||||||
|
# Single-driver fallback (ADR-0024 D9와 일관).
|
||||||
|
# Launcher 기반 코드가 set_device()를 빼먹으면 조용히 SIP 0에 박히는
|
||||||
|
# 문제가 있음 → debug mode에서 경고.
|
||||||
|
if os.environ.get("KERNBENCH_DEBUG"):
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
"torch.ahbm.current_device() is None; defaulting to SIP 0. "
|
||||||
|
"If this is a multi-rank launcher context, you likely forgot "
|
||||||
|
"torch.ahbm.set_device(rank) inside the worker.",
|
||||||
|
stacklevel=2,
|
||||||
|
)
|
||||||
|
current_sip = 0
|
||||||
|
|
||||||
|
placement = resolve_dp_policy(
|
||||||
|
dp,
|
||||||
|
shape=shape_2d,
|
||||||
|
itemsize=itemsize,
|
||||||
|
num_pe=eff_num_pe,
|
||||||
|
num_cubes=eff_num_cubes,
|
||||||
|
target_sip=current_sip, # ← 구조적 좌표 일차 지정
|
||||||
|
)
|
||||||
|
|
||||||
|
# placement의 각 ShardSpec은 이미 (sip=current_sip, cube=local, pe=local) 포함.
|
||||||
|
# 과거의 post-hoc shifting 블록은 완전히 제거.
|
||||||
|
```
|
||||||
|
|
||||||
|
**모든** 텐서가 current device SIP에 배치됨. Multi-SIP 텐서를 만들고 싶으면
|
||||||
|
ADR-0027의 TP primitive 사용.
|
||||||
|
|
||||||
|
**Single-driver fallback의 trade-off**: set_device 없는 호출에서 SIP 0으로
|
||||||
|
default는 기존 single-driver 테스트 호환을 위해 유지. `KERNBENCH_DEBUG=1`
|
||||||
|
환경에서는 launcher 컨텍스트의 실수로 set_device 누락 시 조용히 잘못된 SIP에
|
||||||
|
배치되는 것을 감지할 수 있도록 warning.
|
||||||
|
|
||||||
|
### D5. Downstream — allocator lookup은 구조적 tuple key로
|
||||||
|
|
||||||
|
기존 `deploy_tensor` (`src/kernbench/runtime_api/tensor.py`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
for spec in placement:
|
||||||
|
alloc = allocators[spec.pe_index] # ← AttributeError (property 제거됨)
|
||||||
|
```
|
||||||
|
|
||||||
|
`pe_index`가 없어졌으므로 구조적 좌표로 **강제** migration:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for spec in placement:
|
||||||
|
alloc = allocators[(spec.sip, spec.cube, spec.pe)]
|
||||||
|
```
|
||||||
|
|
||||||
|
`_ensure_allocators`의 dict population도 tuple key로:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# context.py _ensure_allocators (after)
|
||||||
|
for sip_id in sip_range:
|
||||||
|
for cube_id in range(cubes_per_sip):
|
||||||
|
for pe_id in range(pes_per_cube):
|
||||||
|
self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
|
||||||
|
rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`_free_tensor`도 동일: 기존 `flat_idx = sip * ... + cube * ... + pe` 계산
|
||||||
|
블록 제거, `(shard.sip, shard.cube, shard.pe)` 직접 사용.
|
||||||
|
|
||||||
|
**Tuple vs dataclass `PEIdentity`**: Tuple이 단순하고 hashable로 바로 써서
|
||||||
|
권고. `PEIdentity` 값객체는 명시적 타입 장점은 있지만 boilerplate가 크고 현재
|
||||||
|
allocator dict의 유일한 key라 오버엔지니어링. Tuple 유지.
|
||||||
|
|
||||||
|
### D6. Migration — 기존 call site
|
||||||
|
|
||||||
|
**(A) `DPPolicy(sip=..., num_sips=..., ...)` 사용하던 코드**:
|
||||||
|
|
||||||
|
- `DPPolicy(sip="column_wise", cube=..., pe=...)` 패턴 → **해당 bench를 ADR-0024
|
||||||
|
launcher로 재작성**. worker가 `set_device(rank)`로 SIP 선택, DPPolicy는
|
||||||
|
cube/PE만.
|
||||||
|
- `DPPolicy(sip="replicate", num_sips=1, ...)` 패턴 → `DPPolicy(cube=..., pe=...)`로
|
||||||
|
축소 (필드가 사라지니 자연스럽게).
|
||||||
|
|
||||||
|
**(B) `dp.sip`, `dp.num_sips` 읽던 코드**:
|
||||||
|
|
||||||
|
- 제거. `launch()`의 `_compute_local_shape`에서 `dp.sip` 분기 삭제.
|
||||||
|
- `pe_cpu.py`가 `dp.sip`을 참조하던 곳도 정리.
|
||||||
|
|
||||||
|
**(C) `ShardSpec.pe_index`를 사용하던 코드 — 전부 수정 필요**:
|
||||||
|
|
||||||
|
- `.pe_index` 접근은 이제 `AttributeError` 발생 → 모든 call site 수정 필수.
|
||||||
|
- Allocator lookup: `allocators[spec.pe_index]` →
|
||||||
|
`allocators[(spec.sip, spec.cube, spec.pe)]`
|
||||||
|
- Flat integer가 꼭 필요한 국소 문맥: `spec.sip * N_CUBES * N_PE + spec.cube *
|
||||||
|
N_PE + spec.pe` 명시적 계산. **국소 변수로만 사용하고 공개 API에 노출하지
|
||||||
|
않는다**.
|
||||||
|
|
||||||
|
**구현 착수 전 grep audit 체크리스트**:
|
||||||
|
|
||||||
|
1. **Property 참조**:
|
||||||
|
- `\.pe_index\b` — 필드/property 접근 모두 (regex)
|
||||||
|
- `pe_index=` — 생성 시점의 키워드 인자
|
||||||
|
- `pe_index:` — dataclass 필드 선언
|
||||||
|
2. **Allocator / dict indexing**:
|
||||||
|
- `allocators\[` — dict lookup 패턴. `allocators[spec.pe_index]` 같은
|
||||||
|
것이 걸리는지
|
||||||
|
- `_allocators\[` — 같은 패턴 (prefix _)
|
||||||
|
3. **Flat index 수동 계산 블록**:
|
||||||
|
- `flat_idx =`
|
||||||
|
- `pe_index =` (좌변)
|
||||||
|
- `* pes_per_cube +` (전형적 flat 계산 패턴)
|
||||||
|
- `* self._num_cubes \* self._pes_per_cube` (global flat 계산)
|
||||||
|
4. **Serialization / logging**:
|
||||||
|
- `asdict(.*shard` — dataclass 직렬화 시 `pe_index` 자동 포함 여부
|
||||||
|
- `repr(.*ShardSpec` — 로그 포맷에서 의존하는지
|
||||||
|
- JSON/YAML 저장 포맷에서 `pe_index` 키 사용 여부
|
||||||
|
5. **Tests asserting integer PE identity**:
|
||||||
|
- `assert .*pe_index` — 정수 동일성 주장
|
||||||
|
- `spec.pe_index ==` — 비교 (`pe_index` 제거 후에는 `AttributeError` 발생 → `(spec.sip, spec.cube, spec.pe)` 구조적 좌표 비교로 교체)
|
||||||
|
|
||||||
|
각 match마다 "이 호출자가 global flat / SIP-local / 내부 lookup 중 무엇을
|
||||||
|
기대했나"를 판단한 뒤 구조적 좌표로 교체.
|
||||||
|
|
||||||
|
**(D) `test_sip_parallel.py`**:
|
||||||
|
|
||||||
|
- 이름 유지, 내용은 ADR-0024의 multi-greenlet launcher 기반 재작성.
|
||||||
|
- "SIP 병렬성 = rank 별 worker × 각자 DPPolicy" 로 검증.
|
||||||
|
|
||||||
|
**(E) `test_va_offset.py`, `benches/va_offset_verify.py`**:
|
||||||
|
|
||||||
|
- `num_sips=1`만 쓰는 경우가 대부분. 단순히 필드 제거.
|
||||||
|
- SIP offset 테스트가 핵심이면 `set_device(rank)` + 구조적 좌표 관찰로 이식.
|
||||||
|
|
||||||
|
### D7. 하위 호환 — 불가 (cleanup ADR)
|
||||||
|
|
||||||
|
이 ADR은 **breaking change**.
|
||||||
|
|
||||||
|
1. `DPPolicy(sip=...)` 또는 `DPPolicy(num_sips=...)` 호출 → `TypeError`
|
||||||
|
2. `ShardSpec.pe_index` 접근 → `AttributeError`
|
||||||
|
|
||||||
|
모두 **즉시 명시적 breakage**. Deprecation warning / fallback 경로 없음.
|
||||||
|
KernBench는 사내 프로젝트로 call site가 한정되어 있어 한 번에 migration.
|
||||||
|
|
||||||
|
**Silent drift 차단**이 property 완전 제거의 주된 이점: global flat을 기대한
|
||||||
|
코드가 SIP-local 결과를 받아 조용히 잘못된 인덱싱을 할 가능성 제거.
|
||||||
|
|
||||||
|
### D8. 문서 업데이트
|
||||||
|
|
||||||
|
- `ADR-0008` (tensor deploy) — DPPolicy 의미 갱신 note, ShardSpec 구조적 좌표
|
||||||
|
전환 명시
|
||||||
|
- DPPolicy docstring에 "intra-device only" 명시 (D1 코드 스니펫의 docstring)
|
||||||
|
- ShardSpec docstring에 **structural coordinates `(sip, cube, pe)`를 직접
|
||||||
|
사용하며, `pe_index`는 더 이상 제공되지 않음**을 명시 (D2)
|
||||||
|
- `docs/ccl-author-guide` 등 튜토리얼에서 `sip=...` 예시 제거
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0024** (launcher): `set_device(rank)` 및 current-device scoping이
|
||||||
|
SIP 배치 메커니즘 제공. 본 ADR은 그 위에 서서 DPPolicy를 순수 intra-device로
|
||||||
|
좁힘.
|
||||||
|
- **ADR-0027** (Megatron TP): 다중 SIP에 걸친 텐서가 필요한 경우의 대안 경로.
|
||||||
|
이 ADR 적용 후 multi-SIP use case는 ADR-0027로 이관.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **`DPPolicy.cube` / `pe` 재설계**: 기존 replicate/column_wise/row_wise 의미
|
||||||
|
유지.
|
||||||
|
- **Tiling 정책 통합**: `tiled_column_major` / `tiled_row_major`는 그대로.
|
||||||
|
- **Multi-device 텐서 추상화 신규**: DTensor-like는 ADR-0028.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **`_create_tensor`의 current_sip 기본값**: set_device 없는 호출에서 rank=0
|
||||||
|
(SIP 0)로 fallback할지, 아니면 error 낼지. 권고는 fallback (기존 single-driver
|
||||||
|
테스트와의 호환).
|
||||||
|
- **`test_sip_parallel.py` 재작성 범위**: 기존 단위 테스트의 의도를 유지하며
|
||||||
|
launcher 기반으로 옮기려면 추가 fixture 필요. 별도 작업으로 scope.
|
||||||
|
- **`DPPolicy`의 `num_sips=None` 의미**: 필드가 없어지면 `num_sips` 개념 자체가
|
||||||
|
사라짐. Multi-SIP을 표현하고 싶으면 ADR-0027의 TP primitive를 쓰라는 것이
|
||||||
|
명시적 답.
|
||||||
|
|
||||||
|
**Resolved (이전 rev에서 open이었던 것들)**:
|
||||||
|
- ~~`ShardSpec.pe_index` property 존치 여부~~ → **완전 제거** (D2)
|
||||||
|
- ~~`_ensure_allocators` dict key 형식~~ → **tuple `(sip, cube, pe)`** (D5)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. 단위 테스트 갱신
|
||||||
|
|
||||||
|
- `tests/test_tensor.py`, `tests/test_ipcq_types.py`, `tests/test_runtime_api_tensor.py`
|
||||||
|
— DPPolicy 생성자 인자 정리, ShardSpec 구조적 좌표 검증
|
||||||
|
- `tests/test_va_offset.py` — `num_sips=1` 제거 후 동작 유지
|
||||||
|
|
||||||
|
### T2. `resolve_dp_policy` 구조적 좌표 반환
|
||||||
|
|
||||||
|
`tests/test_dp_policy.py` (new 또는 확장):
|
||||||
|
- `resolve_dp_policy(dp, ..., target_sip=1)` 결과의 모든 ShardSpec이 `sip=1`
|
||||||
|
- 각 spec의 `(cube, pe)`가 local (0..num_cubes-1, 0..num_pe-1)
|
||||||
|
- 같은 topology에서 `target_sip=0`과 `target_sip=1` 결과가 sip 필드만 다름
|
||||||
|
|
||||||
|
### T3. `test_sip_parallel.py` 재작성
|
||||||
|
|
||||||
|
SIP 병렬성 검증을 launcher 기반으로:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_sip_parallel_via_launcher(topology):
|
||||||
|
...
|
||||||
|
def worker(rank, ws, torch):
|
||||||
|
torch.ahbm.set_device(rank)
|
||||||
|
t = torch.zeros((1, 128), dtype="f16",
|
||||||
|
dp=DPPolicy(cube="column_wise", pe="column_wise"))
|
||||||
|
# verify shard.sip == rank (structural coord)
|
||||||
|
|
||||||
|
spawn(worker, nprocs=n_sips, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
### T4. Allocator key migration
|
||||||
|
|
||||||
|
`tests/test_allocator_structural_key.py` (new 또는 기존 확장):
|
||||||
|
- `PEMemAllocator` dict이 `(sip, cube, pe)` tuple key로 작동
|
||||||
|
- `deploy_tensor`가 구조적 좌표로 allocator lookup
|
||||||
|
- `_free_tensor`도 동일
|
||||||
|
|
||||||
|
### T5. E2E 회귀
|
||||||
|
|
||||||
|
ADR-0024의 `test_ccl_allreduce_matrix.py` 그대로 통과.
|
||||||
|
|
||||||
|
### T6. 오류 검증
|
||||||
|
|
||||||
|
- `DPPolicy(sip="column_wise")` 호출 → `TypeError`. 테스트로 명시.
|
||||||
|
- `DPPolicy(num_sips=2)` 호출 → `TypeError`.
|
||||||
|
- `spec.pe_index` 접근 → `AttributeError` (property 완전 제거 검증).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **개념 분리 명확**: DPPolicy = intra-device, TP = inter-device.
|
||||||
|
- **API 단순화**: DPPolicy 생성자 필드 ~33% 축소.
|
||||||
|
- **Structural 좌표 일관성**: ShardSpec이 `(sip, cube, pe)` 튜플로 표현 →
|
||||||
|
abstraction leakage 해소 (ADR-0024 D11 계약 충족).
|
||||||
|
- **PE 식별 모호성 해소**: `pe_index` 제거로 구조적 좌표 `(sip, cube, pe)`가 단일 해석. Global flat이 필요하면 호출 지점에서 명시적으로 계산.
|
||||||
|
- **Launcher 모델 일관성**: ADR-0024의 "1 worker per SIP" 모델이 유일한 SIP
|
||||||
|
경계 제어 메커니즘.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- **Breaking change (explicit)**: `DPPolicy(sip=...)` → `TypeError`,
|
||||||
|
`spec.pe_index` → `AttributeError`. 모든 호출자 한 번에 수정 필요.
|
||||||
|
- **ShardSpec schema 변경**: `pe_index` 단일 필드 → `sip`/`cube`/`pe` 세 필드.
|
||||||
|
Downstream (`deploy_tensor`, `_free_tensor`, `_ensure_allocators`,
|
||||||
|
`allocators` dict key 등) 연쇄 수정.
|
||||||
|
- **Silent drift 없음**: property 완전 제거로 runtime에서 즉시 실패 →
|
||||||
|
migration leakage 원천 차단. (Negative가 아니라 explicit tradeoff)
|
||||||
|
- `test_sip_parallel.py` 재작성 비용.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- 기존 `cube` / `pe` 필드 의미 불변.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/policy/placement/dp.py` | D1: `sip`/`num_sips` 제거 / D2: `ShardSpec`에 `sip`/`cube`/`pe` structural fields 추가, **`pe_index` 필드 제거 (property로도 남기지 않음)** / D3: `resolve_dp_policy`에 `target_sip` 인자 추가, SIP-level 루프 제거 / 내부 resolver의 반환 타입을 `_LocalPeShard`로 분리하고 필드명을 `local_pe`로 명확화 (이름 충돌 방지) |
|
||||||
|
| `src/kernbench/runtime_api/context.py` | D4: `_create_tensor` `target_sip` 전달 / D5: `_ensure_allocators` dict key → `(sip, cube, pe)` tuple / `launch`의 `dp.sip` 분기 제거 |
|
||||||
|
| `src/kernbench/runtime_api/tensor.py` | D5: `deploy_tensor`가 구조적 좌표로 allocator lookup |
|
||||||
|
| `src/kernbench/components/builtin/pe_cpu.py` | D6: `dp.sip` 참조 제거 |
|
||||||
|
| `src/kernbench/components/legacy/builtin/pe_cpu.py` | D6: 동일 |
|
||||||
|
| `benches/ccl_allreduce.py` | ADR-0024 scope에서 이미 처리 |
|
||||||
|
| `benches/va_offset_verify.py` | D6: `num_sips=1` 제거 |
|
||||||
|
| `tests/test_runtime_api_tensor.py` | D6 |
|
||||||
|
| `tests/test_va_offset.py` | D6 |
|
||||||
|
| `tests/test_tensor.py`, `tests/test_ipcq_types.py` | D6 |
|
||||||
|
| `tests/test_sip_parallel.py` | T3: launcher 기반 재작성 |
|
||||||
|
| `tests/test_dp_policy.py` (new 또는 확장) | T2 |
|
||||||
|
| `tests/test_allocator_structural_key.py` (new) | T4 |
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,171 @@
|
|||||||
|
# ADR-0028: DTensor Support — 선언적 분산 텐서 (Stub / Future)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Stub (Future Work)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
**선언적 분산 텐서 추상화**(PyTorch 2.x `DTensor` 스타일)를 KernBench에
|
||||||
|
도입하기 위한 **디자인 공간 preliminary exploration**. 본 ADR은 **구현 계획이
|
||||||
|
아닌 future 작업의 파일 플레이스홀더 + 초기 질문 목록**이다.
|
||||||
|
|
||||||
|
### Megatron-style TP와의 차이 (Why DTensor)
|
||||||
|
|
||||||
|
| 관점 | Megatron (ADR-0027) | DTensor (이 ADR) |
|
||||||
|
|---|---|---|
|
||||||
|
| 표현 | 명시적 parallel layer | 텐서 + placement spec |
|
||||||
|
| 호출 형태 | `ColumnParallelLinear(...)` | `distribute_tensor(x, mesh, [Shard(1)])` |
|
||||||
|
| Collective 삽입 | 레이어 내부 명시 | 연산 dispatch가 자동 |
|
||||||
|
| Learning curve | 낮음 (명시적) | 중~높음 (선언적 의미 이해) |
|
||||||
|
| 유연성 | 레이어 단위로 고정 | 레이어 경계 무관, 어디서나 |
|
||||||
|
| KernBench에 선행 필요한 것 | launcher (ADR-0024) + TP (ADR-0027) | 좌측 항목 전부 + operator dispatch layer 신규 구축 |
|
||||||
|
|
||||||
|
DTensor는 operator-level에서 "텐서의 placement를 보고 자동으로 collective
|
||||||
|
삽입". KernBench가 이를 지원하려면 **operator dispatch layer에 placement-aware
|
||||||
|
rewriting**이 들어가야 한다. 이는 비-trivial.
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
- KernBench는 operator dispatch 레이어가 없음 (`torch.matmul`은 없음; kernel
|
||||||
|
launch로 대체).
|
||||||
|
- DPPolicy는 정적 placement metadata를 보유 (ADR-0026 후: intra-device only).
|
||||||
|
- ADR-0024 launcher가 rank / device 개념 제공.
|
||||||
|
- Megatron-style TP (ADR-0027)가 명시적 대안으로 기능할 것.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Preliminary decision space
|
||||||
|
|
||||||
|
### DQ1. PyTorch DTensor API 수용 범위
|
||||||
|
|
||||||
|
- `DeviceMesh`: rank들의 논리적 grid.
|
||||||
|
- `Placements`: `Shard(dim)`, `Replicate()`, `Partial(reduce_op)`.
|
||||||
|
- `distribute_tensor(tensor, device_mesh, placements)`: local tensor → DTensor.
|
||||||
|
- Redistribute: `dt.redistribute(new_placements)`로 collective 자동 삽입.
|
||||||
|
- Operator forward: `dt @ dt`, `dt + dt` 등 → 적절한 collective 자동 dispatch.
|
||||||
|
|
||||||
|
KernBench가 어느 수준까지 지원할지 결정 필요. 최소: `distribute_tensor` +
|
||||||
|
`redistribute`. 최대: 모든 operator overloading.
|
||||||
|
|
||||||
|
### DQ2. Operator dispatch 레이어
|
||||||
|
|
||||||
|
KernBench에서 `dt @ dt`를 정의하려면 Tensor의 `__matmul__`이 placement를
|
||||||
|
보고 적절한 action 수행:
|
||||||
|
|
||||||
|
- 둘 다 replicated → local matmul
|
||||||
|
- A column-sharded, B row-sharded → local matmul + all-reduce (RowParallel)
|
||||||
|
- A replicated, B column-sharded → local matmul (ColumnParallel)
|
||||||
|
- etc.
|
||||||
|
|
||||||
|
이는 Megatron-style의 **자동화된 버전**. Kernel은 기존 matmul kernel 사용.
|
||||||
|
|
||||||
|
### DQ3. DeviceMesh와 기존 topology
|
||||||
|
|
||||||
|
KernBench topology는 이미 SIP/cube/PE 계층. DTensor의 DeviceMesh는 추상
|
||||||
|
`(tp_size, dp_size, ...)` grid. 매핑:
|
||||||
|
|
||||||
|
- 1D mesh of size = SIP count → rank = SIP
|
||||||
|
- 2D mesh (tp × dp) → SIP을 그룹 분할 (pure TP 대신 mixed parallelism)
|
||||||
|
|
||||||
|
초기엔 1D mesh만, DP × TP 2D는 future.
|
||||||
|
|
||||||
|
### DQ4. Placement의 intra-device (DPPolicy) 통합
|
||||||
|
|
||||||
|
KernBench 특이점: 한 rank 내부에서 DPPolicy로 cube/PE에 분산. DTensor는
|
||||||
|
device 내부를 보지 않음. 통합:
|
||||||
|
|
||||||
|
- DTensor placement = rank (SIP) 간 분산
|
||||||
|
- 각 rank의 local tensor는 여전히 DPPolicy로 cube/PE 배치
|
||||||
|
- → DTensor wrapper가 local tensor의 DPPolicy도 보관
|
||||||
|
|
||||||
|
### DQ5. Collective 자동 삽입 지점
|
||||||
|
|
||||||
|
`redistribute` 또는 operator forward 시. ADR-0024의 submit+yield+wait 패턴을
|
||||||
|
자동으로 호출하는 형태. `_launch_submit` 내부화.
|
||||||
|
|
||||||
|
### DQ6. Autograd
|
||||||
|
|
||||||
|
DTensor는 autograd와 상호작용 (backward에서 reverse collective). KernBench가
|
||||||
|
backward 지원하기 전까지는 **forward-only DTensor**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions (to resolve before real design)
|
||||||
|
|
||||||
|
1. **우선순위**: Megatron-style(ADR-0027)이 먼저 안착한 후 DTensor를 위에
|
||||||
|
얹는가, 아니면 공통 lower-layer를 먼저 설계하는가?
|
||||||
|
2. **호환성 목표**: PyTorch DTensor API와 몇 %까지 일치시키는가? 독자 API vs
|
||||||
|
거의 동일?
|
||||||
|
3. **Operator dispatch**: KernBench `Tensor` 클래스에 `__matmul__` 등 연산자
|
||||||
|
overloading을 도입하는가? (현재는 kernel launch만)
|
||||||
|
4. **Redistribute 정책**: `Shard(0) → Replicate()` 변환 시 어떤 collective
|
||||||
|
사용? `all_gather`가 없으면 구현 전까지 제약.
|
||||||
|
5. **Mesh × DPPolicy interaction**: 하나의 DTensor가 2개 layer 분산을 갖는
|
||||||
|
경우의 metadata 표현.
|
||||||
|
6. **Partial placement의 reduce 시점**: 자동 vs 명시 `redistribute` 호출.
|
||||||
|
7. **Bench authoring impact**: 기존 Megatron-style bench가 DTensor 기반으로
|
||||||
|
얼마나 쉽게 포팅되는가?
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals (for future real ADR)
|
||||||
|
|
||||||
|
- 이번 stub에서 API 확정. Future ADR에서 구체화.
|
||||||
|
- Implementation timeline. 이번 round에서는 **설계 공간 매핑만**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies (potential)
|
||||||
|
|
||||||
|
- **ADR-0024** (launcher): rank / device 기반
|
||||||
|
- **ADR-0026** (DPPolicy cleanup): DTensor placement와의 분리 명확화
|
||||||
|
- **ADR-0027** (Megatron TP): 실용 TP 패턴 경험을 DTensor 설계로 환류
|
||||||
|
- **Future ADR** (operator dispatch layer): KernBench Tensor에 operator
|
||||||
|
overloading 도입
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Expected consequences (hypothetical)
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- PyTorch training code 이식이 **매우 쉬워짐** (DTensor 코드 그대로).
|
||||||
|
- TP + DP + 더 복잡한 parallelism을 **하나의 추상화**로 표현.
|
||||||
|
- Collective 삽입이 자동 → bench 작성자 부담 감소.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- Operator dispatch layer 신규 구축 → 상당한 엔지니어링.
|
||||||
|
- Implicit behavior 증가 → 디버깅 / 성능 분석 복잡.
|
||||||
|
- KernBench의 "명시적 kernel launch" 철학과 tension.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Action
|
||||||
|
|
||||||
|
- **Phase 1 (현재)**: 본 stub 유지. Megatron-style (ADR-0027) 먼저 구현 +
|
||||||
|
사용 경험 축적.
|
||||||
|
- **Phase 2 (future)**: 사용 경험을 바탕으로 본 ADR을 real design으로 승격.
|
||||||
|
위 Open questions에 대한 답을 제시.
|
||||||
|
- **Phase 3 (future)**: Implementation.
|
||||||
|
|
||||||
|
현재 구현 작업은 **없음**. 디자인 공간 매핑만.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
본 ADR은 **stub**이므로 production 변경 없음. Future real ADR에서 갱신될
|
||||||
|
파일 후보:
|
||||||
|
|
||||||
|
| File | 예상 변경 (future) |
|
||||||
|
|------|---|
|
||||||
|
| `src/kernbench/dtensor/__init__.py` | 신규 패키지 |
|
||||||
|
| `src/kernbench/dtensor/device_mesh.py` | DeviceMesh |
|
||||||
|
| `src/kernbench/dtensor/placements.py` | Shard/Replicate/Partial |
|
||||||
|
| `src/kernbench/dtensor/api.py` | distribute_tensor, redistribute |
|
||||||
|
| `src/kernbench/dtensor/ops/*.py` | Operator dispatch (matmul 등) |
|
||||||
|
| `src/kernbench/runtime_api/tensor.py` | Tensor에 `__matmul__` 등 추가 |
|
||||||
@@ -0,0 +1,419 @@
|
|||||||
|
# ADR-0029: Hierarchical All-Reduce — 3-level intra/inter-SIP 알고리즘
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
"Rank = SIP" 모델 (ADR-0024) 위에서 각 SIP 내부의 모든 PE를 참여시키는
|
||||||
|
**3-level 계층 all-reduce** 알고리즘을 정의한다. 각 레벨이 서로 다른 물리
|
||||||
|
연결(intra-cube ring, inter-cube NoC, inter-SIP UCIe)을 활용해 대역폭을
|
||||||
|
극대화한다.
|
||||||
|
|
||||||
|
### 왜 hierarchical인가
|
||||||
|
|
||||||
|
단순 ring/mesh/tree all-reduce는 SIP당 1 PE만 참여 (ADR-0024의 `leader_only`
|
||||||
|
mapper). 이는 inter-SIP 단계는 잘 모델링하지만:
|
||||||
|
|
||||||
|
- **Intra-SIP PE가 노는 시간이 발생**. Leader PE가 inter-SIP 통신 중이면
|
||||||
|
나머지 7 PE / 16 cube는 유휴.
|
||||||
|
- **Intra-cube/inter-cube 연결 대역폭 미활용**. Cube NoC는 매우 빠르지만
|
||||||
|
단일 leader 사용 시 이 자원이 노출되지 않음.
|
||||||
|
- **실제 NCCL 등은 hierarchical**: NVLink(intra-node) + InfiniBand(inter-node)
|
||||||
|
의 bandwidth 차이를 활용. KernBench 토폴로지도 동일 구조
|
||||||
|
(intra-cube / inter-cube / inter-SIP의 bandwidth·latency 차이).
|
||||||
|
|
||||||
|
### 현재 상태
|
||||||
|
|
||||||
|
- `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` 이미 존재
|
||||||
|
(git log `10b33b4` — "Tensor indexing + hierarchical 3-level all-reduce
|
||||||
|
kernel"). PE-level로 world_size = total PE를 가정하는 옛 모델 기반 구현.
|
||||||
|
- ADR-0024에 의해 launcher는 rank = SIP로 바뀜.
|
||||||
|
- Hierarchical 커널은 **재해석 필요**: 이제 각 worker(1 per SIP)가 자기 SIP의
|
||||||
|
모든 PE를 참여시키고, kernel은 intra-cube → inter-cube → inter-SIP 순으로
|
||||||
|
3-level reduce + broadcast.
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **ADR-0024 framework 위에 hierarchical 알고리즘 맞추기**
|
||||||
|
- Mapper: `all_pes` (ADR-0024 D5 제공)
|
||||||
|
- Validator: `multi_pe_sip_local` (ADR-0024 D8 제공)
|
||||||
|
- Kernel: 기존 `hierarchical_allreduce.py` 수정 — rank 계산 방식을 SIP 내
|
||||||
|
local (cube, pe)로 바꿈
|
||||||
|
2. **PE-level neighbor graph 생성**
|
||||||
|
- Intra-cube: `(sip, cube, pe) ↔ (sip, cube, pe±1 mod N_PE)` (ring 내부)
|
||||||
|
- Inter-cube: `(sip, cube, 0) ↔ (sip, cube±1 mod N_CUBE, 0)` (cube leader만)
|
||||||
|
- Inter-SIP: `(sip, 0, 0) ↔ (sip±1 mod N_SIP, 0, 0)` (SIP leader만)
|
||||||
|
3. **Tensor layout**: 각 PE가 1 tile을 소유하고 시작 (`multi_pe_sip_local`
|
||||||
|
validator가 이 layout 강제). DPPolicy(cube="column_wise",
|
||||||
|
pe="column_wise")로 달성 가능.
|
||||||
|
4. **PE-level topology 표현 부족** (ADR-0024 D6의 "책임 분산" 이슈 구체화)
|
||||||
|
- Ring/mesh/tree 같은 단순 패턴은 rank-level topology_fn + mapper 조합으로
|
||||||
|
충분.
|
||||||
|
- Hierarchical은 레벨마다 다른 peer 매핑이라 `_build_pe_installs`에서
|
||||||
|
multi-level 해석을 해야 함.
|
||||||
|
- 장기적으로는 topology 모듈이 PE-level을 직접 표현하는 편이 명시적.
|
||||||
|
|
||||||
|
### Non-problem (이 ADR 밖)
|
||||||
|
|
||||||
|
- Launcher / barrier / rank-to-SIP / mapper-validator registry → ADR-0024
|
||||||
|
- IPCQ direction addressing → ADR-0025
|
||||||
|
- DPPolicy 필드 정리 → ADR-0026
|
||||||
|
- Megatron TP → ADR-0027
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. 알고리즘 구조 — 3-level reduce + 역순 broadcast
|
||||||
|
|
||||||
|
```
|
||||||
|
Level 1 (intra-cube, E/W ring):
|
||||||
|
각 cube의 N_PE개 PE가 bidirectional ring reduce → cube 내 PE 0에 부분합 집중
|
||||||
|
Level 2 (inter-cube within SIP, N/S ring, PE 0만 참여):
|
||||||
|
N_CUBE개 cube-leader가 bidirectional ring reduce → SIP 내 (cube 0, PE 0)에
|
||||||
|
SIP 전체 부분합 집중
|
||||||
|
Level 3 (inter-SIP, N_SIP peers, (cube 0, PE 0)만 참여):
|
||||||
|
Ring 또는 pair exchange로 전역 합산 완료
|
||||||
|
Broadcast:
|
||||||
|
역순 — Level 3 결과를 (cube 0, PE 0)에서 SIP 내 모든 cube-leader로, 다시
|
||||||
|
각 cube 내 모든 PE로 전파
|
||||||
|
```
|
||||||
|
|
||||||
|
세부는 기존 `hierarchical_allreduce.py`의 커널 구현과 일치. ADR-0024 이후
|
||||||
|
변경점은 **rank 계산 방식**과 **n_elem 해석**뿐:
|
||||||
|
|
||||||
|
- 기존 (rank=PE 모델): `rank = cube_id * pes_per_cube + local_pe`, `pe_addr =
|
||||||
|
t_ptr + rank * nbytes`
|
||||||
|
- 신규 (rank=SIP 모델): 커널은 SIP-local 좌표 `(cube_id, local_pe)`로만 동작.
|
||||||
|
텐서의 per-PE slice는 backend가 per-PE `TensorArg`로 전달 (ADR-0024 D3).
|
||||||
|
커널 내부 rank 계산 자체가 불필요해짐 — `tl.program_id(0/1)`로 충분.
|
||||||
|
|
||||||
|
### D2. Framework integration — ADR-0024 infrastructure 재활용
|
||||||
|
|
||||||
|
`ccl.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
algorithms:
|
||||||
|
hierarchical_allreduce:
|
||||||
|
module: kernbench.ccl.algorithms.hierarchical_allreduce
|
||||||
|
topology: hierarchical_3level # NEW — D3 참고
|
||||||
|
mapper: all_pes # ADR-0024 D5 built-in
|
||||||
|
validator: multi_pe_sip_local # ADR-0024 D8 built-in
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 128
|
||||||
|
```
|
||||||
|
|
||||||
|
Framework 관점에서 hierarchical은 **특별한 알고리즘이 아니라, 특정
|
||||||
|
topology / mapper / validator 조합**. 본 ADR은 그 조합과 topology 패턴을
|
||||||
|
정의.
|
||||||
|
|
||||||
|
### D3. `hierarchical_3level` topology (신규)
|
||||||
|
|
||||||
|
`kernbench/ccl/topologies.py`에 신규 추가:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def hierarchical_3level(rank: int, world_size: int, spec: dict) -> dict:
|
||||||
|
"""3-level hierarchical neighbor pattern.
|
||||||
|
|
||||||
|
Returns a nested structure describing intra-cube + inter-cube + inter-SIP
|
||||||
|
neighbors. Unlike ring_1d / mesh_2d which are rank → {dir: peer_rank},
|
||||||
|
hierarchical is PE-level and requires spec for cube_mesh / pe_layout.
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
반환 스키마 (초안):
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"intra_cube": {
|
||||||
|
# 각 cube 내 ring neighbors: (cube, pe) → {"E": (cube, pe_e), "W": (cube, pe_w)}
|
||||||
|
...
|
||||||
|
},
|
||||||
|
"inter_cube": {
|
||||||
|
# cube-leader 간 ring: (cube, 0) → {"N": (cube_n, 0), "S": (cube_s, 0)}
|
||||||
|
...
|
||||||
|
},
|
||||||
|
"inter_sip": {
|
||||||
|
# SIP-leader 간: rank → {"parent": peer_rank} (또는 ring 방식)
|
||||||
|
...
|
||||||
|
},
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
이 구조는 `_build_pe_installs`가 해석하여 각 PE의 neighbor table 엔트리
|
||||||
|
(4-direction)에 대응시킨다.
|
||||||
|
|
||||||
|
**Rank-level `topologies.py` 현 API와의 관계**: 기존 단순 패턴은
|
||||||
|
`(rank → {dir: peer_rank})` 단일 레벨. Hierarchical은 multi-level이므로
|
||||||
|
기존 API와 schema가 다름. `_resolve_topology`는 **알고리즘이 어떤 schema를
|
||||||
|
쓰는지 선언**하고, builder가 그에 맞춰 해석하도록 확장 필요 (open question).
|
||||||
|
|
||||||
|
### D4. PE-level neighbor graph — `_build_pe_installs` 확장
|
||||||
|
|
||||||
|
기존 (ring/mesh/tree): topology_fn이 반환한 `(rank → {dir: peer_rank})`를
|
||||||
|
각 참여 PE에 그대로 매핑 (leader_only일 경우 peer PE도 leader).
|
||||||
|
|
||||||
|
신규 (hierarchical): `hierarchical_3level`의 3단 구조를 per-PE neighbor
|
||||||
|
table로 펼침:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _build_pe_installs_hierarchical(rank, world_size, sip, pes, topo, spec):
|
||||||
|
"""Hierarchical 전용 PE neighbor table 빌더."""
|
||||||
|
result = []
|
||||||
|
for (cube, pe) in pes:
|
||||||
|
entries = []
|
||||||
|
# Level 1: intra-cube ring (E/W)
|
||||||
|
for d, peer in topo["intra_cube"][(cube, pe)].items():
|
||||||
|
entries.append(NeighborTableEntry(direction=d, ...))
|
||||||
|
# Level 2: inter-cube ring (N/S) — cube leader (pe == 0)만
|
||||||
|
if pe == 0:
|
||||||
|
for d, peer in topo["inter_cube"][(cube, 0)].items():
|
||||||
|
entries.append(NeighborTableEntry(direction=d, ...))
|
||||||
|
# Level 3: inter-SIP — SIP leader (cube == 0 and pe == 0)만
|
||||||
|
if cube == 0 and pe == 0:
|
||||||
|
for d, peer_rank in topo["inter_sip"][rank].items():
|
||||||
|
# peer_rank → peer SIP의 (0, 0)
|
||||||
|
entries.append(NeighborTableEntry(
|
||||||
|
direction=d, peer_sip=peer_rank, peer_cube=0, peer_pe=0, ...))
|
||||||
|
result.append(PeInstallSpec(cube=cube, pe=pe, neighbors=tuple(entries)))
|
||||||
|
return tuple(result)
|
||||||
|
```
|
||||||
|
|
||||||
|
`build_install_plans`에서 algorithm_config의 `topology`에 따라 적절한 builder
|
||||||
|
선택 (기존 simple builder vs hierarchical builder).
|
||||||
|
|
||||||
|
### D5. Kernel 재해석 — SIP-local 좌표로
|
||||||
|
|
||||||
|
`src/kernbench/ccl/algorithms/hierarchical_allreduce.py`를 ADR-0024 D3에
|
||||||
|
맞춰 수정:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def kernel_args(*, n_elem: int, world_size: int, pes_per_cube: int,
|
||||||
|
cubes_per_sip: int, num_sips: int, **kw) -> tuple:
|
||||||
|
"""world_size (= num_sips), pes_per_cube, cubes_per_sip를 스칼라로."""
|
||||||
|
return (n_elem, pes_per_cube, cubes_per_sip, num_sips)
|
||||||
|
|
||||||
|
def kernel(t_ptr, n_elem, pes_per_cube, cubes_per_sip, num_sips, tl):
|
||||||
|
"""SIP-local 좌표 기반.
|
||||||
|
|
||||||
|
이전 (rank=PE 모델):
|
||||||
|
rank = cube_id * pes_per_cube + local_pe
|
||||||
|
pe_addr = t_ptr + rank * nbytes
|
||||||
|
현재 (rank=SIP 모델):
|
||||||
|
per-PE tensor slice는 backend가 TensorArg로 전달 → t_ptr은 이미 local.
|
||||||
|
intra-cube ring은 tl.program_id(0) 사용.
|
||||||
|
inter-cube ring은 pe_id == 0 조건으로 제한.
|
||||||
|
inter-SIP reduce는 cube_id == 0 and pe_id == 0 조건으로 제한.
|
||||||
|
"""
|
||||||
|
local_pe = tl.program_id(axis=0)
|
||||||
|
cube_id = tl.program_id(axis=1)
|
||||||
|
|
||||||
|
# Level 1: intra-cube ring
|
||||||
|
for _ in range(intra_rounds(pes_per_cube)):
|
||||||
|
tl.send(dir="E", src=acc)
|
||||||
|
recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Level 2: inter-cube (cube leader only)
|
||||||
|
if local_pe == 0:
|
||||||
|
for _ in range(inter_cube_rounds(cubes_per_sip)):
|
||||||
|
tl.send(dir="N", src=acc)
|
||||||
|
recv = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Level 3: inter-SIP (SIP leader only)
|
||||||
|
if local_pe == 0 and cube_id == 0:
|
||||||
|
for _ in range(inter_sip_rounds(num_sips)):
|
||||||
|
tl.send(dir="parent", src=acc)
|
||||||
|
recv = tl.recv(dir="parent", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
|
||||||
|
# Broadcast (reverse chain)
|
||||||
|
# ...
|
||||||
|
tl.store(t_ptr, acc)
|
||||||
|
```
|
||||||
|
|
||||||
|
`kernel_args`는 ADR-0024 D4의 keyword-only signature 계약을 따른다.
|
||||||
|
|
||||||
|
### D6. Validator — `multi_pe_sip_local`
|
||||||
|
|
||||||
|
ADR-0024 D8의 built-in 그대로 활용. `ccl.yaml`에서 `validator:
|
||||||
|
multi_pe_sip_local` 지정 시 backend가 각 SIP에 `cubes_per_sip × pes_per_cube`개
|
||||||
|
shard가 있는지 검증.
|
||||||
|
|
||||||
|
### D7. Bench — 기본 all-reduce bench 확장
|
||||||
|
|
||||||
|
`benches/ccl_allreduce.py`의 worker는 `ccl.yaml`이 `hierarchical_allreduce`를
|
||||||
|
선택하면 자동으로:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Worker 예
|
||||||
|
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||||
|
tensor = torch.zeros((1, intra_sip_pes * n_elem), dp=dp, name="in")
|
||||||
|
# tensor는 각 SIP의 모든 PE에 1 tile씩 분산 (multi_pe_sip_local validator 통과)
|
||||||
|
dist.all_reduce(tensor, op="sum")
|
||||||
|
```
|
||||||
|
|
||||||
|
Worker 코드 자체는 알고리즘 종류를 모름 (`ccl.yaml` 선택에 의존). 단,
|
||||||
|
**DPPolicy가 hierarchical 요구와 일치해야** 함 — `cube/pe="column_wise"`
|
||||||
|
같은 SIP-내 분산을 하는 DPPolicy여야 `multi_pe_sip_local` 검증 통과. 이
|
||||||
|
DPPolicy 선택은 bench 설정 또는 sample bench에서 결정.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0024**: Launcher, `all_pes` mapper, `multi_pe_sip_local` validator,
|
||||||
|
registry + import path. 본 ADR 구현의 전제.
|
||||||
|
- **ADR-0025**: IPCQ direction addressing — cube/pe/SIP 간 다중 direction을
|
||||||
|
동시 사용하므로 정확한 direction 매칭 필수.
|
||||||
|
- **ADR-0023**: IPCQ protocol (neighbor table, send/recv, credit return).
|
||||||
|
- **기존 `hierarchical_allreduce.py`**: 본 ADR은 그 커널의 재해석 + 주변
|
||||||
|
framework integration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **ADR-0024 framework 변경**: 재활용만.
|
||||||
|
- **Alternative reduce topology (tree-in-tree 등)**: 3-level ring이 첫 구현.
|
||||||
|
- **Dynamic level count**: 현재 SIP/cube/PE 3단 고정. 2단 (SIP + PE, cube
|
||||||
|
skip) 또는 4단 이상은 future.
|
||||||
|
- **Bandwidth-optimal schedule tuning**: reduce round 수 / chunk size 조정
|
||||||
|
같은 tuning은 별도.
|
||||||
|
- **Pipelined hierarchical**: 여러 chunk를 파이프라인으로 겹쳐서 돌리는
|
||||||
|
NCCL-style 최적화는 future.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🟠 중간 영향 — 구현 시 결정 필요
|
||||||
|
|
||||||
|
- **`topologies.py` 스키마 확장**: 기존 `ring_1d` 등은 단일 레벨 `(rank →
|
||||||
|
{dir: peer})`. `hierarchical_3level`은 multi-level. `_resolve_topology`가
|
||||||
|
둘을 모두 반환할 수 있도록 schema를 일반화할지, 아니면 hierarchical 전용
|
||||||
|
return type을 두고 builder가 분기할지.
|
||||||
|
- Option A: 모든 topology를 neighbor-list 형태로 단일화
|
||||||
|
(`[{direction, peer_sip, peer_cube, peer_pe}, ...]`)
|
||||||
|
- Option B: topology 모듈이 `kind` 필드 제공, builder가 분기
|
||||||
|
- 권장: Option A (single source of truth, ADR-0024 Open Q의
|
||||||
|
"PE-level topology 일원화" 방향과 일치)
|
||||||
|
|
||||||
|
- **`hierarchical_3level` vs algorithm별 topology 모듈**: 향후 mesh-based
|
||||||
|
hierarchical 등 variant가 생기면? `hierarchical_3level` 같은 이름이 이미
|
||||||
|
topology-specific. 변형은 새 key 추가 (`hierarchical_mesh_3level` 등) 또는
|
||||||
|
알고리즘 모듈에서 topology 생성 override.
|
||||||
|
|
||||||
|
### 🟡 Nice-to-have
|
||||||
|
|
||||||
|
- **Reduce round 수 최적화**: Bidirectional ring은 `ceil((N-1)/2)` round.
|
||||||
|
Non-power-of-2 group size에서 idle PE 발생 가능.
|
||||||
|
- **Non-uniform topology 대응**: cube_mesh가 w != h일 때 inter-cube ring
|
||||||
|
balance.
|
||||||
|
- **Single SIP 케이스**: world_size = 1 (SIP 1개)일 때 Level 3 skip. Degenerate
|
||||||
|
case 검증.
|
||||||
|
|
||||||
|
### 🟢 Framework evolution 시사점 (ADR-0024로부터 이관)
|
||||||
|
|
||||||
|
- **PE-level topology 일원화 (중장기)**: 현 설계는
|
||||||
|
- topology (rank graph 또는 level-separated)
|
||||||
|
- mapper (per-SIP PE set)
|
||||||
|
- `_build_pe_installs` (actual edges)
|
||||||
|
|
||||||
|
의 3단 분산. Hierarchical은 이 분산 구조에 가장 큰 부담을 주는 케이스. 중장기로는
|
||||||
|
`topologies.py`가 PE-level neighbor list를 직접 반환하고 mapper는 단순히
|
||||||
|
"어느 PE가 참여하느냐"만 결정, `_build_pe_installs`는 flat
|
||||||
|
mapping으로 단순화되는 방향이 자연스러움. **본 ADR에서 Option A를 채택**하면
|
||||||
|
이 방향으로 이미 정합.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. Topology generator
|
||||||
|
|
||||||
|
`tests/test_hierarchical_topology.py` (new):
|
||||||
|
- `hierarchical_3level(rank, world_size, spec)` → 각 level의 neighbor set이
|
||||||
|
예상 구조인지 (intra-cube는 ring, inter-cube는 cube-leader만 참여, inter-SIP은
|
||||||
|
SIP-leader만 참여)
|
||||||
|
- 2 SIP × 4 cubes × 4 PEs 같은 작은 토폴로지로 수작업 검증 가능
|
||||||
|
- Symmetry: PE p의 E neighbor가 q이면, q의 W neighbor는 다시 p를 가리킴 (N/S도 동일)
|
||||||
|
|
||||||
|
### T2. Install plan — hierarchical × all_pes
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (확장):
|
||||||
|
- `build_install_plans(algorithm="hierarchical_allreduce", mapper="all_pes",
|
||||||
|
validator="multi_pe_sip_local")` 호출 시
|
||||||
|
- 각 SIP의 모든 PE가 `participating_pes`에 포함
|
||||||
|
- PE 0 (cube leader)만 inter-cube neighbor를 가짐
|
||||||
|
- (cube 0, pe 0) (SIP leader)만 inter-SIP neighbor를 가짐
|
||||||
|
- Non-leader PE는 intra-cube neighbor만
|
||||||
|
|
||||||
|
### T3. Kernel unit — mock runtime
|
||||||
|
|
||||||
|
`tests/test_hierarchical_mock_runtime.py` (new):
|
||||||
|
- `run_kernel_in_mock` (kernbench.ccl.testing)을 확장해 multi-level 지원
|
||||||
|
- 2 SIP × 2 cubes × 4 PEs (총 16 PE) 토폴로지에서 초기 tile을 (global PE index + 1)로 채우고
|
||||||
|
hierarchical all-reduce 실행
|
||||||
|
- 모든 PE의 최종 결과가 `sum(1..16)`인지
|
||||||
|
|
||||||
|
### T4. E2E — 실제 SimPy backend
|
||||||
|
|
||||||
|
`tests/test_ccl_allreduce_matrix.py` (확장):
|
||||||
|
- `hierarchical @ ws=SIP_count`: multi_pe_sip_local layout + 3-level 알고리즘
|
||||||
|
전체 stack 통과 검증
|
||||||
|
|
||||||
|
### T5. Validator enforcement
|
||||||
|
|
||||||
|
- `multi_pe_sip_local` validator가 wrong layout (예: leader_only 스타일 1
|
||||||
|
shard per rank) 입력에 raise
|
||||||
|
|
||||||
|
### T6. 회귀
|
||||||
|
|
||||||
|
기존 ring/mesh/tree 알고리즘 모두 그대로 통과. 본 ADR은 그들을 건드리지 않음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **Intra-SIP PE 활용도 증가**: Inter-SIP 통신 중에도 intra-cube / inter-cube
|
||||||
|
reduce가 진행되어 전체 PE 가동률 향상.
|
||||||
|
- **Multi-level bandwidth 활용**: cube NoC, UCIe 모두 작동 → 더 정확한 HW 모델.
|
||||||
|
- **ADR-0024 framework 검증**: `all_pes` mapper + `multi_pe_sip_local`
|
||||||
|
validator의 첫 non-trivial use case. Framework 설계 타당성 확인.
|
||||||
|
- **기존 커널 재활용**: `hierarchical_allreduce.py` 큰 구조 유지, SIP-local
|
||||||
|
좌표만 재해석.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- **`topologies.py` schema 확장 필요**: Single-level vs multi-level 표현.
|
||||||
|
해결안(Option A)은 기존 ring/mesh/tree의 마이그레이션 비용 유발.
|
||||||
|
- **Validator / mapper 조합 요구**: 사용자가 DPPolicy를
|
||||||
|
`multi_pe_sip_local`에 맞춰 선택해야 함 (bench 설정 복잡도 증가).
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- 본 ADR 구현 전까지 `hierarchical_allreduce.py`는 deprecated 상태 유지 또는
|
||||||
|
ADR-0024 matrix test에서 제외. 현재 파일을 곧바로 삭제하지는 않음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/ccl/topologies.py` | D3: `hierarchical_3level` topology 함수 추가. (Option A 채택 시) 기존 topology 출력 format 통일 |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` | D4: hierarchical builder 분기 (또는 단일 builder가 level 개수로 dispatch) |
|
||||||
|
| `src/kernbench/ccl/algorithms/hierarchical_allreduce.py` | D5: SIP-local 좌표로 kernel 재작성, `kernel_args` keyword-only signature |
|
||||||
|
| `ccl.yaml` | D2: `hierarchical_allreduce` 엔트리 추가 (`mapper: all_pes`, `validator: multi_pe_sip_local`, `topology: hierarchical_3level`) |
|
||||||
|
| `tests/test_hierarchical_topology.py` (new) | T1 |
|
||||||
|
| `tests/test_ccl_install_plan.py` | T2 확장 |
|
||||||
|
| `tests/test_hierarchical_mock_runtime.py` (new) | T3 |
|
||||||
|
| `tests/test_ccl_allreduce_matrix.py` | T4: hierarchical row 추가 |
|
||||||
@@ -0,0 +1,347 @@
|
|||||||
|
# ADR-0030: IPCQ Physical Addressing — PhysAddr integration
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed (Blocked on ADR-0031 — PhysAddr PE-resource extension)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
IPCQ ring buffer의 주소 체계를 ADR-0023의 **synthetic parallel namespace**
|
||||||
|
(`_IPCQ_BASE = 1<<60`)에서 **ADR-0001의 PhysAddr**로 이관한다. Routing /
|
||||||
|
allocator / MemoryStore의 정합성을 회복하고, buffer_kind (tcm/hbm/sram)별
|
||||||
|
physical backing을 구조적 좌표로 표현한다.
|
||||||
|
|
||||||
|
### 현재 상태 (ADR-0023 D2.5)
|
||||||
|
|
||||||
|
`src/kernbench/ccl/install.py:52-56`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_IPCQ_BASE = 1 << 60
|
||||||
|
def _ipcq_base_for_pe(sip, cube, pe):
|
||||||
|
return _IPCQ_BASE | (sip << 40) | (cube << 32) | (pe << 24)
|
||||||
|
|
||||||
|
def rx_base(s, c, p, d):
|
||||||
|
return _ipcq_base_for_pe(s, c, p) + direction_idx[d] * bytes_per_direction
|
||||||
|
```
|
||||||
|
|
||||||
|
- **bit 60** 사용 → ADR-0001의 51-bit PhysAddr 공간 밖 (`MAX_51 = (1 << 51) - 1`)
|
||||||
|
- `PhysAddr.decode(addr)` → `PhysAddrError("addr must be a 51-bit value")`
|
||||||
|
- `IpcqEndpoint.rx_base_pa: int` — 타입이 raw int, 구조 없음
|
||||||
|
- `buffer_kind` (tcm/hbm/sram)와 synthetic 주소 사이에 coupling 없음
|
||||||
|
- Allocator (`PEMemAllocator`) 우회 — synthetic unique id per (sip, cube, pe,
|
||||||
|
direction). 진짜 physical allocation이 아님
|
||||||
|
|
||||||
|
ADR-0023 D2.5 원문:
|
||||||
|
|
||||||
|
> This bypasses the topology's address resolver / PhysAddr encoding and
|
||||||
|
> treats IPCQ buffers as a separate, parallel address namespace. Real PA
|
||||||
|
> encoding can be plugged in later without changing the rest of the design.
|
||||||
|
|
||||||
|
"later"가 이 ADR.
|
||||||
|
|
||||||
|
### 왜 지금 다루는가
|
||||||
|
|
||||||
|
- ADR-0025 (direction addressing)은 주소-기반 매칭으로 전환. 주소가 correctness에
|
||||||
|
직접 기여 → 주소 체계가 설계 관점에서 더 중요해짐
|
||||||
|
- ADR-0001의 "Routing consumes decoded domains, not raw bit-fields" 계약 위반
|
||||||
|
지속 → 기술 부채
|
||||||
|
- Routing fabric (cube_noc / UCIe)은 PhysAddr.decode()로 destination을 정함.
|
||||||
|
IPCQ의 synthetic 주소가 fabric routing에서 실제로 어떻게 처리되는지 **검증되지
|
||||||
|
않음** (별도 경로로 배달되는 것으로 추정)
|
||||||
|
- TCM / HBM / SRAM의 실제 memory layout과 IPCQ ring buffer 위치가 **disjoint**
|
||||||
|
→ allocator가 IPCQ 영역을 모르므로 실수로 겹칠 가능성 (현재는 bit 60으로 완전
|
||||||
|
분리되어 문제 없지만 설계 원칙상 건강하지 않음)
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **IPCQ ring buffer의 PhysAddr 표현**: buffer_kind별로 어떤 PhysAddr factory를
|
||||||
|
쓸지.
|
||||||
|
2. **PhysAddr 공간 부족 가능성**: 51-bit 공간에 IPCQ 버퍼를 담을 여유가 있는지.
|
||||||
|
3. **Allocator 통합**: `PEMemAllocator`에 IPCQ buffer 영역 예약 기능 추가, 또는
|
||||||
|
기존 pool에서 정상 allocation.
|
||||||
|
4. **MemoryStore space naming 정리**: 현재는 `{"tcm", "hbm", "sram"}` 문자열로
|
||||||
|
space 구분. IPCQ buffer도 이 space에 속하면 일반 data와 주소 겹침 방지 필요.
|
||||||
|
5. **Routing fabric 통합**: PhysAddr 기반 routing이 IPCQ 토큰을 올바른 SIP의
|
||||||
|
올바른 메모리로 배달.
|
||||||
|
6. **ADR-0025와의 정합**: 주소-기반 매칭이 PhysAddr에서도 동일하게 작동.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. IPCQ ring buffer = PhysAddr factory 사용
|
||||||
|
|
||||||
|
각 `buffer_kind`가 해당하는 PhysAddr factory를 호출:
|
||||||
|
|
||||||
|
| buffer_kind | PhysAddr factory | 필요한 인자 |
|
||||||
|
|---|---|---|
|
||||||
|
| `tcm` | `PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)` | PE-local TCM |
|
||||||
|
| `hbm` | `PhysAddr.pe_hbm_addr(rack_id, sip_id, cube_id, pe_id, pe_local_hbm_offset, slice_size_bytes)` | PE-local HBM slice |
|
||||||
|
| `sram` | `PhysAddr.cube_sram_addr(rack_id, sip_id, cube_id, sram_offset)` | Cube-shared SRAM |
|
||||||
|
|
||||||
|
Install plan builder (`build_install_plans` in ADR-0024)가 각 PE의 rx_base를
|
||||||
|
계산할 때:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ADR-0030 후 install_plan.py (pseudocode)
|
||||||
|
def _compute_rx_base(sip, cube, pe, direction_idx, buffer_kind, n_slots, slot_size,
|
||||||
|
allocator_pool, rack_id=0) -> PhysAddr:
|
||||||
|
bytes_per_direction = n_slots * slot_size
|
||||||
|
offset = direction_idx * bytes_per_direction
|
||||||
|
|
||||||
|
if buffer_kind == "tcm":
|
||||||
|
# TCM base (per-PE) + direction offset
|
||||||
|
tcm_base = allocator_pool.reserve_pe_tcm_for_ipcq(sip, cube, pe,
|
||||||
|
total_bytes=N_DIR * bytes_per_direction)
|
||||||
|
return PhysAddr.pe_tcm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
|
||||||
|
pe_id=pe, tcm_offset=tcm_base + offset)
|
||||||
|
elif buffer_kind == "hbm":
|
||||||
|
hbm_base = allocator_pool.reserve_pe_hbm_for_ipcq(sip, cube, pe,
|
||||||
|
total_bytes=...)
|
||||||
|
return PhysAddr.pe_hbm_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
|
||||||
|
pe_id=pe, pe_local_hbm_offset=hbm_base + offset,
|
||||||
|
slice_size_bytes=slice_size)
|
||||||
|
elif buffer_kind == "sram":
|
||||||
|
sram_base = allocator_pool.reserve_cube_sram_for_ipcq(sip, cube,
|
||||||
|
total_bytes=...)
|
||||||
|
return PhysAddr.cube_sram_addr(rack_id=rack_id, sip_id=sip, cube_id=cube,
|
||||||
|
sram_offset=sram_base + offset)
|
||||||
|
```
|
||||||
|
|
||||||
|
`IpcqEndpoint.rx_base_pa`는 `PhysAddr.encode()` 결과(encoded `int`)를 담도록 변경:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class IpcqEndpoint:
|
||||||
|
sip: int
|
||||||
|
cube: int
|
||||||
|
pe: int
|
||||||
|
buffer_kind: str
|
||||||
|
rx_base_pa: int # PhysAddr.encode() 결과 (51-bit)
|
||||||
|
rx_base_va: int
|
||||||
|
n_slots: int
|
||||||
|
slot_size: int
|
||||||
|
```
|
||||||
|
|
||||||
|
타입은 int 유지 (encoded form), 단 **반드시 PhysAddr.decode()로 복원 가능**한
|
||||||
|
값임을 invariant로 둔다. 호출자는 `PhysAddr.decode(rx_base_pa)`로
|
||||||
|
구조적 좌표 획득.
|
||||||
|
|
||||||
|
### D2. Allocator 확장 — IPCQ 예약 API
|
||||||
|
|
||||||
|
`PEMemAllocator`에 IPCQ 전용 예약 기능 추가:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class PEMemAllocator:
|
||||||
|
def reserve_ipcq_tcm(self, total_bytes: int) -> int:
|
||||||
|
"""Reserve TCM region for IPCQ ring buffers at this PE.
|
||||||
|
Returns tcm_offset (to be used in PhysAddr.pe_tcm_addr)."""
|
||||||
|
# TCM에서 `total_bytes` 연속 영역 예약.
|
||||||
|
# Tensor allocation과 겹치지 않도록.
|
||||||
|
|
||||||
|
def reserve_ipcq_hbm(self, total_bytes: int) -> int: ...
|
||||||
|
# cube-level allocator도 유사
|
||||||
|
```
|
||||||
|
|
||||||
|
Install plan 빌더가 각 PE allocator에서 예약. 예약 결과(offset)를 PhysAddr
|
||||||
|
factory에 전달.
|
||||||
|
|
||||||
|
**기존 `_ipcq_base_for_pe` / `_IPCQ_BASE` 제거**.
|
||||||
|
|
||||||
|
### D3. MemoryStore space 통합
|
||||||
|
|
||||||
|
현재 `MemoryStore`는 `{space_name: {addr: ndarray}}` 구조. IPCQ buffer는 일반
|
||||||
|
tensor 데이터와 같은 space (tcm/hbm/sram)를 공유하게 됨. 주소 유일성은 ADR-0001의
|
||||||
|
PhysAddr 계층 보장.
|
||||||
|
|
||||||
|
Backward compatibility: 기존 IPCQ address (synthetic)을 쓰는 code path는
|
||||||
|
**제거**하고, 모두 PhysAddr.encode() 결과만 사용. 이 자체는 API 변경이 아니라
|
||||||
|
값 변경.
|
||||||
|
|
||||||
|
### D4. Routing fabric 통합
|
||||||
|
|
||||||
|
IPCQ DMA write (`IpcqDmaToken`의 `src_addr → dst_addr`)이 PhysAddr encoding을
|
||||||
|
사용하므로 **routing fabric이 `PhysAddr.decode(dst_addr)`로 destination
|
||||||
|
SIP/cube/PE를 정확히 찾을 수 있음**. Fabric routing 로직 변경 없음 (기존에도
|
||||||
|
PhysAddr.decode를 쓰는 것으로 추정).
|
||||||
|
|
||||||
|
**검증 필요**: 현재 fabric이 bit 60 synthetic 주소를 어떻게 라우팅하는지 확인.
|
||||||
|
별도 경로가 있다면 제거, PhysAddr 경로로 통합.
|
||||||
|
|
||||||
|
### D5. ADR-0025와의 정합
|
||||||
|
|
||||||
|
ADR-0025의 주소-기반 매칭 (dst_addr로 direction 식별)은 PhysAddr.encode()
|
||||||
|
결과를 비교하는 것으로 자연스럽게 호환. 변경 없음.
|
||||||
|
|
||||||
|
다만 debug / diagnostic 향상 가능:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# pointer_dump 등에서
|
||||||
|
print(f"E: rx_base_pa={PhysAddr.decode(qp.peer.rx_base_pa)}")
|
||||||
|
# 출력 예: PhysAddr(sip=1, cube=0, pe=0, kind="pe_resource", unit_type=PE, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
이전 synthetic 주소는 decode 불가 → diagnostic 질 저하. PhysAddr 전환으로 개선.
|
||||||
|
|
||||||
|
### D6. ADR-0023 D2.5 amendment
|
||||||
|
|
||||||
|
ADR-0023의 "bypasses PhysAddr encoding" 문구를 **Accepted fallback → now
|
||||||
|
replaced by ADR-0030**으로 수정. 본 ADR이 적용되면 ADR-0023 D2.5의 "Real PA
|
||||||
|
encoding can be plugged in later" 약속이 이행된 것.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Migration strategy
|
||||||
|
|
||||||
|
단계적 전환 (한 PR로 하지 않는다):
|
||||||
|
|
||||||
|
### Phase 1: PhysAddr 공간 재검토
|
||||||
|
- 51-bit PhysAddr 공간에 IPCQ ring buffer가 실제로 들어갈 수 있는지 확인.
|
||||||
|
- 각 buffer_kind (tcm/hbm/sram)별 factory가 제공하는 `local_offset` 범위가
|
||||||
|
IPCQ 요구 (4 direction × n_slots × slot_size)를 수용 가능한지.
|
||||||
|
- 부족하면 PhysAddr layout 자체 확장 (ADR-0001 amendment 별도 필요).
|
||||||
|
|
||||||
|
### Phase 2: Allocator API 확장
|
||||||
|
- `PEMemAllocator.reserve_ipcq_*` 메소드 추가.
|
||||||
|
- 기존 tensor allocation과 영역 충돌 방지.
|
||||||
|
|
||||||
|
### Phase 3: Install plan builder 전환
|
||||||
|
- `_ipcq_base_for_pe` 제거, PhysAddr factory 호출로 대체.
|
||||||
|
- `IpcqEndpoint.rx_base_pa`가 PhysAddr.encode() 결과 (51-bit).
|
||||||
|
|
||||||
|
### Phase 4: Routing fabric 검증
|
||||||
|
- IPCQ DMA token이 fabric 정상 경로로 배달되는지 확인.
|
||||||
|
- 별도 fast-path가 있다면 제거, 통합.
|
||||||
|
|
||||||
|
### Phase 5: MemoryStore space 검증
|
||||||
|
- IPCQ buffer 주소가 기존 tensor 주소와 겹치지 않는지.
|
||||||
|
- Allocator 레벨에서 이미 예약했으므로 정상적으로 분리되어야 함.
|
||||||
|
|
||||||
|
### Phase 6: ADR-0023 D2.5 업데이트 + 기존 sideband path 제거 (완료)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0031** (PhysAddr PE-resource extension) — **Blocker**: PhysAddr가 PE
|
||||||
|
resource (특히 IPCQ ring buffer)를 충분히 표현할 수 있도록 schema 확장이
|
||||||
|
선행되어야 함. 본 ADR은 ADR-0031 완료 후에만 실행 가능.
|
||||||
|
- **ADR-0001** (PhysAddr layout): 본 ADR의 기반. 51-bit 공간 / factory API의
|
||||||
|
ADR-0031 확장본을 사용.
|
||||||
|
- **ADR-0023** (IPCQ protocol): 본 ADR은 ADR-0023 D2.5의 "later" 약속 이행.
|
||||||
|
D9 piggyback / credit return 프로토콜 자체는 불변.
|
||||||
|
- **ADR-0024** (launcher + install_plan.py): `build_install_plans`가 PhysAddr
|
||||||
|
factory를 호출하게 됨.
|
||||||
|
- **ADR-0025** (direction addressing): 주소-기반 매칭이 PhysAddr에서도 동일하게
|
||||||
|
작동. 변경 없음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **ADR-0001 PhysAddr layout 자체 변경**: 51-bit 공간과 segment 구조는 유지.
|
||||||
|
부족 시 별도 ADR.
|
||||||
|
- **IPCQ protocol semantic 변경**: ADR-0023 D9 piggyback 등 프로토콜 로직 유지.
|
||||||
|
- **Allocator 전반 재설계**: IPCQ 예약 API 추가만.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🔴 Critical — Migration 전 반드시 검증
|
||||||
|
|
||||||
|
- **PhysAddr 51-bit 공간에 IPCQ 버퍼가 실제로 들어가는가**: 각 PE의 TCM
|
||||||
|
영역에서 `4 direction × n_slots (default 4) × slot_size (default 4KB)` =
|
||||||
|
64KB가 PE TCM 공간에 수용 가능. TCM size (e.g., 16MB) 대비 충분. HBM도 여유
|
||||||
|
많음. SRAM은 cube 공유라 direction × PE 곱이 있음 — 별도 검증 필요.
|
||||||
|
- **Routing fabric의 현재 IPCQ 주소 처리**: 현재 synthetic 주소가 fabric에서
|
||||||
|
어떻게 routing되는지 trace 필요. `PhysAddr.decode()`로 판독 불가한 값이
|
||||||
|
fabric에서 정상 배달된다면 어떤 경로를 쓰는지 조사.
|
||||||
|
|
||||||
|
### 🟡 Nice-to-have
|
||||||
|
|
||||||
|
- **IPCQ 전용 kind / sub_offset 인코딩**: `UnitType.PE`의 sub_offset 공간을
|
||||||
|
IPCQ와 공유. 충돌 방지를 위해 IPCQ 전용 sub-space 정의할지 여부.
|
||||||
|
- **Debug tool**: `pointer_dump`를 PhysAddr 포매팅으로 개선.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test strategy
|
||||||
|
|
||||||
|
### T1. PhysAddr round-trip
|
||||||
|
|
||||||
|
`tests/test_ipcq_physaddr.py` (new):
|
||||||
|
- `PhysAddr.pe_tcm_addr(...)` → encode → decode → 동일 필드 복원
|
||||||
|
- TCM / HBM / SRAM 각 factory에 대해
|
||||||
|
|
||||||
|
### T2. Allocator 예약
|
||||||
|
|
||||||
|
`tests/test_ipcq_alloc.py` (new):
|
||||||
|
- `PEMemAllocator.reserve_ipcq_tcm` → 반환된 offset이 valid TCM 영역
|
||||||
|
- 중복 예약 → 에러 또는 non-overlapping offset
|
||||||
|
- Tensor allocation과 충돌 없음
|
||||||
|
|
||||||
|
### T3. Install plan PhysAddr integration
|
||||||
|
|
||||||
|
`tests/test_ccl_install_plan.py` (확장):
|
||||||
|
- `build_install_plans` 결과의 `rx_base_pa`가 PhysAddr.decode() 가능
|
||||||
|
- Decoded 좌표가 plan의 (sip, cube, pe)와 일치
|
||||||
|
- I3.1 invariant (ADR-0025 D6) — rx_base range disjointness가 PhysAddr에서도 성립
|
||||||
|
|
||||||
|
### T4. Routing — IPCQ DMA fabric traversal
|
||||||
|
|
||||||
|
`tests/test_ipcq_routing.py` (new):
|
||||||
|
- Cross-SIP IPCQ send → fabric이 `PhysAddr.decode(dst_addr)`로 destination SIP
|
||||||
|
정확히 판단 → 올바른 MemoryStore에 write
|
||||||
|
- UCIe 경로 / cube_noc 경로 모두 검증
|
||||||
|
|
||||||
|
### T5. 회귀
|
||||||
|
|
||||||
|
- 기존 IPCQ E2E 테스트 (ring, mesh, tree) 모두 통과
|
||||||
|
- ADR-0024, ADR-0025 통합 테스트 통과
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- **ADR-0001 정합성 회복**: routing과 addressing이 단일 체계.
|
||||||
|
- **buffer_kind 명확**: TCM/HBM/SRAM이 구조적 좌표로 구분.
|
||||||
|
- **Debug 향상**: PhysAddr.decode()로 사람이 읽을 수 있는 좌표.
|
||||||
|
- **Allocator 통합**: IPCQ 영역이 정상 예약 → tensor와의 충돌 리스크 사전 차단.
|
||||||
|
- **Fabric routing 일원화**: 별도 경로 없이 기존 PhysAddr-based routing 재활용.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
- **Migration 복잡도**: 6 Phase 단계적 전환 필요. 각 Phase마다 regression 리스크.
|
||||||
|
- **PhysAddr 공간 검증 부담**: Phase 1에서 TCM/HBM/SRAM 공간이 IPCQ 요구를
|
||||||
|
수용하는지 실측 필요.
|
||||||
|
- **Routing fabric 검증**: 현재 fabric이 synthetic 주소를 어떻게 처리하는지
|
||||||
|
조사 필요.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
- IPCQ protocol semantic (ADR-0023 D9 등) 불변.
|
||||||
|
- ADR-0025의 direction addressing 로직 불변.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/ccl/install.py` | `_IPCQ_BASE`, `_ipcq_base_for_pe` 제거 |
|
||||||
|
| `src/kernbench/ccl/install_plan.py` (ADR-0024) | D1: PhysAddr factory 호출로 rx_base 계산 |
|
||||||
|
| `src/kernbench/policy/address/allocator.py` (or similar) | D2: IPCQ 예약 API (`reserve_ipcq_tcm` 등) |
|
||||||
|
| `src/kernbench/common/ipcq_types.py` | D1: `IpcqEndpoint.rx_base_pa` 문서화 — PhysAddr.encode 결과 |
|
||||||
|
| `src/kernbench/sim_engine/memory_store.py` | D3: IPCQ buffer가 기존 space와 공유되는지 검증 |
|
||||||
|
| `src/kernbench/sim_engine/engine.py` | D4: IPCQ token routing이 PhysAddr-based fabric 경로 사용 |
|
||||||
|
| `src/kernbench/ccl/diagnostics.py` | D5: pointer_dump를 PhysAddr 포매팅으로 개선 |
|
||||||
|
| `docs/adr/ADR-0023-ipcq-pe-collective.md` | D6: D2.5 amendment note |
|
||||||
|
| `tests/test_ipcq_physaddr.py` (new) | T1 |
|
||||||
|
| `tests/test_ipcq_alloc.py` (new) | T2 |
|
||||||
|
| `tests/test_ccl_install_plan.py` | T3 확장 |
|
||||||
|
| `tests/test_ipcq_routing.py` (new) | T4 |
|
||||||
@@ -0,0 +1,257 @@
|
|||||||
|
# ADR-0031: PhysAddr PE-Resource Extension
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Stub (Blocker for ADR-0030 — specific range allocations TBD)
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
### 목표
|
||||||
|
|
||||||
|
ADR-0001의 `PhysAddr` schema를 **PE 내부의 다양한 resource**를 체계적으로
|
||||||
|
표현할 수 있도록 확장한다. ADR-0030 (IPCQ PhysAddr integration) 및 향후의
|
||||||
|
PE-local resource 추가 (scratchpad, register file, status register, 등)의
|
||||||
|
기반을 제공한다.
|
||||||
|
|
||||||
|
### 현재 상태 (ADR-0001)
|
||||||
|
|
||||||
|
51-bit PhysAddr layout:
|
||||||
|
|
||||||
|
```
|
||||||
|
[50:47] rack_id (4)
|
||||||
|
[46:43] sip_id (4)
|
||||||
|
[42:38] sip_seg (5) # cube_id
|
||||||
|
[37:0] local_offset (38)
|
||||||
|
```
|
||||||
|
|
||||||
|
`local_offset` (38 bits) 내부:
|
||||||
|
|
||||||
|
- `[37]` selector: 1 = HBM window (128GB), 0 = PE resource window
|
||||||
|
- PE resource window는 `unit_type` (3 bits: PE | MCPU | SRAM) +
|
||||||
|
`pe_id` (4 bits) + `ext` (1 bit) + `sub_offset` (29 bits)
|
||||||
|
|
||||||
|
Factory API:
|
||||||
|
- `PhysAddr.hbm_addr(...)` — HBM generic
|
||||||
|
- `PhysAddr.pe_hbm_addr(...)` — PE-local HBM slice
|
||||||
|
- `PhysAddr.pe_tcm_addr(...)` — PE TCM (via `UnitType.PE` + `sub_offset`)
|
||||||
|
- `PhysAddr.cube_sram_addr(...)` — Cube-shared SRAM
|
||||||
|
|
||||||
|
### 풀어야 할 문제
|
||||||
|
|
||||||
|
1. **PE 내부 resource 구분의 명시적 체계 부재**: 현재 `local_offset` (38 bits)
|
||||||
|
이 평면 공간으로 취급되고, PE TCM / IPCQ ring / scratchpad / 향후 register
|
||||||
|
file 등이 관습적 offset 범위로만 구분됨. Schema 레벨에서 명확하지 않음.
|
||||||
|
2. **IPCQ 주소의 PhysAddr 표현 부재**: ADR-0030이 IPCQ ring buffer를 PhysAddr로
|
||||||
|
표현하려면 "이 주소가 IPCQ 영역"을 decode 가능해야 함. 현재는 불가.
|
||||||
|
3. **향후 PE resource 확장 경로**: register file, performance counter 등
|
||||||
|
추가 시 일관된 위치 할당 규칙 필요.
|
||||||
|
|
||||||
|
### 설계 방향 — local_offset을 PE 컴포넌트별 range로 분할
|
||||||
|
|
||||||
|
`local_offset` (38 bits = cube segment(`sip_seg`)당 256GB)을 **PE 컴포넌트마다 고정
|
||||||
|
range**로 나누어 할당한다. 각 range는 해당 컴포넌트 전용 주소 공간이며,
|
||||||
|
`PhysAddr.decode()`가 주소가 어느 range에 속하는지 판별해 해당하는 `kind` /
|
||||||
|
`unit_type` / `sub_type` 필드를 채운다.
|
||||||
|
|
||||||
|
개념적 구조 (구체적 bit 할당은 **TBD**):
|
||||||
|
|
||||||
|
```
|
||||||
|
local_offset [37:0] (38 bits total)
|
||||||
|
├── HBM window [37] = 1 (기존 128GB)
|
||||||
|
├── PE component ranges [37] = 0
|
||||||
|
│ ├── TCM [range_1]
|
||||||
|
│ ├── IPCQ rings [range_2]
|
||||||
|
│ ├── Scratchpad [range_3]
|
||||||
|
│ ├── Register file [range_4]
|
||||||
|
│ ├── (reserved) ...
|
||||||
|
│ └── Sideband / status [range_N]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 왜 range-based partition인가
|
||||||
|
|
||||||
|
- **Schema-level 명시성**: 주소 하나 보고 어느 컴포넌트의 자원인지 decode 가능.
|
||||||
|
"Routing consumes decoded domains" (ADR-0001 D5) 계약 충족.
|
||||||
|
- **Unit type enum 확장보다 유연**: 3-bit `UnitType` 공간을 고갈시키지 않고
|
||||||
|
세분화 가능. 미래 추가 컴포넌트도 빈 range 할당.
|
||||||
|
- **Allocator 통합 자연**: 각 PE-level allocator가 관리하는 하위 pool을
|
||||||
|
address range와 1:1 매칭 (e.g., `reserve_ipcq_tcm()` → IPCQ range 안에서만
|
||||||
|
할당).
|
||||||
|
- **Decode routing 단순**: `PhysAddr.decode(addr)`가 range table을 참조해
|
||||||
|
`kind` + sub-field를 채움. 기존 HBM selector bit 패턴의 일반화.
|
||||||
|
|
||||||
|
### 왜 지금 다루는가
|
||||||
|
|
||||||
|
- ADR-0030 (IPCQ PhysAddr 통합)이 이 확장에 **의존**. ADR-0030 단독 진행 시
|
||||||
|
`sub_offset` 공간을 불투명하게 재사용하게 되어 ADR-0001 계약 미충족.
|
||||||
|
- PE 내부 자원이 더 추가될 가능성 — 지금 구조를 정리해두면 일관된 확장 경로 확보.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision (pending specific range allocation)
|
||||||
|
|
||||||
|
### D1. Range-based local_offset partition — approach
|
||||||
|
|
||||||
|
`local_offset`을 고정 byte range로 분할하고, 각 range를 PE 컴포넌트에 할당한다.
|
||||||
|
주소의 어느 range에 속하는가로 `kind` / component type을 결정.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/kernbench/policy/address/phyaddr.py (conceptual, post-extension)
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PeResourceRange:
|
||||||
|
name: str # e.g. "tcm", "ipcq", "scratchpad", "regfile"
|
||||||
|
start_offset: int # local_offset 내 시작
|
||||||
|
end_offset: int # exclusive
|
||||||
|
byte_size: int # end - start
|
||||||
|
|
||||||
|
PE_RESOURCE_MAP: tuple[PeResourceRange, ...] = (
|
||||||
|
# TBD — 구체적 range 할당은 사용자가 별도 업데이트
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
`PhysAddr.decode(addr)`의 PE resource 경로는:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def decode_pe_resource(local_offset: int) -> dict:
|
||||||
|
for r in PE_RESOURCE_MAP:
|
||||||
|
if r.start_offset <= local_offset < r.end_offset:
|
||||||
|
return {
|
||||||
|
"kind": "pe_resource",
|
||||||
|
"component": r.name, # NEW: "tcm"/"ipcq"/...
|
||||||
|
"component_offset": local_offset - r.start_offset, # within range
|
||||||
|
}
|
||||||
|
raise PhysAddrError(f"local_offset {local_offset} not in any PE range")
|
||||||
|
```
|
||||||
|
|
||||||
|
### D2. Specific range allocations — **TBD**
|
||||||
|
|
||||||
|
> 사용자가 구체적 byte 할당을 별도로 정의한 뒤 본 ADR에 업데이트.
|
||||||
|
>
|
||||||
|
> 필요 정보:
|
||||||
|
> - 각 컴포넌트 (TCM, IPCQ, scratchpad, regfile, ...)의 이름 / byte size
|
||||||
|
> - `local_offset` 내 시작 offset (align 고려)
|
||||||
|
> - 현재 하드웨어 사양 / 시뮬레이션 요구 반영
|
||||||
|
|
||||||
|
이 섹션이 채워진 뒤 ADR status: **Stub → Proposed → Accepted** 승격.
|
||||||
|
|
||||||
|
### D3. Factory API — per-component 함수
|
||||||
|
|
||||||
|
기존 `PhysAddr.pe_tcm_addr(...)` 패턴을 일반화:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 기존 (이미 존재)
|
||||||
|
PhysAddr.pe_tcm_addr(rack_id, sip_id, cube_id, pe_id, tcm_offset)
|
||||||
|
|
||||||
|
# 신규 (ADR-0031 후 추가)
|
||||||
|
PhysAddr.pe_ipcq_addr(rack_id, sip_id, cube_id, pe_id, ipcq_offset)
|
||||||
|
PhysAddr.pe_scratchpad_addr(...)
|
||||||
|
PhysAddr.pe_regfile_addr(...)
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
각 factory는 해당 컴포넌트의 range 내에서 `component_offset`만 받아 최종
|
||||||
|
PhysAddr encoding. 호출자는 어느 range인지 몰라도 됨.
|
||||||
|
|
||||||
|
### D4. Backward compatibility
|
||||||
|
|
||||||
|
- 기존 `pe_tcm_addr()` signature / semantic 유지.
|
||||||
|
- 내부 인코딩만 신규 range table을 참조하도록 변경.
|
||||||
|
- 기존 `UnitType.PE` decoding 경로는 `PE_RESOURCE_MAP`에서 "tcm" range를
|
||||||
|
대응하도록 매핑 → 기존 코드 transparent.
|
||||||
|
- 기존 코드가 `PhysAddr.decode(addr).unit_type == UnitType.PE`를 체크하는
|
||||||
|
경우는 여전히 유효 (TCM 주소는 계속 PE unit_type).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
### 🔴 Pending user input (ADR 승격 blocker)
|
||||||
|
|
||||||
|
- **D2의 specific range allocation**: 사용자가 구체적 byte 할당 테이블을
|
||||||
|
제공해야 Stub → Proposed 승격 가능. 필요 정보:
|
||||||
|
- 컴포넌트 목록 (TCM, IPCQ, scratchpad, regfile 등)
|
||||||
|
- 각 컴포넌트의 byte size / 시작 offset
|
||||||
|
- Alignment 요구사항 (4KB / page-aligned 등)
|
||||||
|
|
||||||
|
### 🟡 설계 세부 — range allocation 결정 과정에서 함께 결정
|
||||||
|
|
||||||
|
- **총 local_offset space 배분**: HBM window (bit 37 = 1, 128GB)을 유지할지,
|
||||||
|
아니면 PE resource space를 확장하기 위해 HBM window 축소할지.
|
||||||
|
- **Range padding / reserved space**: 미래 컴포넌트 추가를 위한 "reserved"
|
||||||
|
range 몇 개를 미리 확보할지.
|
||||||
|
- **Address alignment**: 각 range의 시작 offset이 특정 alignment (page /
|
||||||
|
cache line) 만족해야 하는지.
|
||||||
|
- **Diagnostic / debug 포맷**: `PhysAddr.decode()` 출력에서 component 이름 +
|
||||||
|
component_offset을 사람이 읽기 좋게 표시 (e.g., "IPCQ ring sip=0 cube=0 pe=3
|
||||||
|
offset=0x1234").
|
||||||
|
- **기존 `UnitType` enum의 role**: Range-based 접근 후에도 `unit_type` 필드
|
||||||
|
유지할지 (decode 결과에 `component` 추가), 또는 enum 대체할지.
|
||||||
|
|
||||||
|
### 🟢 ADR-0030 연동 질문
|
||||||
|
|
||||||
|
- **IPCQ range 내 direction/slot 표현**: PhysAddr는 `component_offset` 단위
|
||||||
|
까지만 표현. "direction=E, slot=2"는 IPCQ range 내 offset 계산으로 도출
|
||||||
|
(`direction_idx * slot_region_size + slot_idx * slot_size`) — 이 공식은
|
||||||
|
ADR-0030 scope에서 구체화.
|
||||||
|
- **Allocator pool 구조**: `PEMemAllocator`가 여러 range (TCM, IPCQ,
|
||||||
|
scratchpad)를 개별 pool로 관리할지, 단일 pool에서 kind별 reserved만 관리
|
||||||
|
할지. Range-based schema면 개별 pool이 자연스러움.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Non-goals (this ADR)
|
||||||
|
|
||||||
|
- **51-bit 전체 layout 재작성**: 본 ADR은 `local_offset` (38 bits) 내부의
|
||||||
|
subdivision만 다룬다. Rack / SIP / cube segment 같은 상위 bit 구조는
|
||||||
|
불변.
|
||||||
|
- **`UnitType` enum 재설계**: range-based 접근으로 대체 가능하지만, 기존 enum
|
||||||
|
(PE / MCPU / SRAM)은 backward compat 위해 유지.
|
||||||
|
- **Dynamic range allocation**: runtime에 range 크기 바꾸는 기능 불필요. 모든
|
||||||
|
range는 컴파일 / 설정 시점에 고정.
|
||||||
|
- **Multi-process / multi-rack partitioning**: PE 내부 resource만 다룸.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Action
|
||||||
|
|
||||||
|
### Phase 1 — User 입력: specific range allocation (**Blocker**)
|
||||||
|
- 사용자가 정의한 PE 컴포넌트별 byte range를 D2에 기입:
|
||||||
|
- `PE_RESOURCE_MAP` 테이블 내용 (name, start_offset, byte_size per 컴포넌트)
|
||||||
|
- 각 컴포넌트의 hardware spec 근거 note
|
||||||
|
|
||||||
|
### Phase 2 — ADR Stub → Proposed 승격
|
||||||
|
- D2 채워지면 status 변경.
|
||||||
|
- Open questions의 "🔴 Pending user input" 블록 제거.
|
||||||
|
- ADR-0001에 amendment note 초안 작성.
|
||||||
|
|
||||||
|
### Phase 3 — 구현
|
||||||
|
- `PhysAddr` range-based decode 구현.
|
||||||
|
- 신규 factory 함수 (`pe_ipcq_addr`, `pe_scratchpad_addr` 등 컴포넌트별)
|
||||||
|
추가.
|
||||||
|
- 기존 `pe_tcm_addr` 내부 인코딩만 신규 range table 참조하도록 수정
|
||||||
|
(signature 불변).
|
||||||
|
- 기존 코드 경로 회귀 확인.
|
||||||
|
|
||||||
|
### Phase 4 — ADR-0030 unblock
|
||||||
|
- ADR-0030 "Blocked" 상태 해제.
|
||||||
|
- Install_plan builder가 `pe_ipcq_addr(...)` 등 확장된 factory 호출하도록
|
||||||
|
수정.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
- **ADR-0001** (PhysAddr layout): 본 ADR은 ADR-0001의 확장.
|
||||||
|
- **ADR-0023** (IPCQ protocol): IPCQ ring buffer의 주소 체계를 PhysAddr로
|
||||||
|
통합할 수 있게 하는 기반.
|
||||||
|
- **ADR-0030** (IPCQ PhysAddr integration): 본 ADR에 blocked.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Affected files (future, after promotion to Proposed)
|
||||||
|
|
||||||
|
| File | Change |
|
||||||
|
|------|--------|
|
||||||
|
| `src/kernbench/policy/address/phyaddr.py` | Range table (`PE_RESOURCE_MAP`), range-based decode, 신규 component-specific factory들 (`pe_ipcq_addr` 등), 기존 `pe_tcm_addr` 내부 인코딩 갱신 |
|
||||||
|
| `src/kernbench/policy/address/allocator.py` | Range-aware pool 분리 (TCM pool / IPCQ pool / scratchpad pool 등 per-PE) |
|
||||||
|
| `docs/adr/ADR-0001-physaddr-layout.md` | Amendment note: range-based PE resource partition |
|
||||||
|
| `tests/test_phyaddr.py` | Range table 검증, 각 factory의 encode/decode round-trip, 기존 `pe_tcm_addr` 회귀 |
|
||||||
@@ -129,8 +129,8 @@ N_ELEM = 8
|
|||||||
def worker(rank: int, world_size: int, torch) -> None:
|
def worker(rank: int, world_size: int, torch) -> None:
|
||||||
"""Per-rank business logic — mirrors a real PyTorch DDP worker."""
|
"""Per-rank business logic — mirrors a real PyTorch DDP worker."""
|
||||||
dp = DPPolicy(
|
dp = DPPolicy(
|
||||||
sip="replicate", cube="replicate", pe="column_wise",
|
cube="replicate", pe="column_wise",
|
||||||
num_sips=1, num_cubes=1, num_pes=world_size,
|
num_cubes=1, num_pes=world_size,
|
||||||
)
|
)
|
||||||
tensor = torch.zeros(
|
tensor = torch.zeros(
|
||||||
(1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in",
|
(1, world_size * N_ELEM), dtype="f16", dp=dp, name="hello_in",
|
||||||
|
|||||||
@@ -114,8 +114,8 @@ def run(torch):
|
|||||||
a = torch.zeros(
|
a = torch.zeros(
|
||||||
(1, WORLD_SIZE * N_ELEM), dtype="f16",
|
(1, WORLD_SIZE * N_ELEM), dtype="f16",
|
||||||
dp=DPPolicy(
|
dp=DPPolicy(
|
||||||
sip="replicate", cube="replicate", pe="column_wise",
|
cube="replicate", pe="column_wise",
|
||||||
num_sips=1, num_cubes=1,
|
num_cubes=1,
|
||||||
),
|
),
|
||||||
name="hello_in",
|
name="hello_in",
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,29 +0,0 @@
|
|||||||
"""Hello-world CCL kernel for the docs/ccl-author-guide.md walkthrough.
|
|
||||||
|
|
||||||
Each PE sends its tile to the E neighbor and receives one tile from W,
|
|
||||||
then stores the received tile back into its own HBM slice. The simplest
|
|
||||||
possible demonstration of ``tl.send`` / ``tl.recv``.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Build the positional argument tuple passed to ``kernel``.

    Args:
        world_size: Accepted for signature parity; not used here.
        n_elem: Number of elements per tile.

    Returns:
        A one-element tuple ``(n_elem,)``.
    """
    positional = (n_elem,)
    return positional
|
|
||||||
|
|
||||||
|
|
||||||
def kernel(t_ptr, n_elem, tl):
    """Send this program's tile E, receive one tile from W, and store it.

    Args:
        t_ptr: Base address of the tensor; each rank's tile sits at
            ``t_ptr + rank * n_elem * 2``.
        n_elem: Elements per tile (loaded and received as ``"f16"``).
        tl: Kernel context providing ``program_id``, ``num_programs``,
            ``load``, ``send``, ``recv`` and ``store``.
    """
    # Flat rank = cube index * programs-per-cube + index within the cube.
    lane = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    lanes_per_cube = tl.num_programs(axis=0)
    flat_rank = cube * lanes_per_cube + lane

    # Two bytes per element, matching the "f16" dtype used below.
    tile_bytes = n_elem * 2
    tile_addr = t_ptr + flat_rank * tile_bytes

    # Push our own tile to the E neighbour.
    outgoing = tl.load(tile_addr, shape=(n_elem,), dtype="f16")
    tl.send(dir="E", src=outgoing)

    # Pull the W neighbour's tile and overwrite our slice with it.
    incoming = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
    tl.store(tile_addr, incoming)
|
|
||||||
@@ -1,192 +0,0 @@
|
|||||||
"""Hierarchical all-reduce kernel (ADR-0023).
|
|
||||||
|
|
||||||
3-level reduce + broadcast exploiting the topology hierarchy:
|
|
||||||
|
|
||||||
Level 1 — Intra-cube (8 PEs, E/W, fastest link):
|
|
||||||
Bidirectional ring reduce to PE 0.
|
|
||||||
Level 2 — Inter-cube within SIP (16 cubes, N/S, UCIe):
|
|
||||||
Bidirectional ring reduce of PE 0s to cube 0 PE 0.
|
|
||||||
Level 3 — Inter-SIP (2 SIPs, parent):
|
|
||||||
Pair exchange between SIP representatives.
|
|
||||||
Broadcast — Reverse chain through levels 2 and 1.
|
|
||||||
|
|
||||||
Bidirectional reduce: left-half sends toward node 0 via dir_dec,
|
|
||||||
right-half sends via dir_inc (wrapping). Representative receives from
|
|
||||||
both sides. Rounds per level = ceil((group_size - 1) / 2).
|
|
||||||
|
|
||||||
Direction pairing (ring):
|
|
||||||
Send via dir_dec at PE K → recv via dir_inc at PE K-1
|
|
||||||
Send via dir_inc at PE K → recv via dir_dec at PE K+1
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Derive the positional kernel arguments from the world size.

    Assumes 8 PEs per cube and treats every full 128 ranks beyond the
    first 128 as an additional SIP.

    Args:
        world_size: Total number of ranks.
        n_elem: Elements per tile.

    Returns:
        ``(n_elem, pes_per_cube, cubes_per_sip, num_sips)``.
    """
    pes_per_cube = 8
    if world_size > 128:
        num_sips = max(1, world_size // 128)
    else:
        num_sips = 1
    ranks_per_cube_row = pes_per_cube * num_sips
    cubes_per_sip = world_size // ranks_per_cube_row
    return (n_elem, pes_per_cube, cubes_per_sip, num_sips)
|
|
||||||
|
|
||||||
|
|
||||||
def neighbors(rank: int, world_size: int, neighbor_map: dict) -> dict:
    """Return the direction -> peer-rank map for ``rank`` at all three levels.

    Args:
        rank: Flat rank of the PE being wired.
        world_size: Total number of ranks.
        neighbor_map: Accepted for signature parity; not read here.

    Returns:
        A dict that always has ``"E"``/``"W"`` (intra-cube ring), adds
        ``"N"``/``"S"`` for PE 0 of each cube when a SIP has more than one
        cube, and adds ``"parent"`` for PE 0 of cube 0 when there is more
        than one SIP.
    """
    pes_per_cube = 8
    if world_size > 128:
        num_sips = max(1, world_size // 128)
    else:
        num_sips = 1
    cubes_per_sip = world_size // (pes_per_cube * num_sips)

    # Decompose the flat rank into (sip, cube-within-sip, pe-within-cube).
    cube_global, pe_id = divmod(rank, pes_per_cube)
    sip_id, local_cube_id = divmod(cube_global, cubes_per_sip)
    ranks_per_sip = cubes_per_sip * pes_per_cube

    # Level 1: every PE sits on its cube's E/W ring.
    first_pe_of_cube = cube_global * pes_per_cube
    peers = {
        "E": first_pe_of_cube + (pe_id + 1) % pes_per_cube,
        "W": first_pe_of_cube + (pe_id - 1) % pes_per_cube,
    }

    is_cube_rep = pe_id == 0

    # Level 2: cube representatives form an N/S ring inside their SIP.
    if is_cube_rep and cubes_per_sip > 1:
        first_rank_of_sip = sip_id * ranks_per_sip
        up = (local_cube_id + 1) % cubes_per_sip
        down = (local_cube_id - 1) % cubes_per_sip
        peers["N"] = first_rank_of_sip + up * pes_per_cube
        peers["S"] = first_rank_of_sip + down * pes_per_cube

    # Level 3: SIP representatives point at the next SIP's representative.
    if is_cube_rep and local_cube_id == 0 and num_sips > 1:
        peers["parent"] = ((sip_id + 1) % num_sips) * ranks_per_sip

    return peers
|
|
||||||
|
|
||||||
|
|
||||||
def _bidir_reduce(tl, acc, my_id, group_size, dir_inc, dir_dec, shape, dtype):
|
|
||||||
"""Bidirectional ring reduce to node 0.
|
|
||||||
|
|
||||||
Left half (1..half): chain reduces via dir_dec (toward lower IDs).
|
|
||||||
Each PE recvs from higher PE (via dir_inc) and sends to lower (via dir_dec).
|
|
||||||
Right half (half+1..N-1): chain reduces via dir_inc (wraps to node 0).
|
|
||||||
Each PE recvs from lower PE (via dir_dec) and sends to higher (via dir_inc).
|
|
||||||
Node 0: recvs left sum via dir_inc, right sum via dir_dec.
|
|
||||||
|
|
||||||
Direction pairing: send dir_dec at K → recv dir_inc at K-1.
|
|
||||||
send dir_inc at K → recv dir_dec at K+1.
|
|
||||||
"""
|
|
||||||
if group_size <= 1:
|
|
||||||
return acc
|
|
||||||
|
|
||||||
half = group_size // 2
|
|
||||||
|
|
||||||
if my_id == 0:
|
|
||||||
# Representative: recv left-half sum via dir_inc (from PE 1)
|
|
||||||
recv = tl.recv(dir=dir_inc, shape=shape, dtype=dtype)
|
|
||||||
acc = acc + recv
|
|
||||||
# Recv right-half sum via dir_dec (from PE N-1, wrapped)
|
|
||||||
if group_size - half - 1 >= 1:
|
|
||||||
recv = tl.recv(dir=dir_dec, shape=shape, dtype=dtype)
|
|
||||||
acc = acc + recv
|
|
||||||
|
|
||||||
elif my_id <= half:
|
|
||||||
# Left half: recv from PE my_id+1 via dir_inc, send to PE my_id-1 via dir_dec
|
|
||||||
if my_id < half: # not the far-edge
|
|
||||||
recv = tl.recv(dir=dir_inc, shape=shape, dtype=dtype)
|
|
||||||
acc = acc + recv
|
|
||||||
tl.send(dir=dir_dec, src=acc)
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Right half: recv from PE my_id-1 via dir_dec, send to PE my_id+1 via dir_inc
|
|
||||||
if my_id > half + 1: # not the near-edge
|
|
||||||
recv = tl.recv(dir=dir_dec, shape=shape, dtype=dtype)
|
|
||||||
acc = acc + recv
|
|
||||||
tl.send(dir=dir_inc, src=acc)
|
|
||||||
|
|
||||||
return acc
|
|
||||||
|
|
||||||
|
|
||||||
def _chain_broadcast(tl, acc, my_id, group_size, dir_inc, shape, dtype):
|
|
||||||
"""Linear chain broadcast from node 0 via dir_inc.
|
|
||||||
|
|
||||||
Node 0 sends via dir_inc → node 1. Node 1 recvs via dir_dec (implicit
|
|
||||||
from the ring pairing), stores, sends via dir_inc → node 2. Etc.
|
|
||||||
|
|
||||||
Recv direction = the opposite: send dir_inc at K → recv dir_dec at K+1.
|
|
||||||
"""
|
|
||||||
if group_size <= 1:
|
|
||||||
return acc
|
|
||||||
|
|
||||||
# In ring pairing: send via dir_inc at K → recv via dir_dec at K+1.
|
|
||||||
# dir_dec is the "other" direction. We infer it from the ring:
|
|
||||||
# if dir_inc is "E", peer recvs via "W"; if "N", peer recvs via "S".
|
|
||||||
_recv_dir = {"E": "W", "W": "E", "N": "S", "S": "N"}.get(dir_inc, dir_inc)
|
|
||||||
|
|
||||||
if my_id == 0:
|
|
||||||
tl.send(dir=dir_inc, src=acc)
|
|
||||||
else:
|
|
||||||
acc = tl.recv(dir=_recv_dir, shape=shape, dtype=dtype)
|
|
||||||
if my_id < group_size - 1:
|
|
||||||
tl.send(dir=dir_inc, src=acc)
|
|
||||||
return acc
|
|
||||||
|
|
||||||
|
|
||||||
def kernel(t_ptr, n_elem, pes_per_cube, cubes_per_sip, num_sips, tl):
    """Run the hierarchical all-reduce for this program's tile.

    Phases, in order: intra-cube reduce onto PE 0 (E/W), inter-cube reduce
    onto cube 0 (N/S, PE 0 only), pairwise exchange between SIP
    representatives (``"parent"``), then chain broadcasts back through the
    cube level and the PE level. The result is stored over the input tile.

    Args:
        t_ptr: HBM base address (column-sharded VA).
        n_elem: f16 elements per tile.
        pes_per_cube: PEs per cube (typically 8).
        cubes_per_sip: Cubes per SIP (typically 16).
        num_sips: Number of SIPs (typically 2).
        tl: TLContext (auto-injected).
    """
    dtype = "f16"
    shape = (n_elem,)

    pe_id = tl.program_id(axis=0)
    cube_global = tl.program_id(axis=1)
    sip_id = cube_global // cubes_per_sip  # computed as in the original; not read below
    local_cube_id = cube_global % cubes_per_sip

    # Each rank owns a contiguous tile of n_elem two-byte elements.
    flat_rank = cube_global * pes_per_cube + pe_id
    tile_addr = t_ptr + flat_rank * (n_elem * 2)

    acc = tl.load(tile_addr, shape=shape, dtype=dtype)

    # Reduce, level 1: every PE in the cube funnels into PE 0.
    acc = _bidir_reduce(
        tl, acc, my_id=pe_id, group_size=pes_per_cube,
        dir_inc="E", dir_dec="W", shape=shape, dtype=dtype,
    )

    # Reduce, level 2: cube representatives funnel into cube 0.
    takes_part_in_cube_ring = pe_id == 0 and cubes_per_sip > 1
    if takes_part_in_cube_ring:
        acc = _bidir_reduce(
            tl, acc, my_id=local_cube_id, group_size=cubes_per_sip,
            dir_inc="N", dir_dec="S", shape=shape, dtype=dtype,
        )

    # Reduce, level 3: SIP representatives swap and add partial sums.
    if pe_id == 0 and local_cube_id == 0 and num_sips > 1:
        tl.send(dir="parent", src=acc)
        acc = acc + tl.recv(dir="parent", shape=shape, dtype=dtype)

    # Broadcast, level 2: cube 0's representative feeds the other cubes.
    if takes_part_in_cube_ring:
        acc = _chain_broadcast(
            tl, acc, my_id=local_cube_id, group_size=cubes_per_sip,
            dir_inc="N", shape=shape, dtype=dtype,
        )

    # Broadcast, level 1: PE 0 feeds the rest of its cube.
    acc = _chain_broadcast(
        tl, acc, my_id=pe_id, group_size=pes_per_cube,
        dir_inc="E", shape=shape, dtype=dtype,
    )

    tl.store(tile_addr, acc)
|
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
"""Intercube all-reduce kernel (pe0-only, same-lane across cubes).
|
||||||
|
|
||||||
|
Reduces across the 4×4 cube mesh within each SIP, then exchanges
|
||||||
|
between SIPs using the configured SIP topology, and broadcasts back.
|
||||||
|
|
||||||
|
Supported SIP topologies (selected via ``sip_topo_kind``):
|
||||||
|
0 — ring_1d: global_E/global_W ring, n_sips-1 rounds
|
||||||
|
1 — torus_2d: row ring (global_E/W) + col ring (global_S/N)
|
||||||
|
2 — mesh_2d: row chain reduce+broadcast + col chain reduce+broadcast
|
||||||
|
|
||||||
|
IPCQ wiring is handled by ``configure_sfr_intercube_multisip``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Integer selectors for the inter-SIP exchange strategy (the
# ``sip_topo_kind`` values listed in the module docstring).
SIP_TOPO_RING = 0
SIP_TOPO_TORUS = 1
SIP_TOPO_MESH = 2

# Maps a topology name string to its selector.
# NOTE(review): "mesh_2d" maps to SIP_TOPO_TORUS and only "mesh_2d_no_wrap"
# maps to SIP_TOPO_MESH, whereas the module docstring labels kind 2 as
# "mesh_2d" — confirm this aliasing is intentional.
TOPO_NAME_TO_KIND = {
    "ring_1d": SIP_TOPO_RING,
    "torus_2d": SIP_TOPO_TORUS,
    "mesh_2d": SIP_TOPO_TORUS,
    "mesh_2d_no_wrap": SIP_TOPO_MESH,
}
|
||||||
|
|
||||||
|
|
||||||
|
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Build the leading positional kernel arguments.

    The cube mesh is fixed at 4 wide by 4 high.

    Args:
        world_size: Passed through unchanged as the last tuple element.
        n_elem: Elements per tile.

    Returns:
        ``(n_elem, cube_w, cube_h, world_size)``.
    """
    cube_mesh = (4, 4)  # (cube_w, cube_h)
    return (n_elem, *cube_mesh, world_size)
|
||||||
|
|
||||||
|
|
||||||
|
def _inter_sip_ring(acc, n_sips, n_elem, tl):
|
||||||
|
current = acc
|
||||||
|
for _ in range(n_sips - 1):
|
||||||
|
tl.send(dir="global_E", src=current)
|
||||||
|
recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def _inter_sip_torus_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
|
||||||
|
# Row ring (global_E / global_W)
|
||||||
|
current = acc
|
||||||
|
for _ in range(sip_topo_w - 1):
|
||||||
|
tl.send(dir="global_E", src=current)
|
||||||
|
recv = tl.recv(dir="global_W", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv
|
||||||
|
# Col ring (global_S / global_N)
|
||||||
|
current = acc
|
||||||
|
for _ in range(sip_topo_h - 1):
|
||||||
|
tl.send(dir="global_S", src=current)
|
||||||
|
recv = tl.recv(dir="global_N", shape=(n_elem,), dtype="f16")
|
||||||
|
acc = acc + recv
|
||||||
|
current = recv
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def _chain_reduce(acc, pos, length, fwd_dir, back_dir, n_elem, tl):
    """Reduce along an open chain so the last position holds the chain sum.

    Position 0 only sends forward, interior positions receive from behind,
    add, and send forward, and the last position only receives and adds.
    A chain of length <= 1 has no links, so ``acc`` is returned untouched.
    """
    if length <= 1:
        return acc
    if pos > 0:
        recv = tl.recv(dir=back_dir, shape=(n_elem,), dtype="f16")
        acc = acc + recv
    if pos < length - 1:
        tl.send(dir=fwd_dir, src=acc)
    return acc


def _chain_broadcast(acc, pos, length, fwd_dir, back_dir, n_elem, tl):
    """Broadcast the last position's value back down an open chain.

    The last position only sends backward, interior positions receive from
    ahead and relay backward, and position 0 only receives. A chain of
    length <= 1 has no links, so ``acc`` is returned untouched.
    """
    if length <= 1:
        return acc
    if pos < length - 1:
        acc = tl.recv(dir=fwd_dir, shape=(n_elem,), dtype="f16")
    if pos > 0:
        tl.send(dir=back_dir, src=acc)
    return acc


def _inter_sip_mesh_2d(acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl):
    """All-reduce across a 2D SIP mesh using chains (no wrap-around links).

    Four steps, in order: row reduce W -> E, row broadcast E -> W, column
    reduce N -> S, column broadcast S -> N. The SIP's grid position is
    derived from ``sip_rank`` in row-major order.

    An axis of length 1 is skipped entirely. Previously a 1-wide or 1-high
    mesh took the "first position sends" branch and sent towards a
    neighbour that cannot exist in a chain of one.

    Args:
        acc: this SIP's starting value.
        sip_rank: 0-based SIP rank, row-major over the SIP grid.
        sip_topo_w: SIP grid width (must be non-zero).
        sip_topo_h: SIP grid height.
        n_elem: element count used for the receive shape ``(n_elem,)``.
        tl: object providing ``send`` and ``recv``.

    Returns:
        The value held by this SIP after all four steps.
    """
    sip_row = sip_rank // sip_topo_w
    sip_col = sip_rank % sip_topo_w

    # Row reduce W -> E, then row broadcast E -> W.
    acc = _chain_reduce(
        acc, sip_col, sip_topo_w, "global_E", "global_W", n_elem, tl
    )
    acc = _chain_broadcast(
        acc, sip_col, sip_topo_w, "global_E", "global_W", n_elem, tl
    )

    # Column reduce N -> S, then column broadcast S -> N.
    acc = _chain_reduce(
        acc, sip_row, sip_topo_h, "global_S", "global_N", n_elem, tl
    )
    acc = _chain_broadcast(
        acc, sip_row, sip_topo_h, "global_S", "global_N", n_elem, tl
    )

    return acc
|
||||||
|
|
||||||
|
|
||||||
|
def allreduce_intercube_multidevice(
    t_ptr, n_elem, cube_w, cube_h, n_sips, sip_rank,
    sip_topo_kind, sip_topo_w, sip_topo_h, tl,
):
    """Intercube all-reduce (pe0-only) with configurable SIP topology.

    Phases, in order:
      1. row reduce W -> E across the cube mesh;
      2. column reduce N -> S on the rightmost column;
      3. inter-SIP exchange on the bottom-right (root) cube, only when
         ``n_sips > 1``; an unrecognised ``sip_topo_kind`` leaves the
         value unchanged;
      4. column broadcast S -> N on the rightmost column;
      5. row broadcast E -> W.
    The result is stored back to the address the tile was loaded from.

    A mesh axis of length 1 has no links along that axis, so its reduce
    and broadcast phases are skipped. Previously a 1-wide or 1-high cube
    mesh took the "first position sends" branch and sent towards a
    neighbour that does not exist.

    Args:
        t_ptr: VA base of the row-wise-sharded tensor on this SIP.
        n_elem: f16 elements per cube tile.
        cube_w: cube mesh width (columns).
        cube_h: cube mesh height (rows).
        n_sips: number of SIPs.
        sip_rank: this SIP's rank (0-based).
        sip_topo_kind: 0=ring, 1=torus_2d, 2=mesh_2d.
        sip_topo_w: SIP mesh width (for 2D topologies, 0 for ring).
        sip_topo_h: SIP mesh height (for 2D topologies, 0 for ring).
        tl: TLContext (auto-injected).
    """
    cube_id = tl.program_id(axis=1)
    row = cube_id // cube_w
    col = cube_id % cube_w
    nbytes = n_elem * 2  # f16: two bytes per element

    pe_addr = t_ptr + cube_id * nbytes
    acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")

    has_row_links = cube_w > 1
    # The column phases run on the rightmost column only, and only when
    # there is more than one row to talk to.
    on_active_spine = col == cube_w - 1 and cube_h > 1

    # ── Phase 1: row reduce W → E ──
    if has_row_links:
        if col == 0:
            tl.send(dir="E", src=acc)
        elif col < cube_w - 1:
            recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
            acc = acc + recv
            tl.send(dir="E", src=acc)
        else:
            recv = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
            acc = acc + recv

    # ── Phase 2: col reduce N → S on rightmost column ──
    if on_active_spine:
        if row == 0:
            tl.send(dir="S", src=acc)
        elif row < cube_h - 1:
            recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
            acc = acc + recv
            tl.send(dir="S", src=acc)
        else:
            recv = tl.recv(dir="N", shape=(n_elem,), dtype="f16")
            acc = acc + recv

    # ── Phase 3: inter-SIP exchange on root cube ──
    root_cube = (cube_h - 1) * cube_w + (cube_w - 1)
    if cube_id == root_cube and n_sips > 1:
        if sip_topo_kind == SIP_TOPO_RING:
            acc = _inter_sip_ring(acc, n_sips, n_elem, tl)
        elif sip_topo_kind == SIP_TOPO_TORUS:
            acc = _inter_sip_torus_2d(
                acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl
            )
        elif sip_topo_kind == SIP_TOPO_MESH:
            acc = _inter_sip_mesh_2d(
                acc, sip_rank, sip_topo_w, sip_topo_h, n_elem, tl
            )

    # ── Phase 4: col broadcast S → N on rightmost column ──
    if on_active_spine:
        if row == cube_h - 1:
            tl.send(dir="N", src=acc)
        elif row > 0:
            acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16")
            tl.send(dir="N", src=acc)
        else:
            acc = tl.recv(dir="S", shape=(n_elem,), dtype="f16")

    # ── Phase 5: row broadcast E → W ──
    if has_row_links:
        if col == cube_w - 1:
            tl.send(dir="W", src=acc)
        elif col > 0:
            acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16")
            tl.send(dir="W", src=acc)
        else:
            acc = tl.recv(dir="E", shape=(n_elem,), dtype="f16")

    tl.store(pe_addr, acc)


kernel = allreduce_intercube_multidevice
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
"""2D-mesh all-reduce kernel (ADR-0023).
|
|
||||||
|
|
||||||
Two-phase reduce on a square mesh of side ``S`` (world_size = S*S):
|
|
||||||
1. Row reduce: ring all-reduce along E/W within each row.
|
|
||||||
2. Column reduce: ring all-reduce along N/S within each column.
|
|
||||||
|
|
||||||
After both phases, every rank holds the global sum.
|
|
||||||
|
|
||||||
Uses TensorHandle math (PE_MATH) for accumulation. Op_log captures the
|
|
||||||
data flow so Phase 2 produces correct final HBM contents. Math/recv
|
|
||||||
handles are passed directly to the next send, avoiding store→reload
|
|
||||||
which doesn't propagate correctly with timing-only Phase 1 math.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import math
|
|
||||||
|
|
||||||
|
|
||||||
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Return ``(n_elem, side)`` for the square-mesh all-reduce kernel.

    ``side`` is the rounded square root of ``world_size``.

    Raises:
        ValueError: if ``world_size`` is not a perfect square.
    """
    side = int(round(math.sqrt(world_size)))
    if side * side == world_size:
        return (n_elem, side)
    raise ValueError(
        f"mesh_allreduce requires a square world_size; got {world_size}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
def kernel(t_ptr, n_elem, side, tl):
    """All-reduce on a square mesh: a row ring pass, then a column ring pass.

    Args:
        t_ptr: HBM base address (column-sharded VA shared across ranks).
        n_elem: number of f16 elements per tile.
        side: mesh side length (sqrt(world_size)).
        tl: TLContext (ADR-0022).
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_pes = tl.num_programs(axis=0)
    rank = cube * cube_pes + pe_in_cube

    # Two bytes per f16 element; each rank owns one tile-sized slice.
    tile_addr = t_ptr + rank * (n_elem * 2)
    acc = tl.load(tile_addr, shape=(n_elem,), dtype="f16")

    # Each pass starts by sending the current accumulator handle directly
    # (no store/reload), then forwards whatever tile arrived last so every
    # tile visits every rank of the ring once.
    for tx_dir, rx_dir in (("E", "W"), ("S", "N")):
        outgoing = acc
        for _ in range(side - 1):
            tl.send(dir=tx_dir, src=outgoing)
            incoming = tl.recv(dir=rx_dir, shape=(n_elem,), dtype="f16")
            acc = acc + incoming
            outgoing = incoming

    tl.store(tile_addr, acc)
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
"""Ring all-reduce kernel for IPCQ-based PE collective (ADR-0023).
|
|
||||||
|
|
||||||
Algorithm: 1D ring of N PEs, each PE starts with one tile of data.
|
|
||||||
After ``world_size - 1`` rounds, every PE's accumulator holds the sum
|
|
||||||
of all PE tiles.
|
|
||||||
|
|
||||||
Strategy
|
|
||||||
--------
|
|
||||||
Each PE starts with its own tile in HBM. The kernel:
|
|
||||||
1. Loads the local tile into a TensorHandle (the accumulator).
|
|
||||||
2. In each of ``world_size - 1`` rounds:
|
|
||||||
- Sends the current accumulator/recv slot to the E neighbor.
|
|
||||||
- Receives a tile from the W neighbor — the recv handle points
|
|
||||||
into the per-direction TCM slot.
|
|
||||||
- Adds the received tile to the accumulator using the TensorHandle
|
|
||||||
operator overload, which dispatches to ``MathCmd`` (PE_MATH).
|
|
||||||
3. Stores the final accumulator back to HBM via tl.store. The store is
|
|
||||||
recorded in op_log with both src and dst, so Phase 2 will copy the
|
|
||||||
replayed math result from PE-local scratch into HBM.
|
|
||||||
|
|
||||||
ADR-0020 D3 split: Phase 1 simulates timing only — math results are
|
|
||||||
not yet computed, so the accumulator data flowing through Phase 1 may
|
|
||||||
be stale. Phase 2's DataExecutor replays math + IPCQ copies + dma_write
|
|
||||||
in stable t_start order, producing correct final HBM contents.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Return the positional kernel arguments for the ahbm backend.

    The ring kernel expects ``(n_elem, world_size)`` after the tensor
    pointer, so the two inputs are simply reordered.
    """
    args = (n_elem, world_size)
    return args
|
|
||||||
|
|
||||||
|
|
||||||
def kernel(t_ptr, n_elem, world_size, tl):
    """Ring all-reduce.

    Args:
        t_ptr: HBM base address of the column-sharded tensor, shared by all
            PEs; this PE's slice lives at ``t_ptr + rank * n_elem * 2``.
        n_elem: number of f16 elements per tile.
        world_size: total number of participating ranks (passed by host).
        tl: TLContext (auto-injected, ADR-0022). The global rank is derived
            as ``program_id(axis=1) * num_programs(axis=0) +
            program_id(axis=0)``.
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_pes = tl.num_programs(axis=0)
    rank = cube * cube_pes + pe_in_cube

    # Two bytes per f16 element; every PE reads its own slice of the base.
    tile_addr = t_ptr + rank * (n_elem * 2)
    acc = tl.load(tile_addr, shape=(n_elem,), dtype="f16")

    # Forward each received tile (not the running sum) so that every
    # rank's tile passes through every rank exactly once.
    outgoing = acc
    for _round in range(1, world_size):
        tl.send(dir="E", src=outgoing)
        incoming = tl.recv(dir="W", shape=(n_elem,), dtype="f16")
        acc = acc + incoming
        outgoing = incoming

    # Write the accumulated result back to this PE's slice.
    tl.store(tile_addr, acc)
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
"""Tree all-reduce kernel for IPCQ-based PE collective (ADR-0023).
|
|
||||||
|
|
||||||
Two-phase binary tree all-reduce:
|
|
||||||
|
|
||||||
Phase 1 (reduce up):
|
|
||||||
- leaf nodes send their value to ``parent``
|
|
||||||
- internal nodes recv from each child, sum, then send to ``parent``
|
|
||||||
- root accumulates child contributions; final acc holds global sum
|
|
||||||
|
|
||||||
Phase 2 (broadcast down):
|
|
||||||
- root sends acc to ``child_left`` and ``child_right`` (if present)
|
|
||||||
- internal nodes recv from ``parent``, then forward to children
|
|
||||||
- all ranks store the final acc to HBM
|
|
||||||
|
|
||||||
Uses TensorHandle math (PE_MATH) for accumulation. Op_log captures the
|
|
||||||
data flow so Phase 2 produces correct final HBM contents. The kernel
|
|
||||||
deliberately avoids the store→reload→send pattern: math/recv handles
|
|
||||||
are passed directly to the next send so PE_DMA snapshots a deterministic
|
|
||||||
source addr that Phase 2 can replay.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Return the positional kernel arguments for the ahbm backend.

    The result is ``(n_elem, world_size)`` — the inputs in swapped order.
    """
    args = (n_elem, world_size)
    return args
|
|
||||||
|
|
||||||
|
|
||||||
def kernel(t_ptr, n_elem, world_size, tl):
    """Tree all-reduce: reduce up to rank 0, then broadcast back down.

    Args:
        t_ptr: HBM base address.
        n_elem: number of f16 elements per tile.
        world_size: total number of participating ranks (passed by host).
        tl: TLContext (ADR-0022). Global rank from program_id(0/1).
    """
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_pes = tl.num_programs(axis=0)
    rank = cube * cube_pes + pe_in_cube

    tile_addr = t_ptr + rank * (n_elem * 2)  # two bytes per f16 element
    acc = tl.load(tile_addr, shape=(n_elem,), dtype="f16")

    # Binary-heap layout: children of rank r are 2r+1 and 2r+2; a child
    # exists only if its index is inside the world.
    child_dirs = [
        name
        for name, child_rank in (
            ("child_left", 2 * rank + 1),
            ("child_right", 2 * rank + 2),
        )
        if child_rank < world_size
    ]

    # ── Phase 1: reduce up ──
    for name in child_dirs:
        contribution = tl.recv(dir=name, shape=(n_elem,), dtype="f16")
        acc = acc + contribution

    if rank > 0:
        # Non-root: hand the partial sum to the parent (the handle is sent
        # directly, without a store/reload) ...
        tl.send(dir="parent", src=acc)
        # ── Phase 2: broadcast down ──
        # ... then replace it with the value the parent broadcasts back.
        acc = tl.recv(dir="parent", shape=(n_elem,), dtype="f16")

    for name in child_dirs:
        tl.send(dir=name, src=acc)

    # Final store to HBM for the bench's verification path.
    tl.store(tile_addr, acc)
|
|
||||||
@@ -219,9 +219,28 @@ def install_ipcq(
|
|||||||
"neighbor_table": neighbor_table,
|
"neighbor_table": neighbor_table,
|
||||||
}
|
}
|
||||||
|
|
||||||
def reverse_direction(my_rank: int, peer_rank: int) -> str | None:
|
_OPPOSITE_DIR = {
|
||||||
"""Find which direction in peer's neighbor table points back to my_rank."""
|
"E": "W", "W": "E", "N": "S", "S": "N",
|
||||||
for d, target in neighbor_table[peer_rank].items():
|
"global_E": "global_W", "global_W": "global_E",
|
||||||
|
"global_N": "global_S", "global_S": "global_N",
|
||||||
|
}
|
||||||
|
|
||||||
|
def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None:
|
||||||
|
"""Find peer's direction that reciprocates my_dir→peer_rank.
|
||||||
|
|
||||||
|
Prefer the OPPOSITE direction (E↔W, N↔S) when the peer has it
|
||||||
|
pointing back to us (ADR-0025 D1). This matters in 2-rank
|
||||||
|
bidirectional rings where both E and W on one side point to the
|
||||||
|
same peer — without the preference, dict-order first-match would
|
||||||
|
route data into the wrong rx slot. Falls back to any direction
|
||||||
|
pointing back for topologies without an opposite convention
|
||||||
|
(e.g. tree_binary's parent/child).
|
||||||
|
"""
|
||||||
|
nt = neighbor_table[peer_rank]
|
||||||
|
opp = _OPPOSITE_DIR.get(my_dir)
|
||||||
|
if opp is not None and nt.get(opp) == my_rank:
|
||||||
|
return opp
|
||||||
|
for d, target in nt.items():
|
||||||
if target == my_rank:
|
if target == my_rank:
|
||||||
return d
|
return d
|
||||||
return None
|
return None
|
||||||
@@ -234,7 +253,7 @@ def install_ipcq(
|
|||||||
if peer_rank is None:
|
if peer_rank is None:
|
||||||
continue
|
continue
|
||||||
peer_s, peer_c, peer_p = rank_pe[peer_rank]
|
peer_s, peer_c, peer_p = rank_pe[peer_rank]
|
||||||
peer_dir = reverse_direction(r, peer_rank)
|
peer_dir = reverse_direction(r, peer_rank, d)
|
||||||
if peer_dir is None:
|
if peer_dir is None:
|
||||||
# Peer doesn't have a reverse entry — skip (asymmetric topology)
|
# Peer doesn't have a reverse entry — skip (asymmetric topology)
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -0,0 +1,104 @@
|
|||||||
|
"""SFR configuration for intercube + inter-SIP IPCQ wiring.
|
||||||
|
|
||||||
|
Provides ``configure_sfr_intercube_multisip`` which programs PE_IPCQ
|
||||||
|
neighbor tables for:
|
||||||
|
|
||||||
|
1. Intercube within each SIP — pe0 of every cube connects to pe0 of
|
||||||
|
its N/S/E/W mesh neighbors (no wrap-around).
|
||||||
|
2. Inter-SIP on ALL cubes — pe0 of cube_c on sip_A connects to pe0 of
|
||||||
|
cube_c on each peer SIP, using ``global_E``/``global_W`` (ring) or
|
||||||
|
``global_N``/``global_S``/``global_E``/``global_W`` (mesh/torus)
|
||||||
|
direction labels. Wiring all cubes allows the kernel to
|
||||||
|
dynamically elect the root cube at runtime.
|
||||||
|
|
||||||
|
SIP-level topology is read from ``topology.yaml`` →
|
||||||
|
``system.sips.topology`` (e.g. ``ring_1d``, ``mesh_2d``).
|
||||||
|
Intercube mesh dimensions come from ``sip.cube_mesh.w/h``.
|
||||||
|
|
||||||
|
Internally delegates to ``install_ipcq`` with a computed ``rank_to_pe``
|
||||||
|
(pe0-only) and a closure-captured ``neighbors()`` function.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import types
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.ccl.install import install_ipcq
|
||||||
|
from kernbench.ccl.topologies import _BUILTIN as _TOPO_BUILTINS
|
||||||
|
|
||||||
|
|
||||||
|
def configure_sfr_intercube_multisip(
    engine: Any,
    spec: dict,
    cfg: dict,
) -> dict[str, Any]:
    """Wire IPCQ for intercube (pe0, mesh) + inter-SIP (pe0, all cubes).

    Builds a pe0-only rank table (one entry per cube per SIP, SIP-major)
    and a ``neighbors`` callback, then delegates to ``install_ipcq`` with
    ``world_size`` and ``topology="none"`` overridden in a copy of ``cfg``.

    Args:
        engine: GraphEngine with ``_components``.
        spec: topology spec dict (from topology.yaml). Reads
            ``sip.cube_mesh.w/h`` and, optionally, ``system.sips.count``
            (default 1) and ``system.sips.topology`` (default ``ring_1d``).
        cfg: merged algorithm config (from ``resolve_algorithm_config``).
            Not mutated.

    Returns:
        The install plan dict from ``install_ipcq``.

    Raises:
        ValueError: if the SIP topology name is not a builtin topology.
    """
    cube_mesh = spec["sip"]["cube_mesh"]
    mesh_w, mesh_h = int(cube_mesh["w"]), int(cube_mesh["h"])
    n_cubes = mesh_w * mesh_h

    sips_cfg = spec.get("system", {}).get("sips", {})
    n_sips = int(sips_cfg.get("count", 1))
    sip_topology = str(sips_cfg.get("topology", "ring_1d"))

    if sip_topology not in _TOPO_BUILTINS:
        raise ValueError(
            f"Unknown sip topology '{sip_topology}'. "
            f"Available: {list(_TOPO_BUILTINS)}"
        )
    sip_topo_fn = _TOPO_BUILTINS[sip_topology]

    world_size = n_sips * n_cubes

    # Flat index -> (sip, cube, pe); only pe0 of each cube participates.
    pe_idx_to_pe: list[tuple[int, int, int]] = []
    for sip_idx in range(n_sips):
        for cube_idx in range(n_cubes):
            pe_idx_to_pe.append((sip_idx, cube_idx, 0))

    def _neighbors(pe_idx: int, ws: int, _base: dict) -> dict[str, int]:
        # ``ws`` and ``_base`` are part of the callback signature but are
        # not needed: everything is derived from the captured mesh sizes.
        sip, cube = divmod(pe_idx, n_cubes)
        row, col = divmod(cube, mesh_w)
        sip_base = sip * n_cubes

        nbrs: dict[str, int] = {}

        # Intercube within SIP (mesh, no wrap-around). Insertion order
        # E, W, S, N is kept as in the table this feeds.
        if col < mesh_w - 1:
            nbrs["E"] = sip_base + (row * mesh_w + col + 1)
        if col > 0:
            nbrs["W"] = sip_base + (row * mesh_w + col - 1)
        if row < mesh_h - 1:
            nbrs["S"] = sip_base + ((row + 1) * mesh_w + col)
        if row > 0:
            nbrs["N"] = sip_base + ((row - 1) * mesh_w + col)

        # Inter-SIP on ALL cubes: same cube index on each peer SIP.
        if n_sips > 1:
            for d, peer_sip in sip_topo_fn(sip, n_sips).items():
                nbrs[f"global_{d}"] = peer_sip * n_cubes + cube

        return nbrs

    # install_ipcq is handed an object exposing ``neighbors`` in place of
    # an algorithm module.
    mock_module = types.SimpleNamespace(neighbors=_neighbors)

    cfg_copy = {**cfg, "world_size": world_size, "topology": "none"}

    return install_ipcq(
        engine, spec, cfg_copy,
        algo_module=mock_module,
        rank_to_pe=pe_idx_to_pe,
    )
|
||||||
@@ -1,492 +0,0 @@
|
|||||||
"""Mock CCL runtime for fast unit tests of algorithm kernels (ADR-0023 D15).
|
|
||||||
|
|
||||||
Runs a kernel function once per rank with a minimal ``tl`` shim — no SimPy,
|
|
||||||
no PE_DMA, no fabric simulation. Just enough to verify *functional*
|
|
||||||
correctness of an IPCQ-based collective algorithm.
|
|
||||||
|
|
||||||
Cross-rank send/recv is implemented with greenlet cooperative scheduling
|
|
||||||
plus per-(rank, direction) FIFO queues. Backpressure is not modeled —
|
|
||||||
queues are unbounded.
|
|
||||||
|
|
||||||
Typical usage in a test::
|
|
||||||
|
|
||||||
from kernbench.ccl.testing import run_kernel_in_mock
|
|
||||||
from kernbench.ccl.algorithms.ring_allreduce import kernel
|
|
||||||
|
|
||||||
inputs = [np.full(16, r + 1, dtype="f16") for r in range(4)]
|
|
||||||
outputs = run_kernel_in_mock(
|
|
||||||
kernel_fn=kernel, world_size=4, topology="ring_1d",
|
|
||||||
inputs=inputs, kernel_args=(16,),
|
|
||||||
)
|
|
||||||
for r in range(4):
|
|
||||||
assert np.allclose(outputs[r], sum(inputs))
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from collections import deque
|
|
||||||
from typing import Any, Callable
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from greenlet import greenlet
|
|
||||||
|
|
||||||
from kernbench.ccl.topologies import resolve_topology
|
|
||||||
from kernbench.common.ipcq_types import IpcqInvalidDirection
|
|
||||||
from kernbench.common.pe_commands import TensorHandle
|
|
||||||
|
|
||||||
|
|
||||||
# ── Per-rank fake state ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class _MockRankState:
    """Per-rank scratch holding HBM/recv slots and tl shim hooks."""

    def __init__(
        self,
        rank: int,
        world_size: int,
        neighbors: dict[str, int],
        input_arr: np.ndarray,
        pes_per_cube: int = 0,
    ) -> None:
        self.rank = rank
        self.world_size = world_size
        self.neighbors = neighbors  # direction → peer rank

        # PEs per cube, used for program_id(axis=0/1). A non-positive value
        # means "all ranks in one cube" (legacy single-cube behavior).
        if pes_per_cube > 0:
            self.pes_per_cube = pes_per_cube
        else:
            self.pes_per_cube = world_size

        # The kernel sees base address 0 and reads its own tile at
        # ``t_ptr + rank * nbytes``, mirroring a column-sharded VA.
        tile_bytes = int(input_arr.nbytes)
        self.t_ptr = 0
        self._slice_addr = rank * tile_bytes

        # "Memory" is addr → ndarray, private to this rank. HBM starts
        # with a copy of the input tile at the rank's slice address.
        self._hbm: dict[int, np.ndarray] = {self._slice_addr: input_arr.copy()}
        self._tcm: dict[int, np.ndarray] = {}

        # One inbound FIFO per neighbor direction.
        self.recv_q: dict[str, deque[np.ndarray]] = {}
        for direction in neighbors:
            self.recv_q[direction] = deque()

        # Filled in when the kernel stores to the slice address.
        self.output: np.ndarray | None = None
        # Greenlet running this rank — assigned later by the scheduler.
        self.g: greenlet | None = None
|
|
||||||
|
|
||||||
|
|
||||||
# ── Mock TLContext ───────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class _MockTL:
|
|
||||||
"""Drop-in tl shim for mock runtime.
|
|
||||||
|
|
||||||
Supports the subset of TLContext API that algorithm authors use:
|
|
||||||
program_id, num_programs, load, store, send, recv, recv_async, wait,
|
|
||||||
plus arithmetic operations on TensorHandle (eager numpy execution,
|
|
||||||
no SimPy involved).
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, state: _MockRankState, scheduler: "_MockScheduler") -> None:
|
|
||||||
self._state = state
|
|
||||||
self._scheduler = scheduler
|
|
||||||
self._handle_counter = 0
|
|
||||||
|
|
||||||
def _next_id(self) -> str:
|
|
||||||
self._handle_counter += 1
|
|
||||||
return f"mt{self._handle_counter}"
|
|
||||||
|
|
||||||
@property
|
|
||||||
def rank(self) -> int:
|
|
||||||
return self._state.rank
|
|
||||||
|
|
||||||
@property
|
|
||||||
def world_size(self) -> int:
|
|
||||||
return self._state.world_size
|
|
||||||
|
|
||||||
# axis-aware
|
|
||||||
def program_id(self, axis: int = 0) -> int:
|
|
||||||
# Multi-cube: axis=0 = PE within cube, axis=1 = global cube id.
|
|
||||||
# Falls back to flat (all ranks in one cube) if pes_per_cube
|
|
||||||
# is not set (legacy single-cube tests).
|
|
||||||
ppc = self._state.pes_per_cube
|
|
||||||
if axis == 1:
|
|
||||||
return self._state.rank // ppc
|
|
||||||
return self._state.rank % ppc
|
|
||||||
|
|
||||||
def num_programs(self, axis: int = 0) -> int:
|
|
||||||
ppc = self._state.pes_per_cube
|
|
||||||
if axis == 1:
|
|
||||||
return self._state.world_size // ppc
|
|
||||||
return ppc
|
|
||||||
|
|
||||||
# ── arithmetic ops (called by TensorHandle.__add__ etc.) ──
|
|
||||||
|
|
||||||
def _binary_math(self, op: str, a: TensorHandle, b: TensorHandle) -> TensorHandle:
|
|
||||||
a_data = np.asarray(a.data) if a.data is not None else None
|
|
||||||
b_data = np.asarray(b.data) if b.data is not None else None
|
|
||||||
if a_data is None or b_data is None:
|
|
||||||
result = None
|
|
||||||
elif op == "add":
|
|
||||||
result = a_data + b_data
|
|
||||||
elif op == "sub":
|
|
||||||
result = a_data - b_data
|
|
||||||
elif op == "mul":
|
|
||||||
result = a_data * b_data
|
|
||||||
elif op == "div":
|
|
||||||
result = a_data / b_data
|
|
||||||
elif op == "maximum":
|
|
||||||
result = np.maximum(a_data, b_data)
|
|
||||||
elif op == "minimum":
|
|
||||||
result = np.minimum(a_data, b_data)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"mock _binary_math: op {op!r} not implemented")
|
|
||||||
return TensorHandle(
|
|
||||||
id=self._next_id(),
|
|
||||||
addr=0, shape=a.shape, dtype=a.dtype,
|
|
||||||
nbytes=int(np.prod(a.shape)) * 2 if a.shape else 0,
|
|
||||||
data=result, space="tcm",
|
|
||||||
)
|
|
||||||
|
|
||||||
def maximum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
|
|
||||||
return self._binary_math("maximum", a, b)
|
|
||||||
|
|
||||||
def minimum(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
|
|
||||||
return self._binary_math("minimum", a, b)
|
|
||||||
|
|
||||||
def fma(
|
|
||||||
self, a: TensorHandle, b: TensorHandle, c: TensorHandle,
|
|
||||||
) -> TensorHandle:
|
|
||||||
a_data = np.asarray(a.data) if a.data is not None else None
|
|
||||||
b_data = np.asarray(b.data) if b.data is not None else None
|
|
||||||
c_data = np.asarray(c.data) if c.data is not None else None
|
|
||||||
result = (
|
|
||||||
a_data * b_data + c_data
|
|
||||||
if (a_data is not None and b_data is not None and c_data is not None)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
return TensorHandle(
|
|
||||||
id=self._next_id(),
|
|
||||||
addr=0, shape=a.shape, dtype=a.dtype,
|
|
||||||
nbytes=int(np.prod(a.shape)) * 2 if a.shape else 0,
|
|
||||||
data=result, space="tcm",
|
|
||||||
)
|
|
||||||
|
|
||||||
def clamp(
|
|
||||||
self,
|
|
||||||
x: TensorHandle,
|
|
||||||
min: TensorHandle,
|
|
||||||
max: TensorHandle,
|
|
||||||
) -> TensorHandle:
|
|
||||||
x_data = np.asarray(x.data) if x.data is not None else None
|
|
||||||
lo = np.asarray(min.data) if min.data is not None else None
|
|
||||||
hi = np.asarray(max.data) if max.data is not None else None
|
|
||||||
result = (
|
|
||||||
np.minimum(np.maximum(x_data, lo), hi)
|
|
||||||
if (x_data is not None and lo is not None and hi is not None)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
return TensorHandle(
|
|
||||||
id=self._next_id(),
|
|
||||||
addr=0, shape=x.shape, dtype=x.dtype,
|
|
||||||
nbytes=int(np.prod(x.shape)) * 2 if x.shape else 0,
|
|
||||||
data=result, space="tcm",
|
|
||||||
)
|
|
||||||
|
|
||||||
def softmax(self, x: TensorHandle, axis: int = -1) -> TensorHandle:
|
|
||||||
x_data = np.asarray(x.data) if x.data is not None else None
|
|
||||||
if x_data is None:
|
|
||||||
result = None
|
|
||||||
else:
|
|
||||||
x_max = np.max(x_data, axis=axis, keepdims=True)
|
|
||||||
e = np.exp(x_data - x_max)
|
|
||||||
s = np.sum(e, axis=axis, keepdims=True)
|
|
||||||
result = e / s
|
|
||||||
return TensorHandle(
|
|
||||||
id=self._next_id(),
|
|
||||||
addr=0, shape=x.shape, dtype=x.dtype,
|
|
||||||
nbytes=int(np.prod(x.shape)) * 2 if x.shape else 0,
|
|
||||||
data=result, space="tcm",
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def cdiv(a: int, b: int) -> int:
|
|
||||||
return -(-int(a) // int(b))
|
|
||||||
|
|
||||||
def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
|
|
||||||
x_data = np.asarray(x.data) if x.data is not None else None
|
|
||||||
if x_data is None:
|
|
||||||
result = None
|
|
||||||
elif op == "exp":
|
|
||||||
result = np.exp(x_data)
|
|
||||||
elif op == "log":
|
|
||||||
result = np.log(x_data)
|
|
||||||
elif op == "sqrt":
|
|
||||||
result = np.sqrt(x_data)
|
|
||||||
elif op == "abs":
|
|
||||||
result = np.abs(x_data)
|
|
||||||
elif op == "sigmoid":
|
|
||||||
result = 1.0 / (1.0 + np.exp(-x_data))
|
|
||||||
elif op == "cos":
|
|
||||||
result = np.cos(x_data)
|
|
||||||
elif op == "sin":
|
|
||||||
result = np.sin(x_data)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"mock _unary_math: op {op!r} not implemented")
|
|
||||||
return TensorHandle(
|
|
||||||
id=self._next_id(),
|
|
||||||
addr=0, shape=x.shape, dtype=x.dtype,
|
|
||||||
nbytes=int(np.prod(x.shape)) * 2 if x.shape else 0,
|
|
||||||
data=result, space="tcm",
|
|
||||||
)
|
|
||||||
|
|
||||||
def load(self, ptr: int, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
|
|
||||||
data = self._state._hbm.get(ptr)
|
|
||||||
if data is None:
|
|
||||||
data = np.zeros(shape, dtype=np.float16)
|
|
||||||
return TensorHandle(
|
|
||||||
id=f"load_{ptr}", addr=ptr, shape=shape, dtype=dtype,
|
|
||||||
nbytes=int(np.prod(shape)) * 2, data=data, space="hbm",
|
|
||||||
)
|
|
||||||
|
|
||||||
def store(self, ptr: int, handle: TensorHandle) -> None:
    """Write ``handle``'s data into this rank's mock HBM at ``ptr``.

    A handle without data is ignored. A write that lands on the rank's
    slice address is also published as the rank's ``output``.
    """
    # NOTE(review): the diff view lost indentation; this assumes the
    # slice-address check is nested under the data check — confirm.
    if handle.data is None:
        return
    rank_state = self._state
    rank_state._hbm[ptr] = np.asarray(handle.data)
    if ptr == rank_state._slice_addr:
        rank_state.output = rank_state._hbm[ptr]
|
|
||||||
|
|
||||||
# IPCQ
|
|
||||||
def send(
    self,
    dir: str,
    src: TensorHandle | None = None,
    *,
    src_addr: int | None = None,
    nbytes: int | None = None,
    shape: tuple[int, ...] | None = None,
    dtype: str = "f16",
    space: str = "tcm",
) -> None:
    """Deliver a copy of ``src``'s payload to the neighbor in direction ``dir``.

    The payload is ``src.data`` when present; otherwise it is looked up in
    this rank's local memory (HBM or TCM, per ``src.space``) at ``src.addr``.
    The copy is appended to the peer's receive queue for the matching
    reverse direction, the scheduler's send counter is bumped, and control
    is yielded so the receiver can run.

    The keyword-only parameters (``src_addr``, ``nbytes``, ``shape``,
    ``dtype``, ``space``) are accepted but not read by this mock.

    Raises:
        IpcqInvalidDirection: ``dir`` is not one of this rank's neighbors.
        RuntimeError: ``src`` is None, no data exists at ``src.addr``, or
            the peer has no direction pointing back at this rank.
    """
    me = self._state
    if dir not in me.neighbors:
        raise IpcqInvalidDirection(
            f"mock tl.send: direction {dir!r} not in neighbors {list(me.neighbors)}"
        )
    if src is None:
        raise RuntimeError("mock tl.send: src is None")

    if src.data is not None:
        payload = np.asarray(src.data)
    else:
        # Data-less handle: resolve from this rank's local memory.
        memory = me._hbm if src.space == "hbm" else me._tcm
        stored = memory.get(src.addr)
        if stored is None:
            raise RuntimeError(
                f"mock tl.send: no data at {src.space}:0x{src.addr:x}"
            )
        payload = np.asarray(stored)

    peer_rank = me.neighbors[dir]
    peer = self._scheduler.states[peer_rank]

    # Direction pairing used on the peer side, mirroring the real IPCQ
    # install pairing: N<->S, E<->W, and tree directions map to themselves.
    mirror = {
        "N": "S", "S": "N", "E": "W", "W": "E",
        "parent": "parent", "child_left": "child_left",
        "child_right": "child_right",
    }
    back = mirror.get(dir)
    if back is None or back not in peer.neighbors:
        # Custom directions: fall back to the peer's first direction that
        # targets this rank.
        back = next(
            (d for d, target in peer.neighbors.items() if target == me.rank),
            None,
        )
    if back is None:
        raise RuntimeError(
            f"mock tl.send: peer rank {peer_rank} has no reverse direction"
        )

    peer.recv_q[back].append(payload.copy())
    self._scheduler._send_counter += 1
    # Hand control back to the scheduler so the receiver can wake up.
    self._scheduler.yield_()
|
|
||||||
|
|
||||||
def recv_async(
    self,
    dir: str,
    shape: tuple[int, ...] = (),
    dtype: str = "f16",
) -> dict:
    """Start a non-blocking receive and return a future for ``tl.wait``.

    Nothing is dequeued here; the returned dict only records the request
    (``_kind``, ``dir``, ``shape``, ``dtype``).

    Raises:
        IpcqInvalidDirection: ``dir`` is not one of this rank's neighbors.
    """
    if dir in self._state.neighbors:
        return dict(_kind="recv_future", dir=dir, shape=shape, dtype=dtype)
    raise IpcqInvalidDirection(
        f"mock tl.recv_async: direction {dir!r} not in neighbors"
    )
|
|
||||||
|
|
||||||
def wait(self, future: Any) -> TensorHandle:
    """Block until the direction named by ``future`` has data, then pop it.

    Blocking is cooperative: while the receive queue is empty, control is
    yielded to the scheduler.

    Raises:
        TypeError: ``future`` is not a dict produced by ``recv_async``.
    """
    is_recv_future = isinstance(future, dict) and future.get("_kind") == "recv_future"
    if not is_recv_future:
        raise TypeError("tl.wait: expected recv future from tl.recv_async")

    direction = future["dir"]
    while not self._state.recv_q[direction]:
        self._scheduler.yield_()
    payload = self._state.recv_q[direction].popleft()
    return self._make_handle(payload, direction, future["dtype"])
|
|
||||||
|
|
||||||
def recv(
    self,
    dir: str | None = None,
    shape: tuple[int, ...] = (),
    dtype: str = "f16",
) -> TensorHandle:
    """Blocking receive from ``dir``, or from any neighbor when ``dir`` is None.

    With ``dir=None`` the neighbor directions are scanned in neighbor-map
    order and the first non-empty queue wins. Whenever nothing is
    available, control is yielded to the scheduler and the scan repeats.
    ``shape`` is accepted but not read by this mock.

    Raises:
        IpcqInvalidDirection: ``dir`` is given but is not a neighbor.
    """
    if dir is not None and dir not in self._state.neighbors:
        raise IpcqInvalidDirection(
            f"mock tl.recv: direction {dir!r} not in neighbors {list(self._state.neighbors)}"
        )

    while True:
        candidates = self._state.neighbors if dir is None else (dir,)
        for direction in candidates:
            if self._state.recv_q[direction]:
                payload = self._state.recv_q[direction].popleft()
                return self._make_handle(payload, direction, dtype)
        # Nothing queued yet: let the other ranks run.
        self._scheduler.yield_()
|
|
||||||
|
|
||||||
def _make_handle(self, data: np.ndarray, direction: str, dtype: str) -> TensorHandle:
    """Wrap a received array in a handle tagged with its source direction.

    Shape and byte count are taken from ``data`` itself; ``dtype`` is the
    caller-supplied dtype string.
    """
    fields = dict(
        id=f"recv_{direction}",
        addr=0,
        shape=data.shape,
        dtype=dtype,
        nbytes=int(data.nbytes),
        data=data,
        space="tcm",
    )
    return TensorHandle(**fields)
|
|
||||||
|
|
||||||
|
|
||||||
# ── Cooperative scheduler ────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
class _MockScheduler:
    """Round-robin cooperative scheduler over rank greenlets.

    ``run`` creates one greenlet per rank state and switches into each live
    greenlet in turn. Rank code gives up its turn by calling ``yield_``,
    which switches back to the greenlet that called ``run``.
    """

    def __init__(self, states: list[_MockRankState]) -> None:
        self.states = states
        self._parent: greenlet | None = None
        self._cur_idx = 0
        # Number of delivered sends; incremented by the per-rank tl shim and
        # reset at the start of every run(). Initialised here so the
        # attribute always exists.
        self._send_counter = 0

    def yield_(self) -> None:
        """Called from inside a rank greenlet to give other ranks a turn."""
        assert self._parent is not None
        self._parent.switch()

    def run(self, kernel_fn: Callable, kernel_args: tuple) -> list[np.ndarray]:
        """Run ``kernel_fn`` once per rank until every rank greenlet is dead.

        Each rank is invoked as ``kernel_fn(state.t_ptr, *kernel_args, tl=tl)``
        with its own tl shim, which is also set as the active ``TLContext``
        while that rank is executing.

        Returns:
            One entry per rank: ``state.output`` when set, otherwise
            whatever the rank's mock HBM holds at its slice address.

        Raises:
            RuntimeError: no send was delivered and no rank finished for
                10,000 consecutive rounds (treated as a deadlock).
        """
        from kernbench.triton_emu.tl_context import TLContext

        self._parent = greenlet.getcurrent()

        # Per-rank tl shim
        tls: dict[int, _MockTL] = {}

        def _spawn(rank_idx: int) -> greenlet:
            state = self.states[rank_idx]
            tl = _MockTL(state, self)
            tls[rank_idx] = tl

            def _entry():
                # Activate this rank's tl for TensorHandle operator overloads
                TLContext._set_active(tl)  # type: ignore[attr-defined]
                try:
                    kernel_fn(state.t_ptr, *kernel_args, tl=tl)
                finally:
                    TLContext._set_active(None)  # type: ignore[attr-defined]

            return greenlet(_entry)

        for state in self.states:
            state.g = _spawn(state.rank)

        # Drive each rank round-robin until all are dead, detecting global
        # deadlock. Progress in a round means either a send was delivered
        # (global counter moved) or a rank finished during that round. The
        # send counter is more reliable than queue depth, because a
        # recv+send pair in one round nets to zero depth change yet is real
        # progress.
        self._send_counter = 0
        max_idle_rounds = 10_000
        idle_rounds = 0
        while True:
            alive = [s for s in self.states if s.g is not None and not s.g.dead]
            if not alive:
                break
            sends_before = self._send_counter
            for s in alive:
                if s.g is None or s.g.dead:
                    continue
                TLContext._set_active(tls[s.rank])  # type: ignore[attr-defined]
                s.g.switch()
                TLContext._set_active(None)  # type: ignore[attr-defined]
            # Only ranks that were alive at the start of this round count.
            # Checking every rank ever finished would reset the idle count
            # forever once the first rank completed, so a later deadlock
            # among the remaining ranks would never be reported.
            finished_this_round = any(s.g is None or s.g.dead for s in alive)
            if self._send_counter > sends_before or finished_this_round:
                idle_rounds = 0
            else:
                idle_rounds += 1
                if idle_rounds >= max_idle_rounds:
                    raise RuntimeError(
                        "mock CCL runtime: deadlock detected (no progress for "
                        f"{max_idle_rounds} rounds)"
                    )

        return [
            s.output if s.output is not None else s._hbm.get(s._slice_addr)
            for s in self.states
        ]
|
|
||||||
|
|
||||||
|
|
||||||
# ── Public entry ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def run_kernel_in_mock(
|
|
||||||
kernel_fn: Callable,
|
|
||||||
world_size: int,
|
|
||||||
topology: str,
|
|
||||||
inputs: list[np.ndarray],
|
|
||||||
kernel_args: tuple = (),
|
|
||||||
algo_module: Any | None = None,
|
|
||||||
pes_per_cube: int = 0,
|
|
||||||
) -> list[np.ndarray]:
|
|
||||||
"""Run a CCL kernel under the mock runtime with no SimPy/fabric.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
kernel_fn: ``kernel(t_ptr, *kernel_args, tl=...)``
|
|
||||||
world_size: number of ranks
|
|
||||||
topology: builtin topology name (e.g. "ring_1d")
|
|
||||||
inputs: per-rank input ndarrays. ``inputs[r]`` becomes rank r's
|
|
||||||
local tile at HBM address 0.
|
|
||||||
kernel_args: extra positional args after t_ptr
|
|
||||||
algo_module: optional module providing ``neighbors()`` override
|
|
||||||
pes_per_cube: PEs per cube for multi-cube program_id mapping.
|
|
||||||
0 → single-cube legacy (all ranks in one cube).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Per-rank output ndarrays — whatever the kernel wrote via tl.store
|
|
||||||
(or the original input if the kernel didn't store).
|
|
||||||
"""
|
|
||||||
if len(inputs) != world_size:
|
|
||||||
raise ValueError(f"len(inputs)={len(inputs)} != world_size={world_size}")
|
|
||||||
|
|
||||||
topo_fn = resolve_topology(topology, algo_module=algo_module)
|
|
||||||
states = [
|
|
||||||
_MockRankState(
|
|
||||||
rank=r, world_size=world_size,
|
|
||||||
neighbors=topo_fn(r, world_size),
|
|
||||||
input_arr=inputs[r],
|
|
||||||
pes_per_cube=pes_per_cube,
|
|
||||||
)
|
|
||||||
for r in range(world_size)
|
|
||||||
]
|
|
||||||
|
|
||||||
sched = _MockScheduler(states)
|
|
||||||
return sched.run(kernel_fn, kernel_args)
|
|
||||||
@@ -73,6 +73,39 @@ def tree_binary(rank: int, world_size: int) -> NeighborMap:
|
|||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
|
def torus_2d(rank: int, world_size: int) -> NeighborMap:
    """Neighbor map for a square 2D torus (N/S/E/W, wrapping on every edge).

    Pure alias: delegates to ``mesh_2d``, which already wraps. The separate
    name exists for clarity when used as a SIP-level topology.
    """
    neighbors = mesh_2d(rank, world_size)
    return neighbors
|
||||||
|
|
||||||
|
|
||||||
|
def mesh_2d_no_wrap(rank: int, world_size: int) -> NeighborMap:
    """Square 2D mesh (N/S/E/W) WITHOUT wrap-around.

    Ranks are laid out row-major on a ``side x side`` grid. Edge nodes have
    fewer neighbors because nothing wraps. Used for SIP-level topologies
    where physical links don't wrap.

    Raises:
        ValueError: ``world_size`` is not a positive perfect square.
    """
    import math

    # Exact integer square root. Non-positive sizes are rejected with the
    # same ValueError instead of failing later inside divmod/round.
    side = math.isqrt(world_size) if world_size > 0 else 0
    if side == 0 or side * side != world_size:
        raise ValueError(
            f"mesh_2d_no_wrap requires square world_size, got {world_size}"
        )

    r, c = divmod(rank, side)
    n: NeighborMap = {}
    if r > 0:
        n["N"] = (r - 1) * side + c
    if r < side - 1:
        n["S"] = (r + 1) * side + c
    if c > 0:
        n["W"] = r * side + (c - 1)
    if c < side - 1:
        n["E"] = r * side + (c + 1)
    return n
|
||||||
|
|
||||||
|
|
||||||
def none(rank: int, world_size: int) -> NeighborMap:
    """Return an empty neighbor map.

    Both arguments are ignored; the algorithm's own ``neighbors()`` is
    expected to build the map from scratch.
    """
    empty: NeighborMap = {}
    return empty
|
||||||
@@ -82,6 +115,8 @@ _BUILTIN: dict[str, TopologyFn] = {
|
|||||||
"ring_1d": ring_1d,
|
"ring_1d": ring_1d,
|
||||||
"ring_1d_unidir": ring_1d_unidir,
|
"ring_1d_unidir": ring_1d_unidir,
|
||||||
"mesh_2d": mesh_2d,
|
"mesh_2d": mesh_2d,
|
||||||
|
"torus_2d": torus_2d,
|
||||||
|
"mesh_2d_no_wrap": mesh_2d_no_wrap,
|
||||||
"tree_binary": tree_binary,
|
"tree_binary": tree_binary,
|
||||||
"none": none,
|
"none": none,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -196,10 +196,17 @@ class IpcqCreditMetadata:
|
|||||||
Sent by ``PeIpcqComponent._delayed_credit_send`` after a
|
Sent by ``PeIpcqComponent._delayed_credit_send`` after a
|
||||||
bottleneck-BW based latency, putting the metadata directly into
|
bottleneck-BW based latency, putting the metadata directly into
|
||||||
the peer's pre-wired credit store (no fabric routing).
|
the peer's pre-wired credit store (no fabric routing).
|
||||||
|
|
||||||
|
``dst_rx_base_pa`` is the receiver's ``my_rx_base_pa`` for the direction
|
||||||
|
whose slot was consumed. The original sender matches this against
|
||||||
|
``qp.peer.rx_base_pa`` to find the correct direction (ADR-0025 D3) —
|
||||||
|
unambiguous even when multiple directions share the same peer (e.g.
|
||||||
|
2-rank bidirectional ring).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
consumer_seq: int # my_tail at recv side (new tail value)
|
consumer_seq: int # my_tail at recv side (new tail value)
|
||||||
src_sip: int # which peer is sending the credit
|
dst_rx_base_pa: int # receiver-side my_rx_base_pa (ADR-0025 D3)
|
||||||
|
src_sip: int # which peer is sending the credit (diag)
|
||||||
src_cube: int
|
src_cube: int
|
||||||
src_pe: int
|
src_pe: int
|
||||||
src_direction: str # sender-side direction (peer maps to its own)
|
src_direction: str # sender-side direction (peer maps to its own)
|
||||||
|
|||||||
@@ -370,11 +370,21 @@ class PeIpcqComponent(ComponentBase):
|
|||||||
# ── Metadata arrival from PE_DMA (D9) ──
|
# ── Metadata arrival from PE_DMA (D9) ──
|
||||||
|
|
||||||
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
|
def _handle_meta_arrival(self, msg: IpcqMetaArrival) -> None:
|
||||||
|
"""Match arrival to the correct direction by dst_addr range (ADR-0025 D2).
|
||||||
|
|
||||||
|
Each direction has a unique rx buffer address range
|
||||||
|
([my_rx_base_pa, my_rx_base_pa + n_slots * slot_size)). The token's
|
||||||
|
dst_addr (set by the sender's IPCQ when computing the peer slot
|
||||||
|
address) falls within exactly one such range. Address-based matching
|
||||||
|
is unambiguous even when multiple directions share the same peer
|
||||||
|
(2-rank bidirectional ring).
|
||||||
|
"""
|
||||||
token = msg.token
|
token = msg.token
|
||||||
sender_key = (token.src_sip, token.src_cube, token.src_pe)
|
dst_addr = token.dst_addr
|
||||||
for d, qp in self._queue_pairs.items():
|
for d, qp in self._queue_pairs.items():
|
||||||
p = qp["peer"]
|
base = qp["my_rx_base_pa"]
|
||||||
if (p.sip, p.cube, p.pe) == sender_key:
|
size = qp["n_slots"] * qp["slot_size"]
|
||||||
|
if base <= dst_addr < base + size:
|
||||||
qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
|
qp["peer_head_cache"] = max(qp["peer_head_cache"], token.sender_seq + 1)
|
||||||
# Track arrived token for strict-mode peek
|
# Track arrived token for strict-mode peek
|
||||||
self._arrived_tokens.setdefault(d, []).append(token)
|
self._arrived_tokens.setdefault(d, []).append(token)
|
||||||
@@ -391,19 +401,22 @@ class PeIpcqComponent(ComponentBase):
|
|||||||
if not ev.triggered:
|
if not ev.triggered:
|
||||||
ev.succeed()
|
ev.succeed()
|
||||||
return
|
return
|
||||||
# Unknown sender — silently drop (could log)
|
# Unknown dst_addr — silently drop (could log)
|
||||||
|
|
||||||
# ── Credit return (fast path) ──
|
# ── Credit return (fast path) ──
|
||||||
|
|
||||||
def _credit_worker(self, env: simpy.Environment) -> Generator:
|
def _credit_worker(self, env: simpy.Environment) -> Generator:
|
||||||
"""Process IpcqCreditMetadata from credit_inbox."""
|
"""Process IpcqCreditMetadata from credit_inbox.
|
||||||
|
|
||||||
|
Matches credit to the correct direction by `credit.dst_rx_base_pa ==
|
||||||
|
qp.peer.rx_base_pa` (ADR-0025 D3). This is unambiguous even when
|
||||||
|
multiple directions share the same peer (2-rank bidirectional ring).
|
||||||
|
"""
|
||||||
assert self._credit_inbox is not None
|
assert self._credit_inbox is not None
|
||||||
while True:
|
while True:
|
||||||
credit: IpcqCreditMetadata = yield self._credit_inbox.get()
|
credit: IpcqCreditMetadata = yield self._credit_inbox.get()
|
||||||
sender_key = (credit.src_sip, credit.src_cube, credit.src_pe)
|
|
||||||
for d, qp in self._queue_pairs.items():
|
for d, qp in self._queue_pairs.items():
|
||||||
p = qp["peer"]
|
if qp["peer"].rx_base_pa == credit.dst_rx_base_pa:
|
||||||
if (p.sip, p.cube, p.pe) == sender_key:
|
|
||||||
qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq)
|
qp["peer_tail_cache"] = max(qp["peer_tail_cache"], credit.consumer_seq)
|
||||||
# Wake any blocked send on this direction
|
# Wake any blocked send on this direction
|
||||||
waiters = self._send_waiters.get(d, [])
|
waiters = self._send_waiters.get(d, [])
|
||||||
@@ -421,12 +434,19 @@ class PeIpcqComponent(ComponentBase):
|
|||||||
new_tail: int,
|
new_tail: int,
|
||||||
) -> Generator:
|
) -> Generator:
|
||||||
"""Wait bottleneck-BW latency, then put IpcqCreditMetadata into peer
|
"""Wait bottleneck-BW latency, then put IpcqCreditMetadata into peer
|
||||||
credit store (D9 fast path)."""
|
credit store (D9 fast path).
|
||||||
|
|
||||||
|
Carries ``dst_rx_base_pa`` = this PE's my_rx_base_pa for the
|
||||||
|
consumed direction. The peer (original sender) matches this against
|
||||||
|
qp.peer.rx_base_pa to identify the correct qp (ADR-0025 D3).
|
||||||
|
"""
|
||||||
latency_ns = self._credit_latency_ns(direction)
|
latency_ns = self._credit_latency_ns(direction)
|
||||||
if latency_ns > 0:
|
if latency_ns > 0:
|
||||||
yield env.timeout(latency_ns)
|
yield env.timeout(latency_ns)
|
||||||
|
qp = self._queue_pairs[direction]
|
||||||
meta = IpcqCreditMetadata(
|
meta = IpcqCreditMetadata(
|
||||||
consumer_seq=new_tail,
|
consumer_seq=new_tail,
|
||||||
|
dst_rx_base_pa=qp["my_rx_base_pa"],
|
||||||
src_sip=self._self_sip,
|
src_sip=self._self_sip,
|
||||||
src_cube=self._self_cube,
|
src_cube=self._self_cube,
|
||||||
src_pe=self._self_pe,
|
src_pe=self._self_pe,
|
||||||
|
|||||||
@@ -1,3 +1,14 @@
|
|||||||
|
"""Data-parallel placement policy (ADR-0026: intra-device only).
|
||||||
|
|
||||||
|
``DPPolicy`` describes how a tensor is sharded *within a single SIP* across
|
||||||
|
that SIP's cubes and PEs. Crossing the SIP boundary is not a DPPolicy
|
||||||
|
concern: ADR-0024's ``torch.ahbm.set_device(rank)`` picks the SIP, and
|
||||||
|
Megatron-style TP (ADR-0027) expresses multi-SIP tensors when needed.
|
||||||
|
|
||||||
|
``ShardSpec`` is expressed in structural ``(sip, cube, pe)`` coordinates.
|
||||||
|
The former flat ``pe_index`` field/property is fully removed — callers
|
||||||
|
needing a flat integer key compute it explicitly at the call site.
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -7,25 +18,58 @@ from typing import Literal
|
|||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class DPPolicy:
|
class DPPolicy:
|
||||||
"""Three-level data-parallel policy: sip-level + cube-level + pe-level.
|
"""Intra-device (cube × PE) data-parallel policy.
|
||||||
|
|
||||||
Policies:
|
SIP-level placement is controlled by ``torch.ahbm.set_device(rank)``
|
||||||
|
(ADR-0024). For tensors that must cross SIP boundaries, use
|
||||||
|
Megatron-style parallel layers (ADR-0027). DPPolicy itself never
|
||||||
|
crosses a SIP boundary.
|
||||||
|
|
||||||
|
Policies (per axis):
|
||||||
- "replicate": full copy at each unit
|
- "replicate": full copy at each unit
|
||||||
- "column_wise": split K (column) axis across units
|
- "column_wise": split K (column) axis across units
|
||||||
- "row_wise": split M (row) axis across units
|
- "row_wise": split M (row) axis across units
|
||||||
|
|
||||||
Optional overrides (default None = use topology dimensions):
|
Optional overrides (``None`` = use topology dimensions):
|
||||||
- num_pes: override PEs per cube (e.g., 1 for single-PE test)
|
- num_pes: override PEs per cube
|
||||||
- num_cubes: override cubes per SIP (e.g., 1 for single-cube test)
|
- num_cubes: override cubes per SIP
|
||||||
- num_sips: override SIP count
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sip: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
|
||||||
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
cube: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
num_pes: int | None = None
|
num_pes: int | None = None
|
||||||
num_cubes: int | None = None
|
num_cubes: int | None = None
|
||||||
num_sips: int | None = None
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ShardSpec:
    """Placement of one shard, addressed by structural ``(sip, cube, pe)`` coords (ADR-0026).

    There is no global-flat ``pe_index`` any more. Code that needs a flat
    integer key for a local purpose (e.g. an internal dict lookup) should
    compute it at the call site and keep it out of public APIs.
    """

    sip: int
    cube: int
    pe: int
    # Byte offset of this shard and its size in bytes.
    offset_bytes: int
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _LocalPeShard:
|
||||||
|
"""Internal — PE resolver's return type (ADR-0026 D3).
|
||||||
|
|
||||||
|
Holds a cube-local PE identifier (``local_pe``) plus the shard's
|
||||||
|
byte payload. Lifted into ``ShardSpec`` with full ``(sip, cube, pe)``
|
||||||
|
coordinates inside ``resolve_dp_policy``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
local_pe: int
|
||||||
|
offset_bytes: int
|
||||||
|
nbytes: int
|
||||||
|
|
||||||
|
|
||||||
def _split_shape(
|
def _split_shape(
|
||||||
@@ -52,14 +96,13 @@ def resolve_dp_policy(
|
|||||||
itemsize: int,
|
itemsize: int,
|
||||||
num_pe: int,
|
num_pe: int,
|
||||||
num_cubes: int = 1,
|
num_cubes: int = 1,
|
||||||
num_sips: int = 1,
|
target_sip: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[ShardSpec]:
|
||||||
"""Resolve a DPPolicy into a list[ShardSpec] with three-level resolution.
|
"""Resolve a DPPolicy into a list[ShardSpec] on a single SIP.
|
||||||
|
|
||||||
SIP-level → cube-level → pe-level.
|
Two-level resolution (cube × PE) within ``target_sip``. Each returned
|
||||||
num_cubes is cubes per SIP (not total).
|
``ShardSpec`` carries ``sip=target_sip`` and cube/pe local to the SIP.
|
||||||
ShardSpec.pe_index uses flat indexing:
|
No SIP-level split — DPPolicy is intra-device only (ADR-0026).
|
||||||
sip_id * num_cubes * num_pe + cube_id * num_pe + pe_id
|
|
||||||
"""
|
"""
|
||||||
_PE_RESOLVERS = {
|
_PE_RESOLVERS = {
|
||||||
"replicate": replicate,
|
"replicate": replicate,
|
||||||
@@ -70,84 +113,61 @@ def resolve_dp_policy(
|
|||||||
if resolver is None:
|
if resolver is None:
|
||||||
raise ValueError(f"Unknown pe-level policy: {policy.pe}")
|
raise ValueError(f"Unknown pe-level policy: {policy.pe}")
|
||||||
|
|
||||||
cubes_per_sip = num_cubes
|
|
||||||
all_shards: list[ShardSpec] = []
|
all_shards: list[ShardSpec] = []
|
||||||
|
|
||||||
# Level 1: SIP
|
# Level 1: cube within SIP
|
||||||
sip_splits = _split_shape(policy.sip, shape, num_sips, itemsize)
|
cube_splits = _split_shape(policy.cube, shape, num_cubes, itemsize)
|
||||||
|
|
||||||
for sip_id, (sip_shape, sip_offset) in enumerate(sip_splits):
|
for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
|
||||||
# Level 2: Cube within SIP
|
# Level 2: PE within cube — resolver returns _LocalPeShard
|
||||||
cube_splits = _split_shape(policy.cube, sip_shape, cubes_per_sip, itemsize)
|
local_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
|
||||||
|
|
||||||
for cube_id, (cube_shape, cube_offset) in enumerate(cube_splits):
|
for ls in local_shards:
|
||||||
# Level 3: PE within cube
|
all_shards.append(ShardSpec(
|
||||||
pe_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
|
sip=target_sip,
|
||||||
|
cube=cube_id,
|
||||||
for ps in pe_shards:
|
pe=ls.local_pe,
|
||||||
flat_idx = (
|
offset_bytes=cube_offset + ls.offset_bytes,
|
||||||
sip_id * cubes_per_sip * num_pe
|
nbytes=ls.nbytes,
|
||||||
+ cube_id * num_pe
|
))
|
||||||
+ ps.pe_index
|
|
||||||
)
|
|
||||||
all_shards.append(ShardSpec(
|
|
||||||
pe_index=flat_idx,
|
|
||||||
offset_bytes=sip_offset + cube_offset + ps.offset_bytes,
|
|
||||||
nbytes=ps.nbytes,
|
|
||||||
))
|
|
||||||
|
|
||||||
return all_shards
|
return all_shards
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class ShardSpec:
|
|
||||||
pe_index: int
|
|
||||||
offset_bytes: int
|
|
||||||
nbytes: int
|
|
||||||
|
|
||||||
|
|
||||||
def column_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[_LocalPeShard]:
    """Split the K axis into ``num_pe`` equal parts; each PE gets (M, K/P).

    The per-PE column count is ``K // num_pe``, and shard ``i`` starts at
    byte offset ``i`` times the per-PE byte size.
    """
    rows, cols = shape
    bytes_per_pe = rows * (cols // num_pe) * itemsize
    shards: list[_LocalPeShard] = []
    for pe in range(num_pe):
        shards.append(
            _LocalPeShard(
                local_pe=pe,
                offset_bytes=pe * bytes_per_pe,
                nbytes=bytes_per_pe,
            )
        )
    return shards
|
|
||||||
|
|
||||||
|
|
||||||
def row_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[_LocalPeShard]:
    """Split the M axis into ``num_pe`` equal parts; each PE gets (M/P, K).

    The per-PE row count is ``M // num_pe``, and shard ``i`` starts at byte
    offset ``i`` times the per-PE byte size.
    """
    rows, cols = shape
    bytes_per_pe = (rows // num_pe) * cols * itemsize
    shards: list[_LocalPeShard] = []
    for pe in range(num_pe):
        shards.append(
            _LocalPeShard(
                local_pe=pe,
                offset_bytes=pe * bytes_per_pe,
                nbytes=bytes_per_pe,
            )
        )
    return shards
|
|
||||||
|
|
||||||
|
|
||||||
def replicate(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[_LocalPeShard]:
    """Give every PE a full (M, K) copy.

    Each of the ``num_pe`` shards starts at byte offset 0 and spans the
    whole tensor.
    """
    rows, cols = shape
    total_bytes = rows * cols * itemsize
    shards: list[_LocalPeShard] = []
    for pe in range(num_pe):
        shards.append(_LocalPeShard(local_pe=pe, offset_bytes=0, nbytes=total_bytes))
    return shards
|
||||||
|
|
||||||
@@ -155,20 +175,20 @@ def replicate(
|
|||||||
def tiled_column_major(
|
def tiled_column_major(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
tile_m: int, tile_k: int,
|
tile_m: int, tile_k: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""2D tiling, column-major order (K axis first), round-robin across PEs."""
|
"""2D tiling, column-major order (K axis first), round-robin across PEs."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
tiles_m = ceil(M / tile_m)
|
tiles_m = ceil(M / tile_m)
|
||||||
tiles_k = ceil(K / tile_k)
|
tiles_k = ceil(K / tile_k)
|
||||||
tile_bytes = tile_m * tile_k * itemsize
|
tile_bytes = tile_m * tile_k * itemsize
|
||||||
row_bytes = K * itemsize
|
row_bytes = K * itemsize
|
||||||
shards = []
|
shards: list[_LocalPeShard] = []
|
||||||
idx = 0
|
idx = 0
|
||||||
for mi in range(tiles_m):
|
for mi in range(tiles_m):
|
||||||
for ki in range(tiles_k):
|
for ki in range(tiles_k):
|
||||||
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
||||||
shards.append(ShardSpec(
|
shards.append(_LocalPeShard(
|
||||||
pe_index=idx % num_pe,
|
local_pe=idx % num_pe,
|
||||||
offset_bytes=offset,
|
offset_bytes=offset,
|
||||||
nbytes=tile_bytes,
|
nbytes=tile_bytes,
|
||||||
))
|
))
|
||||||
@@ -179,20 +199,20 @@ def tiled_column_major(
|
|||||||
def tiled_row_major(
|
def tiled_row_major(
|
||||||
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
*, shape: tuple[int, int], itemsize: int, num_pe: int,
|
||||||
tile_m: int, tile_k: int,
|
tile_m: int, tile_k: int,
|
||||||
) -> list[ShardSpec]:
|
) -> list[_LocalPeShard]:
|
||||||
"""2D tiling, row-major order (M axis first), round-robin across PEs."""
|
"""2D tiling, row-major order (M axis first), round-robin across PEs."""
|
||||||
M, K = shape
|
M, K = shape
|
||||||
tiles_m = ceil(M / tile_m)
|
tiles_m = ceil(M / tile_m)
|
||||||
tiles_k = ceil(K / tile_k)
|
tiles_k = ceil(K / tile_k)
|
||||||
tile_bytes = tile_m * tile_k * itemsize
|
tile_bytes = tile_m * tile_k * itemsize
|
||||||
row_bytes = K * itemsize
|
row_bytes = K * itemsize
|
||||||
shards = []
|
shards: list[_LocalPeShard] = []
|
||||||
idx = 0
|
idx = 0
|
||||||
for ki in range(tiles_k):
|
for ki in range(tiles_k):
|
||||||
for mi in range(tiles_m):
|
for mi in range(tiles_m):
|
||||||
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize)
|
||||||
shards.append(ShardSpec(
|
shards.append(_LocalPeShard(
|
||||||
pe_index=idx % num_pe,
|
local_pe=idx % num_pe,
|
||||||
offset_bytes=offset,
|
offset_bytes=offset,
|
||||||
nbytes=tile_bytes,
|
nbytes=tile_bytes,
|
||||||
))
|
))
|
||||||
|
|||||||
@@ -42,6 +42,59 @@ def _numpy_to_dtype_str(np_dtype) -> str:
|
|||||||
raise ValueError(f"unsupported numpy dtype: {np_dtype!r}")
|
raise ValueError(f"unsupported numpy dtype: {np_dtype!r}")
|
||||||
|
|
||||||
|
|
||||||
|
# ADR-0027 D3: weak registry of the currently-active RuntimeContext so
|
||||||
|
# module-level helpers (e.g. ``kernbench.tp.parallel_state``) can resolve
|
||||||
|
# the ctx without threading it through every call.
|
||||||
|
import weakref as _weakref
|
||||||
|
|
||||||
|
_ACTIVE_CTX_REF: _weakref.ref | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_active_context():
    """Return the RuntimeContext held by the module's weak reference, if any.

    Yields ``None`` when no reference has been registered; otherwise
    returns whatever dereferencing the stored weak reference gives.
    """
    ref = _ACTIVE_CTX_REF
    return None if ref is None else ref()
|
||||||
|
|
||||||
|
|
||||||
|
class _AhbmNamespace:
    """torch.ahbm — per-greenlet SIP device binding (ADR-0024 D10).

    Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. KernBench's
    backend is 'ahbm' (not CUDA), so this namespace avoids pretending to be
    a CUDA runtime.
    """

    def __init__(self) -> None:
        import weakref

        # Keyed weakly by greenlet so a binding disappears together with
        # its greenlet; a plain dict would keep every finished rank
        # greenlet alive for as long as this namespace exists.
        self._device_by_greenlet: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()

    def set_device(self, device: int) -> None:
        """Bind the calling greenlet to ``device`` (coerced with ``int()``)."""
        from greenlet import getcurrent

        self._device_by_greenlet[getcurrent()] = int(device)

    def current_device(self) -> int | None:
        """Return the device bound to the calling greenlet, or None if unset."""
        from greenlet import getcurrent

        return self._device_by_greenlet.get(getcurrent())
|
||||||
|
|
||||||
|
|
||||||
|
class _AcceleratorNamespace:
    """torch.accelerator — backend-neutral spelling of the device binding.

    A thin forwarding layer over an ``_AhbmNamespace``; both spellings act
    on the same underlying binding, so bench code may use either:

        torch.ahbm.set_device(rank)               # explicit backend
        torch.accelerator.set_device_index(rank)  # portable
    """

    def __init__(self, ahbm: "_AhbmNamespace") -> None:
        # The wrapped namespace owns all state; nothing is stored here.
        self._ahbm = ahbm

    def set_device_index(self, device: int) -> None:
        """Forward to ``set_device`` on the wrapped namespace."""
        self._ahbm.set_device(device)

    def current_device_index(self) -> int | None:
        """Forward to ``current_device`` on the wrapped namespace."""
        return self._ahbm.current_device()
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RuntimeContext:
|
class RuntimeContext:
|
||||||
engine: SimEngine
|
engine: SimEngine
|
||||||
@@ -51,7 +104,11 @@ class RuntimeContext:
|
|||||||
|
|
||||||
_handles: list[RequestHandle] = field(default_factory=list, init=False)
|
_handles: list[RequestHandle] = field(default_factory=list, init=False)
|
||||||
_completed: set[RequestHandle] = field(default_factory=set, init=False)
|
_completed: set[RequestHandle] = field(default_factory=set, init=False)
|
||||||
_allocators: dict[int, Any] = field(default_factory=dict, init=False)
|
# ADR-0027 D0.1: worker-deferred wait queue. When a worker greenlet
|
||||||
|
# calls ctx.wait(h), the handle is appended here and control yields to
|
||||||
|
# main. Main's scheduler drain consumes this list.
|
||||||
|
_pending_worker_waits: list[RequestHandle] = field(default_factory=list, init=False)
|
||||||
|
_allocators: dict[tuple[int, int, int], Any] = field(default_factory=dict, init=False)
|
||||||
_va_allocator: Any = field(default=None, init=False)
|
_va_allocator: Any = field(default=None, init=False)
|
||||||
_tensor_counter: int = field(default=0, init=False)
|
_tensor_counter: int = field(default=0, init=False)
|
||||||
_traces: list[dict] = field(default_factory=list, init=False)
|
_traces: list[dict] = field(default_factory=list, init=False)
|
||||||
@@ -67,6 +124,13 @@ class RuntimeContext:
|
|||||||
dc = DistributedContext()
|
dc = DistributedContext()
|
||||||
dc._ctx_ref = self # back-reference for AhbmCCLBackend to reach ctx.launch etc.
|
dc._ctx_ref = self # back-reference for AhbmCCLBackend to reach ctx.launch etc.
|
||||||
self.distributed = dc
|
self.distributed = dc
|
||||||
|
# ADR-0024 D10: torch.ahbm (KernBench-native) + torch.accelerator
|
||||||
|
# (PyTorch 2.x portable) namespaces for per-greenlet device binding.
|
||||||
|
self.ahbm = _AhbmNamespace()
|
||||||
|
self.accelerator = _AcceleratorNamespace(self.ahbm)
|
||||||
|
# ADR-0027 D1.3: torch.multiprocessing.spawn namespace.
|
||||||
|
from kernbench.runtime_api.multiprocessing import _MultiprocessingNamespace
|
||||||
|
self.multiprocessing = _MultiprocessingNamespace(self)
|
||||||
|
|
||||||
def install_ipcq(
|
def install_ipcq(
|
||||||
self,
|
self,
|
||||||
@@ -118,10 +182,16 @@ class RuntimeContext:
|
|||||||
return plan
|
return plan
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
|
global _ACTIVE_CTX_REF
|
||||||
|
_ACTIVE_CTX_REF = _weakref.ref(self)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __exit__(self, *exc):
|
def __exit__(self, *exc):
|
||||||
|
global _ACTIVE_CTX_REF
|
||||||
self.cleanup()
|
self.cleanup()
|
||||||
|
# Clear active-context registry if we are it.
|
||||||
|
if _ACTIVE_CTX_REF is not None and _ACTIVE_CTX_REF() is self:
|
||||||
|
_ACTIVE_CTX_REF = None
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def submit(self, request: Any) -> RequestHandle:
|
def submit(self, request: Any) -> RequestHandle:
|
||||||
@@ -136,10 +206,24 @@ class RuntimeContext:
|
|||||||
return handle in self._completed
|
return handle in self._completed
|
||||||
|
|
||||||
def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
|
def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
|
||||||
|
# ADR-0027 D0.2: fast-path for already-completed handles (avoid
|
||||||
|
# redundant worker→main→worker round-trip).
|
||||||
if handle in self._completed:
|
if handle in self._completed:
|
||||||
completion, trace = self.engine.get_completion(handle)
|
completion, trace = self.engine.get_completion(handle)
|
||||||
return completion
|
return completion
|
||||||
|
|
||||||
|
# ADR-0027 D0.2: if called from a worker greenlet (parent is main,
|
||||||
|
# not dead), defer the wait to the main scheduler — enqueue and
|
||||||
|
# yield. Main drains env.run, then switches back. On resume the
|
||||||
|
# handle must be in _completed (D0.3 resume invariant).
|
||||||
|
from greenlet import getcurrent
|
||||||
|
g = getcurrent()
|
||||||
|
if g.parent is not None and not g.parent.dead:
|
||||||
|
self._pending_worker_waits.append(handle)
|
||||||
|
g.parent.switch()
|
||||||
|
# Resume: main drained. Fall through to completion/trace assembly.
|
||||||
|
|
||||||
|
# Main context (or single-driver): drive engine directly.
|
||||||
wait_fn = getattr(self.engine, "wait", None)
|
wait_fn = getattr(self.engine, "wait", None)
|
||||||
if wait_fn is not None:
|
if wait_fn is not None:
|
||||||
wait_fn(handle) # type: ignore[misc]
|
wait_fn(handle) # type: ignore[misc]
|
||||||
@@ -228,12 +312,7 @@ class RuntimeContext:
|
|||||||
# Return PA space
|
# Return PA space
|
||||||
if self._allocators:
|
if self._allocators:
|
||||||
for shard in handle.shards:
|
for shard in handle.shards:
|
||||||
flat_idx = (
|
alloc = self._allocators.get((shard.sip, shard.cube, shard.pe))
|
||||||
shard.sip * self._num_cubes * self._pes_per_cube
|
|
||||||
+ shard.cube * self._pes_per_cube
|
|
||||||
+ shard.pe
|
|
||||||
)
|
|
||||||
alloc = self._allocators.get(flat_idx)
|
|
||||||
if alloc is not None:
|
if alloc is not None:
|
||||||
from kernbench.policy.address.phyaddr import PhysAddr
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
alloc.free_hbm(PhysAddr.decode(shard.pa), shard.nbytes)
|
alloc.free_hbm(PhysAddr.decode(shard.pa), shard.nbytes)
|
||||||
@@ -297,17 +376,15 @@ class RuntimeContext:
|
|||||||
tcm_scheduler_reserved_bytes=4 * (1 << 20),
|
tcm_scheduler_reserved_bytes=4 * (1 << 20),
|
||||||
sram_bytes_per_cube=32 * (1 << 20),
|
sram_bytes_per_cube=32 * (1 << 20),
|
||||||
)
|
)
|
||||||
# Create allocators scoped to target SIP(s) only
|
# Create allocators scoped to target SIP(s) only.
|
||||||
# Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id
|
# ADR-0026 D5: dict key is the structural (sip, cube, pe) tuple.
|
||||||
self._pes_per_cube = pes_per_cube
|
self._pes_per_cube = pes_per_cube
|
||||||
self._num_cubes = cubes_per_sip
|
self._num_cubes = cubes_per_sip
|
||||||
self._num_sips = sip_count
|
self._num_sips = sip_count
|
||||||
cubes_x_pes = cubes_per_sip * pes_per_cube
|
|
||||||
for sip_id in sip_range:
|
for sip_id in sip_range:
|
||||||
for cube_id in range(cubes_per_sip):
|
for cube_id in range(cubes_per_sip):
|
||||||
for pe_id in range(pes_per_cube):
|
for pe_id in range(pes_per_cube):
|
||||||
flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id
|
self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator(
|
||||||
self._allocators[flat_idx] = PEMemAllocator(
|
|
||||||
rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
|
rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -394,16 +471,23 @@ class RuntimeContext:
|
|||||||
# DPPolicy overrides take precedence over topology dimensions
|
# DPPolicy overrides take precedence over topology dimensions
|
||||||
eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
|
eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
|
||||||
eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
|
eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
|
||||||
eff_num_sips = dp.num_sips if dp.num_sips is not None else self._num_sips
|
# ADR-0026 D4: resolve structural coords directly at resolve time.
|
||||||
|
# ``torch.ahbm.set_device(rank)`` (ADR-0024 D10) selects the target
|
||||||
|
# SIP; if unset, fall back to SIP 0 for single-driver compatibility.
|
||||||
|
current_sip = (
|
||||||
|
self.ahbm.current_device() if hasattr(self, "ahbm") else None
|
||||||
|
)
|
||||||
|
if current_sip is None:
|
||||||
|
current_sip = 0
|
||||||
placement = resolve_dp_policy(
|
placement = resolve_dp_policy(
|
||||||
dp, shape=shape_2d, itemsize=itemsize,
|
dp, shape=shape_2d, itemsize=itemsize,
|
||||||
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
||||||
num_sips=eff_num_sips,
|
target_sip=int(current_sip),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Infer target_pe from placement using local (within-cube) PE IDs.
|
# Infer target_pe from placement using local (within-cube) PE IDs.
|
||||||
# This ensures M_CPU only fans out to PEs that own shards, not all PEs.
|
# This ensures M_CPU only fans out to PEs that own shards, not all PEs.
|
||||||
local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement})
|
local_pe_ids = sorted({s.pe for s in placement})
|
||||||
if len(local_pe_ids) == 1:
|
if len(local_pe_ids) == 1:
|
||||||
target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
|
target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
|
||||||
elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
|
elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
|
||||||
@@ -501,6 +585,21 @@ class RuntimeContext:
|
|||||||
"sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
|
"sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
|
||||||
"nbytes": shard.nbytes,
|
"nbytes": shard.nbytes,
|
||||||
})
|
})
|
||||||
|
# ADR-0027: also populate MemoryStore at VA keys so kernels
|
||||||
|
# reading via VA (the common ``tl.load`` path) see the init
|
||||||
|
# data. Phase 1 MemoryWriteMsg writes via PA; kernels read via
|
||||||
|
# VA; Phase 2 DataExecutor reads via the addresses captured in
|
||||||
|
# op_log (VA for tl.load). Without this, zero-init tensors are
|
||||||
|
# invisible to kernels in Phase 2.
|
||||||
|
store = getattr(self.engine, "_memory_store", None)
|
||||||
|
if store is not None and pattern == "zero" and handle.va_base:
|
||||||
|
import numpy as np
|
||||||
|
from kernbench.runtime_api.tensor import _numpy_dtype
|
||||||
|
np_dtype = _numpy_dtype(dtype)
|
||||||
|
for shard in handle.shards:
|
||||||
|
count = shard.nbytes // itemsize
|
||||||
|
addr = handle.va_base + shard.offset_bytes
|
||||||
|
store.write("hbm", addr, np.zeros(count, dtype=np_dtype))
|
||||||
|
|
||||||
return t
|
return t
|
||||||
|
|
||||||
@@ -509,6 +608,7 @@ class RuntimeContext:
|
|||||||
kernel_name: str,
|
kernel_name: str,
|
||||||
kernel_fn: Any,
|
kernel_fn: Any,
|
||||||
*args: Any,
|
*args: Any,
|
||||||
|
_defer_wait: bool = False,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> RequestHandle:
|
) -> RequestHandle:
|
||||||
"""Register and launch a kernel (like a fused torch op).
|
"""Register and launch a kernel (like a fused torch op).
|
||||||
@@ -518,6 +618,11 @@ class RuntimeContext:
|
|||||||
|
|
||||||
Creates per-SIP KernelLaunchMsg with local va_base per tensor
|
Creates per-SIP KernelLaunchMsg with local va_base per tensor
|
||||||
(like host driver sending per-rank launch commands).
|
(like host driver sending per-rank launch commands).
|
||||||
|
|
||||||
|
When ``_defer_wait=True`` (ADR-0024 D7), returns the list of
|
||||||
|
``(handle, sip_id, meta)`` tuples instead of waiting. Caller is
|
||||||
|
responsible for waiting — used by collective ops to yield between
|
||||||
|
submit and wait so all sibling ranks can submit first.
|
||||||
"""
|
"""
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
@@ -593,11 +698,8 @@ class RuntimeContext:
|
|||||||
dp = t._dp_metadata.dp_policy if t._dp_metadata else None
|
dp = t._dp_metadata.dp_policy if t._dp_metadata else None
|
||||||
if dp is None:
|
if dp is None:
|
||||||
return t.shape
|
return t.shape
|
||||||
if dp.sip != "replicate":
|
# ADR-0026: DPPolicy no longer crosses SIP boundaries; cube + PE
|
||||||
if dp.sip == "column_wise":
|
# are the only axes that shrink the local shape.
|
||||||
K = K // self._num_sips
|
|
||||||
elif dp.sip == "row_wise":
|
|
||||||
M = M // self._num_sips
|
|
||||||
if dp.cube != "replicate":
|
if dp.cube != "replicate":
|
||||||
if dp.cube == "column_wise":
|
if dp.cube == "column_wise":
|
||||||
K = K // self._num_cubes
|
K = K // self._num_cubes
|
||||||
@@ -683,6 +785,18 @@ class RuntimeContext:
|
|||||||
_pending_handles.append((h, sip_id))
|
_pending_handles.append((h, sip_id))
|
||||||
last_handle = h
|
last_handle = h
|
||||||
|
|
||||||
|
if _defer_wait:
|
||||||
|
# ADR-0024 D7: return the pending-list so the caller can yield
|
||||||
|
# between submit and drain. Used by collective ops that need
|
||||||
|
# all sibling ranks to submit before any rank waits.
|
||||||
|
return [
|
||||||
|
(h, sip_id, {
|
||||||
|
"phase": "kernel", "name": kernel_name,
|
||||||
|
"sip": sip_id, "target_pe": target_pe,
|
||||||
|
})
|
||||||
|
for h, sip_id in _pending_handles
|
||||||
|
]
|
||||||
|
|
||||||
# Drain pending handles now that every SIP has a launch posted.
|
# Drain pending handles now that every SIP has a launch posted.
|
||||||
for h, sip_id in _pending_handles:
|
for h, sip_id in _pending_handles:
|
||||||
self.wait(h, _meta={
|
self.wait(h, _meta={
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ Host bench code uses only real-PyTorch names:
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
|
import math
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
@@ -40,20 +41,44 @@ class AhbmCCLBackend:
|
|||||||
self._merged = resolve_algorithm_config(self._cfg_all)
|
self._merged = resolve_algorithm_config(self._cfg_all)
|
||||||
self._algo_module = importlib.import_module(self._merged["module"])
|
self._algo_module = importlib.import_module(self._merged["module"])
|
||||||
self._world_size = self._resolve_world_size()
|
self._world_size = self._resolve_world_size()
|
||||||
|
self._pending_collective_handles: list = []
|
||||||
|
self._dist_ctx: Any = None
|
||||||
|
|
||||||
# Eager IPCQ install — ``init_process_group`` time. Mirrors NCCL
|
spec = self.ctx.spec or {}
|
||||||
# communicator creation: done once, reused across every subsequent
|
self._n_sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||||
# collective call on the same process group.
|
self._sip_topo = str(
|
||||||
self.ctx.install_ipcq(
|
spec.get("system", {}).get("sips", {}).get("topology", "ring_1d")
|
||||||
algorithm=self._merged["algorithm"],
|
|
||||||
world_size_override=self._world_size,
|
|
||||||
)
|
)
|
||||||
|
cm = spec.get("sip", {}).get("cube_mesh", {})
|
||||||
|
self._cube_w = int(cm.get("w", 4))
|
||||||
|
self._cube_h = int(cm.get("h", 4))
|
||||||
|
|
||||||
|
# Resolve SIP topology dims for the kernel
|
||||||
|
topo_map = getattr(self._algo_module, "TOPO_NAME_TO_KIND", None)
|
||||||
|
if topo_map is not None:
|
||||||
|
self._sip_topo_kind = topo_map.get(self._sip_topo, 0)
|
||||||
|
else:
|
||||||
|
self._sip_topo_kind = 0
|
||||||
|
if self._sip_topo == "ring_1d":
|
||||||
|
self._sip_topo_w, self._sip_topo_h = 0, 0
|
||||||
|
else:
|
||||||
|
side = int(round(math.sqrt(self._n_sips)))
|
||||||
|
self._sip_topo_w, self._sip_topo_h = side, side
|
||||||
|
|
||||||
|
# IPCQ install: wire all pe0s across all cubes and SIPs
|
||||||
|
engine = getattr(self.ctx, "engine", None)
|
||||||
|
if engine is not None:
|
||||||
|
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||||||
|
configure_sfr_intercube_multisip(engine, spec, self._merged)
|
||||||
|
|
||||||
def _resolve_world_size(self) -> int:
|
def _resolve_world_size(self) -> int:
|
||||||
"""Derive world_size (priority: algorithm override > defaults > topology).
|
"""Derive world_size (priority: algorithm override > defaults > topology).
|
||||||
|
|
||||||
Topology derivation:
|
ADR-0024 D1: topology fallback is SIP count. Each rank represents one
|
||||||
sips × cubes_per_sip × pes_per_cube
|
SIP (TP dimension). Intra-SIP parallelism is expressed via DPPolicy
|
||||||
|
inside each worker and is independent of world_size.
|
||||||
|
Explicit ``ccl.yaml`` override still respected — legacy "rank = flat
|
||||||
|
PE index" tests use this path.
|
||||||
"""
|
"""
|
||||||
if "world_size" in self._merged:
|
if "world_size" in self._merged:
|
||||||
return int(self._merged["world_size"])
|
return int(self._merged["world_size"])
|
||||||
@@ -61,14 +86,7 @@ class AhbmCCLBackend:
|
|||||||
if "world_size" in defaults:
|
if "world_size" in defaults:
|
||||||
return int(defaults["world_size"])
|
return int(defaults["world_size"])
|
||||||
spec = self.ctx.spec or {}
|
spec = self.ctx.spec or {}
|
||||||
sips = int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
return int(spec.get("system", {}).get("sips", {}).get("count", 1))
|
||||||
cm = spec.get("sip", {}).get("cube_mesh", {})
|
|
||||||
cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1))
|
|
||||||
pl = spec.get("cube", {}).get("pe_layout", {})
|
|
||||||
corners = pl.get("corners", [])
|
|
||||||
pe_per_corner = int(pl.get("pe_per_corner", 1))
|
|
||||||
pes_per_cube = pe_per_corner * max(len(corners), 1)
|
|
||||||
return sips * cubes_per_sip * pes_per_cube
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def world_size(self) -> int:
|
def world_size(self) -> int:
|
||||||
@@ -89,20 +107,48 @@ class AhbmCCLBackend:
|
|||||||
"with a DPPolicy first)"
|
"with a DPPolicy first)"
|
||||||
)
|
)
|
||||||
shards = tensor._handle.shards
|
shards = tensor._handle.shards
|
||||||
if len(shards) != self._world_size:
|
if not shards:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"all_reduce tensor has {len(shards)} shards but the "
|
f"all_reduce tensor '{tensor.name}' has no shards"
|
||||||
f"ahbm backend was installed with world_size="
|
|
||||||
f"{self._world_size}; adjust the tensor's DPPolicy or "
|
|
||||||
"restart the process group"
|
|
||||||
)
|
)
|
||||||
n_elem = shards[0].nbytes // tensor.itemsize
|
n_elem = shards[0].nbytes // tensor.itemsize
|
||||||
kernel_fn = self._algo_module.kernel
|
kernel_fn = self._algo_module.kernel
|
||||||
kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)
|
kernel_args = self._algo_module.kernel_args(self._world_size, n_elem)
|
||||||
self.ctx.launch(
|
|
||||||
self._merged["algorithm"], kernel_fn, tensor, *kernel_args,
|
# Resolve sip_rank from the current greenlet's bound rank
|
||||||
|
from greenlet import getcurrent as _gc
|
||||||
|
g = _gc()
|
||||||
|
dist_ctx = getattr(self, "_dist_ctx", None)
|
||||||
|
if dist_ctx is not None:
|
||||||
|
sip_rank = int(dist_ctx._rank_by_greenlet.get(g, 0))
|
||||||
|
else:
|
||||||
|
sip_rank = 0
|
||||||
|
|
||||||
|
extra_args = (
|
||||||
|
sip_rank,
|
||||||
|
self._sip_topo_kind,
|
||||||
|
self._sip_topo_w,
|
||||||
|
self._sip_topo_h,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
pending = self.ctx.launch(
|
||||||
|
self._merged["algorithm"], kernel_fn, tensor,
|
||||||
|
*kernel_args, *extra_args,
|
||||||
|
_defer_wait=True,
|
||||||
|
)
|
||||||
|
from greenlet import getcurrent
|
||||||
|
g = getcurrent()
|
||||||
|
if g.parent is not None and not g.parent.dead:
|
||||||
|
# Multi-greenlet mode: hand pending to the backend-level queue so
|
||||||
|
# the main scheduler drains. Worker just yields.
|
||||||
|
self._pending_collective_handles.extend(pending)
|
||||||
|
g.parent.switch()
|
||||||
|
# On resume, all pending handles have been drained by main.
|
||||||
|
else:
|
||||||
|
# Single-driver (no bench scheduler): drain inline.
|
||||||
|
for h, _sip_id, meta in pending:
|
||||||
|
self.ctx.wait(h, _meta=meta)
|
||||||
|
|
||||||
def barrier(self) -> None:
|
def barrier(self) -> None:
|
||||||
# Single-driver model → no cross-process sync needed. Keeping the
|
# Single-driver model → no cross-process sync needed. Keeping the
|
||||||
# method so ``dist.barrier()`` is callable (pytorch-compat surface).
|
# method so ``dist.barrier()`` is callable (pytorch-compat surface).
|
||||||
@@ -121,6 +167,11 @@ class DistributedContext:
|
|||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._backend: AhbmCCLBackend | None = None
|
self._backend: AhbmCCLBackend | None = None
|
||||||
|
# ADR-0024 D9: greenlet-local rank registry. Bench launcher calls
|
||||||
|
# _bind_rank(g, rank) when spawning workers; get_rank() resolves the
|
||||||
|
# current greenlet to its rank. Unbound greenlets fall back to 0 for
|
||||||
|
# single-driver test compat.
|
||||||
|
self._rank_by_greenlet: dict = {}
|
||||||
|
|
||||||
def init_process_group(
|
def init_process_group(
|
||||||
self,
|
self,
|
||||||
@@ -146,6 +197,7 @@ class DistributedContext:
|
|||||||
"DistributedContext not bound to a RuntimeContext"
|
"DistributedContext not bound to a RuntimeContext"
|
||||||
)
|
)
|
||||||
self._backend = AhbmCCLBackend(torch_ctx=ctx)
|
self._backend = AhbmCCLBackend(torch_ctx=ctx)
|
||||||
|
self._backend._dist_ctx = self
|
||||||
|
|
||||||
def is_initialized(self) -> bool:
|
def is_initialized(self) -> bool:
|
||||||
return self._backend is not None
|
return self._backend is not None
|
||||||
@@ -155,9 +207,20 @@ class DistributedContext:
|
|||||||
return self._backend.world_size
|
return self._backend.world_size
|
||||||
|
|
||||||
def get_rank(self) -> int:
|
def get_rank(self) -> int:
|
||||||
# Single-driver kernbench: there is only one host rank.
|
"""Return the rank bound to the current greenlet (default 0).
|
||||||
|
|
||||||
|
ADR-0024 D9: workers spawned by the bench launcher each get a rank
|
||||||
|
registered via ``_bind_rank``. Callers outside any bound greenlet
|
||||||
|
fall back to rank 0 for single-driver test compat.
|
||||||
|
"""
|
||||||
self._ensure_initialized()
|
self._ensure_initialized()
|
||||||
return 0
|
from greenlet import getcurrent
|
||||||
|
g = getcurrent()
|
||||||
|
return int(self._rank_by_greenlet.get(g, 0))
|
||||||
|
|
||||||
|
def _bind_rank(self, g: Any, rank: int) -> None:
|
||||||
|
"""Bind a greenlet to a rank so ``get_rank()`` returns it (ADR-0024 D9)."""
|
||||||
|
self._rank_by_greenlet[g] = int(rank)
|
||||||
|
|
||||||
def get_backend(self) -> str:
|
def get_backend(self) -> str:
|
||||||
self._ensure_initialized()
|
self._ensure_initialized()
|
||||||
|
|||||||
@@ -0,0 +1,152 @@
|
|||||||
|
"""``torch.multiprocessing.spawn``-compatible namespace (ADR-0027 D1).
|
||||||
|
|
||||||
|
Real-PyTorch API *signature* parity only — execution model is a cooperative
|
||||||
|
greenlet scheduler in a single Python process (D1.0). Non-goals: process
|
||||||
|
isolation, independent address space, failure isolation, OS-level scheduler
|
||||||
|
fairness, mp.Queue/Lock.
|
||||||
|
|
||||||
|
Attached to ``RuntimeContext`` as ``ctx.multiprocessing`` in
|
||||||
|
``__post_init__`` (D1.3).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
|
||||||
|
class SpawnException(RuntimeError):
    """Raised from ``_MultiprocessingNamespace.spawn`` on worker failure.

    ``errors`` contains only root-cause ranks — the rank(s) whose body
    raised. Sibling greenlets terminated via ``throw(SystemExit)`` during
    cleanup are NOT recorded (SystemExit does not satisfy ``except
    Exception`` in the entry wrapper).

    The message lists every failing rank in sorted order and quotes the
    first entry of ``errors`` (insertion order). ``errors`` may be empty
    when the failure did not originate in a worker body; the message then
    says so instead of reporting an empty rank list.
    """

    def __init__(self, errors: dict[int, Exception]):
        self.errors = errors
        if errors:
            rank, exc = next(iter(errors.items()))
            msg = (
                f"spawn failed on ranks {sorted(errors)}"
                f": rank {rank} raised {exc!r}"
            )
        else:
            # No worker recorded an error; avoid the misleading "ranks []".
            msg = (
                "spawn failed: no worker rank recorded an error "
                "(see __cause__ if set)"
            )
        super().__init__(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _drain_pending(ctx: Any) -> None:
    """Empty the worker-wait and collective queues from the main context.

    Keeps looping until ``ctx._pending_worker_waits`` and the backend's
    ``_pending_collective_handles`` (when a backend exposing that attribute
    exists) are both empty (D0.4/D0.5):

    * worker-wait handles are popped FIFO, waited through
      ``ctx.engine.wait`` when the engine has one and the handle is not
      already completed, then recorded in ``ctx._completed``;
    * collective entries ``(handle, sip_id, meta)`` are popped FIFO and
      handed to ``ctx.wait(handle, _meta=meta)``.

    Relies on the D0.5 invariant that ``ctx.wait`` called from the main
    context does not re-enqueue; the outer loop also covers the case where
    draining adds further entries.
    """
    dist = getattr(ctx, "distributed", None)
    backend = getattr(dist, "_backend", None) if dist else None

    def _collective_queue():
        # Looked up afresh on every use, as the queue attribute is optional.
        if backend is None:
            return None
        return getattr(backend, "_pending_collective_handles", None)

    while ctx._pending_worker_waits or _collective_queue():
        # (a) Worker-driven waits (D0.1), oldest first.
        while ctx._pending_worker_waits:
            handle = ctx._pending_worker_waits.pop(0)
            if handle not in ctx._completed:
                engine_wait = getattr(ctx.engine, "wait", None)
                if engine_wait is not None:
                    engine_wait(handle)
            # Mark completed so the fast path in ctx.wait short-circuits
            # when the worker resumes.
            ctx._completed.add(handle)

        # (b) Collective backend queue (ADR-0024 D7 + D0.4-(2)).
        queue = _collective_queue()
        if queue is not None:
            while queue:
                handle, _sip, meta = queue.pop(0)
                # Main context: ctx.wait drives the engine directly.
                ctx.wait(handle, _meta=meta)
|
||||||
|
|
||||||
|
|
||||||
|
class _MultiprocessingNamespace:
    """``torch.multiprocessing``-style facade owned by one RuntimeContext."""

    def __init__(self, ctx: Any) -> None:
        self._ctx = ctx

    def spawn(
        self,
        fn: Callable,
        args: tuple = (),
        nprocs: int = 1,
        join: bool = True,
    ) -> None:
        """Run ``fn(rank, *args)`` in ``nprocs`` cooperative worker greenlets.

        Mirrors the ``torch.multiprocessing.spawn`` signature (minus
        ``daemon``). Each worker is bound to its rank on
        ``ctx.distributed`` when that object offers ``_bind_rank``. The
        D0.4 round-robin scheduler then switches into every live worker in
        turn and drains the pending queues after each round, until all
        workers are dead.

        ``join`` is accepted for signature parity only; this method always
        runs every worker to completion before returning.

        Raises:
            SpawnException: when an ``Exception`` escapes the scheduler
                loop. Live siblings are aborted and pending queues cleared
                first; the original exception is chained as ``__cause__``.
        """
        from greenlet import greenlet

        ctx = self._ctx
        dist = ctx.distributed
        failures: dict[int, Exception] = {}

        def _make_entry(r: int) -> Callable[[], None]:
            def _entry() -> None:
                try:
                    fn(r, *args)
                except Exception as exc:
                    # Record the root cause, then let it reach the scheduler.
                    failures[r] = exc
                    raise

            return _entry

        can_bind = dist is not None and hasattr(dist, "_bind_rank")
        workers: list = []
        for rank in range(nprocs):
            worker = greenlet(_make_entry(rank))
            if can_bind:
                dist._bind_rank(worker, rank)
            workers.append(worker)

        try:
            runnable = [w for w in workers if not w.dead]
            while runnable:
                for w in runnable:
                    # A worker may have died earlier in this same round.
                    if not w.dead:
                        w.switch()
                _drain_pending(ctx)
                runnable = [w for w in workers if not w.dead]
        except Exception as outer:
            self._abort_workers(workers, dist, ctx)
            raise SpawnException(failures) from outer
        # join=True: all workers have already been run to completion above.

    @staticmethod
    def _abort_workers(workers: list, dist: Any, ctx: Any) -> None:
        """D0.4-(4) sibling cleanup: abort live greenlets, clear queued state."""
        for w in workers:
            if w.dead:
                continue
            try:
                w.throw(SystemExit)
            except BaseException:
                # SystemExit inherits BaseException; greenlet.throw re-raises
                # in the caller if the target doesn't catch it. Silent —
                # cleanup is already in progress.
                pass

        backend = getattr(dist, "_backend", None)
        if backend is not None:
            barrier = getattr(backend, "_barrier", None)
            if hasattr(barrier, "reset"):
                try:
                    barrier.reset()
                except Exception:
                    pass
            queued = getattr(backend, "_pending_collective_handles", None)
            if queued is not None:
                queued.clear()
        ctx._pending_worker_waits.clear()
|
||||||
@@ -66,13 +66,64 @@ def _numpy_dtype(dtype: str) -> np.dtype:
|
|||||||
return np.dtype(_NUMPY_DTYPE.get(dtype, np.float16))
|
return np.dtype(_NUMPY_DTYPE.get(dtype, np.float16))
|
||||||
|
|
||||||
|
|
||||||
|
# ADR-0027 T5.g: closed-set registry of host-read barrier entry-points.
|
||||||
|
# Any new Tensor API with host-observable read semantics must be added here
|
||||||
|
# AND implement the barrier call. Code review + this registry keep the set
|
||||||
|
# consistent (Python introspection-based auto-detection is a non-goal).
|
||||||
|
# Note on ``copy_``: the source read is barriered via ``source.numpy()``.
|
||||||
|
# A target-side write barrier was specified in an earlier revision of
|
||||||
|
# ADR-0027 D0.5 but is intentionally not applied (global-pending target
|
||||||
|
# barrier can prematurely drain cross-rank collectives → deadlock).
|
||||||
|
_HOST_READ_BARRIERS: frozenset[str] = frozenset({
|
||||||
|
"numpy",
|
||||||
|
"data",
|
||||||
|
"__getitem__",
|
||||||
|
"__repr__",
|
||||||
|
"copy_", # source-side via source.numpy(); target-side not barriered
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _host_read_barrier(tensor: "Tensor") -> None:
    """ADR-0027 D0.5: yield to the parent greenlet before a host-observable
    read if worker waits are still queued.

    Switches to the calling greenlet's parent exactly once, and only when
    all of the following hold:

    * ``tensor._ctx_ref`` is set and resolves to a context;
    * that context's ``_pending_worker_waits`` is non-empty;
    * the caller has a parent greenlet that is not dead (i.e. it is a
      worker rather than the main context).

    Otherwise it returns immediately without switching. The backend's
    ``_pending_collective_handles`` queue is deliberately not consulted:
    collective handles are cross-rank protocol that must only be drained at
    scheduler synchronisation points, and a collective already yields on
    its own, so checking it here could request a premature drain and
    deadlock.
    """
    ctx_ref = tensor._ctx_ref
    ctx = ctx_ref() if ctx_ref is not None else None
    if ctx is None:
        return

    if not getattr(ctx, "_pending_worker_waits", None):
        return  # nothing queued: skip the context switch

    from greenlet import getcurrent

    parent = getcurrent().parent
    if parent is not None and not parent.dead:
        parent.switch()
    # else: main context — the caller drains directly when needed
|
||||||
|
|
||||||
|
|
||||||
def deploy_tensor(
|
def deploy_tensor(
|
||||||
*,
|
*,
|
||||||
name: str,
|
name: str,
|
||||||
shape: tuple[int, ...],
|
shape: tuple[int, ...],
|
||||||
dtype: str,
|
dtype: str,
|
||||||
placement: list[ShardSpec],
|
placement: list[ShardSpec],
|
||||||
allocators: dict[int, PEMemAllocator],
|
allocators: dict[tuple[int, int, int], PEMemAllocator],
|
||||||
mem_kind: Literal["hbm", "tcm"] = "hbm",
|
mem_kind: Literal["hbm", "tcm"] = "hbm",
|
||||||
va_allocator=None,
|
va_allocator=None,
|
||||||
) -> TensorHandle:
|
) -> TensorHandle:
|
||||||
@@ -86,15 +137,15 @@ def deploy_tensor(
|
|||||||
|
|
||||||
shards: list[TensorShard] = []
|
shards: list[TensorShard] = []
|
||||||
for spec in placement:
|
for spec in placement:
|
||||||
alloc = allocators[spec.pe_index]
|
alloc = allocators[(spec.sip, spec.cube, spec.pe)]
|
||||||
if mem_kind == "hbm":
|
if mem_kind == "hbm":
|
||||||
pa = alloc.alloc_hbm(spec.nbytes)
|
pa = alloc.alloc_hbm(spec.nbytes)
|
||||||
else:
|
else:
|
||||||
pa = alloc.alloc_tcm(spec.nbytes)
|
pa = alloc.alloc_tcm(spec.nbytes)
|
||||||
shards.append(TensorShard(
|
shards.append(TensorShard(
|
||||||
sip=alloc._sip_id,
|
sip=spec.sip,
|
||||||
cube=alloc._cube_id,
|
cube=spec.cube,
|
||||||
pe=alloc._pe_id,
|
pe=spec.pe,
|
||||||
pa=pa.encode(),
|
pa=pa.encode(),
|
||||||
nbytes=spec.nbytes,
|
nbytes=spec.nbytes,
|
||||||
offset_bytes=spec.offset_bytes,
|
offset_bytes=spec.offset_bytes,
|
||||||
@@ -217,7 +268,9 @@ class Tensor:
|
|||||||
"""Read a shard-aligned slice. Returns a numpy array.
|
"""Read a shard-aligned slice. Returns a numpy array.
|
||||||
|
|
||||||
Mirrors ``torch.Tensor.__getitem__`` for the shard-aligned case.
|
Mirrors ``torch.Tensor.__getitem__`` for the shard-aligned case.
|
||||||
|
ADR-0027 D0.5: host-read barrier.
|
||||||
"""
|
"""
|
||||||
|
_host_read_barrier(self)
|
||||||
start, stop = self._resolve_shard_index(key)
|
start, stop = self._resolve_shard_index(key)
|
||||||
shard = self._shard_for_range(start, stop)
|
shard = self._shard_for_range(start, stop)
|
||||||
if self._memory_store is None:
|
if self._memory_store is None:
|
||||||
@@ -272,6 +325,8 @@ class Tensor:
|
|||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
parts = [f"tensor(name={self.name}, shape={self.shape}, dtype={self.dtype}"]
|
parts = [f"tensor(name={self.name}, shape={self.shape}, dtype={self.dtype}"]
|
||||||
if self._memory_store is not None and self._handle is not None:
|
if self._memory_store is not None and self._handle is not None:
|
||||||
|
# ADR-0027 D0.5: barrier on data-containing repr path.
|
||||||
|
_host_read_barrier(self)
|
||||||
arr = self.data
|
arr = self.data
|
||||||
parts.append(f", mean={float(arr.mean()):.4g}, norm={float(np.linalg.norm(arr)):.4g}")
|
parts.append(f", mean={float(arr.mean()):.4g}, norm={float(np.linalg.norm(arr)):.4g}")
|
||||||
else:
|
else:
|
||||||
@@ -308,7 +363,11 @@ class Tensor:
|
|||||||
Mirrors ``torch.Tensor.numpy()``. In kernbench, sharded tensors are
|
Mirrors ``torch.Tensor.numpy()``. In kernbench, sharded tensors are
|
||||||
gathered into a single full-shape ndarray according to each shard's
|
gathered into a single full-shape ndarray according to each shard's
|
||||||
``offset_bytes`` / ``nbytes`` range.
|
``offset_bytes`` / ``nbytes`` range.
|
||||||
|
|
||||||
|
ADR-0027 D0.5: acts as a host-read barrier — yields to the scheduler when
|
||||||
|
worker waits are pending; collective handles are deliberately excluded.
|
||||||
"""
|
"""
|
||||||
|
_host_read_barrier(self)
|
||||||
np_dtype = _numpy_dtype(self.dtype)
|
np_dtype = _numpy_dtype(self.dtype)
|
||||||
# Host-side tensor (created via torch.from_numpy) has no shards.
|
# Host-side tensor (created via torch.from_numpy) has no shards.
|
||||||
if self._host_buffer is not None:
|
if self._host_buffer is not None:
|
||||||
@@ -340,6 +399,12 @@ class Tensor:
|
|||||||
re-scattered into self's shard layout.
|
re-scattered into self's shard layout.
|
||||||
|
|
||||||
Shapes must match. Returns self.
|
Shapes must match. Returns self.
|
||||||
|
|
||||||
|
ADR-0027 D0.5: source-side read barrier is triggered inside
|
||||||
|
``source.numpy()``. Target-side write barrier is not applied here
|
||||||
|
because it would require cross-rank coordination when other ranks
|
||||||
|
have pending collectives (see _host_read_barrier docstring on
|
||||||
|
collective pending being cross-rank).
|
||||||
"""
|
"""
|
||||||
if self._handle is None or self._memory_store is None:
|
if self._handle is None or self._memory_store is None:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@@ -394,7 +459,8 @@ class Tensor:
|
|||||||
) -> Tensor:
|
) -> Tensor:
|
||||||
"""Set DP placement metadata (like torch.Tensor.to())."""
|
"""Set DP placement metadata (like torch.Tensor.to())."""
|
||||||
if placement is None:
|
if placement is None:
|
||||||
placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)]
|
placement = [ShardSpec(sip=0, cube=0, pe=0,
|
||||||
|
offset_bytes=0, nbytes=self.nbytes)]
|
||||||
self._dp_metadata = DPMetadata(
|
self._dp_metadata = DPMetadata(
|
||||||
placement=placement, dp_policy=dp_policy,
|
placement=placement, dp_policy=dp_policy,
|
||||||
sip=sip, cube=cube, target_pe=target_pe,
|
sip=sip, cube=cube, target_pe=target_pe,
|
||||||
|
|||||||
@@ -101,12 +101,19 @@ class DataExecutor:
|
|||||||
p = op.params
|
p = op.params
|
||||||
if "src_a_addr" not in p:
|
if "src_a_addr" not in p:
|
||||||
return # composite record without full params
|
return # composite record without full params
|
||||||
space = p.get("addr_space", "tcm")
|
default_space = p.get("addr_space", "tcm")
|
||||||
|
# ADR-0027: per-operand + output spaces (fall back to single space
|
||||||
|
# for legacy records without explicit space keys).
|
||||||
|
src_a_space = p.get("src_a_space", default_space)
|
||||||
|
src_b_space = p.get("src_b_space", default_space)
|
||||||
|
dst_space = p.get("dst_space", default_space)
|
||||||
dtype_in = p.get("dtype_in", "f16")
|
dtype_in = p.get("dtype_in", "f16")
|
||||||
dtype_out = p.get("dtype_out", dtype_in)
|
dtype_out = p.get("dtype_out", dtype_in)
|
||||||
|
|
||||||
a = self.store.read(space, p["src_a_addr"], shape=p.get("shape_a"), dtype=dtype_in)
|
a = self.store.read(src_a_space, p["src_a_addr"],
|
||||||
b = self.store.read(space, p["src_b_addr"], shape=p.get("shape_b"), dtype=dtype_in)
|
shape=p.get("shape_a"), dtype=dtype_in)
|
||||||
|
b = self.store.read(src_b_space, p["src_b_addr"],
|
||||||
|
shape=p.get("shape_b"), dtype=dtype_in)
|
||||||
|
|
||||||
# Compute in higher precision if specified
|
# Compute in higher precision if specified
|
||||||
dtype_acc = p.get("dtype_acc", "f32")
|
dtype_acc = p.get("dtype_acc", "f32")
|
||||||
@@ -114,7 +121,7 @@ class DataExecutor:
|
|||||||
b_f = b.astype(_resolve_dtype(dtype_acc))
|
b_f = b.astype(_resolve_dtype(dtype_acc))
|
||||||
result = np.matmul(a_f, b_f).astype(_resolve_dtype(dtype_out))
|
result = np.matmul(a_f, b_f).astype(_resolve_dtype(dtype_out))
|
||||||
|
|
||||||
self.store.write(space, p["dst_addr"], result)
|
self.store.write(dst_space, p["dst_addr"], result)
|
||||||
|
|
||||||
def _execute_math(self, op: OpRecord) -> None:
|
def _execute_math(self, op: OpRecord) -> None:
|
||||||
"""Execute math op: unary, binary, or reduction."""
|
"""Execute math op: unary, binary, or reduction."""
|
||||||
|
|||||||
@@ -79,16 +79,24 @@ class OpLogger:
|
|||||||
snaps.append(None)
|
snaps.append(None)
|
||||||
params["input_snapshots"] = snaps
|
params["input_snapshots"] = snaps
|
||||||
elif op_name == "dma_write":
|
elif op_name == "dma_write":
|
||||||
try:
|
# ADR-0027 fix: only snapshot HBM sources. TCM (PE scratch)
|
||||||
arr = self._memory_store.read(
|
# sources are repopulated by Phase 2 math/gemm replay —
|
||||||
params["src_space"], params["src_addr"],
|
# capturing a Phase-1-time snapshot here would pick up stale
|
||||||
shape=params.get("shape"), dtype=params.get("dtype"),
|
# data from a PRIOR kernel's Phase 2 output that aliased the
|
||||||
)
|
# same scratch address, causing the later kernel's replay
|
||||||
params["snapshot"] = (
|
# to write that stale value instead of the fresh math
|
||||||
arr.copy() if hasattr(arr, "copy") else arr
|
# result. See ADR-0027 postmortem (TP gemm → all_reduce).
|
||||||
)
|
if params.get("src_space") == "hbm":
|
||||||
except Exception:
|
try:
|
||||||
params["snapshot"] = None
|
arr = self._memory_store.read(
|
||||||
|
params["src_space"], params["src_addr"],
|
||||||
|
shape=params.get("shape"), dtype=params.get("dtype"),
|
||||||
|
)
|
||||||
|
params["snapshot"] = (
|
||||||
|
arr.copy() if hasattr(arr, "copy") else arr
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
params["snapshot"] = None
|
||||||
self._records.append(OpRecord(
|
self._records.append(OpRecord(
|
||||||
t_start=pending["t_start"],
|
t_start=pending["t_start"],
|
||||||
t_end=t,
|
t_end=t,
|
||||||
@@ -167,6 +175,13 @@ def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
|
|||||||
"dtype_in": msg.a.dtype,
|
"dtype_in": msg.a.dtype,
|
||||||
"dtype_out": msg.out.dtype,
|
"dtype_out": msg.out.dtype,
|
||||||
"m": msg.m, "k": msg.k, "n": msg.n,
|
"m": msg.m, "k": msg.k, "n": msg.n,
|
||||||
|
# ADR-0027: preserve per-operand + output MemoryStore spaces so
|
||||||
|
# Phase 2 replay can resolve HBM-resident operands (e.g. tl.load
|
||||||
|
# results keep space="hbm"). Absent → DataExecutor falls back
|
||||||
|
# to the legacy single-space mode via ``addr_space``.
|
||||||
|
"src_a_space": getattr(msg.a, "space", "tcm"),
|
||||||
|
"src_b_space": getattr(msg.b, "space", "tcm"),
|
||||||
|
"dst_space": getattr(msg.out, "space", "tcm"),
|
||||||
}
|
}
|
||||||
if isinstance(msg, MathCmd):
|
if isinstance(msg, MathCmd):
|
||||||
return "math", msg.op, {
|
return "math", msg.op, {
|
||||||
@@ -181,10 +196,27 @@ def _extract_op_info(msg: Any) -> tuple[str, str, dict[str, Any]]:
|
|||||||
"axis": msg.axis,
|
"axis": msg.axis,
|
||||||
}
|
}
|
||||||
if isinstance(msg, CompositeCmd):
|
if isinstance(msg, CompositeCmd):
|
||||||
return "gemm" if msg.op == "gemm" else "math", f"composite_{msg.op}", {
|
params: dict[str, Any] = {
|
||||||
"op": msg.op,
|
"op": msg.op,
|
||||||
"out_addr": msg.out_addr,
|
"out_addr": msg.out_addr,
|
||||||
"out_nbytes": msg.out_nbytes,
|
"out_nbytes": msg.out_nbytes,
|
||||||
}
|
}
|
||||||
|
# ADR-0027: preserve operand info so Phase 2 DataExecutor can replay
|
||||||
|
# the composite's numerical effect (treat it like a GemmCmd).
|
||||||
|
if msg.op == "gemm" and msg.a is not None and msg.b is not None:
|
||||||
|
params.update({
|
||||||
|
"src_a_addr": msg.a.addr,
|
||||||
|
"src_b_addr": msg.b.addr,
|
||||||
|
"shape_a": msg.a.shape,
|
||||||
|
"shape_b": msg.b.shape,
|
||||||
|
"dtype_in": msg.a.dtype,
|
||||||
|
"dtype_out": msg.a.dtype,
|
||||||
|
"src_a_space": getattr(msg.a, "space", "hbm"),
|
||||||
|
"src_b_space": getattr(msg.b, "space", "hbm"),
|
||||||
|
"dst_space": "hbm",
|
||||||
|
# dst_addr alias so DataExecutor._execute_gemm picks it up.
|
||||||
|
"dst_addr": msg.out_addr,
|
||||||
|
})
|
||||||
|
return "gemm" if msg.op == "gemm" else "math", f"composite_{msg.op}", params
|
||||||
# Fallback for unknown data_op messages
|
# Fallback for unknown data_op messages
|
||||||
return "unknown", type(msg).__name__, {}
|
return "unknown", type(msg).__name__, {}
|
||||||
|
|||||||
@@ -0,0 +1,21 @@
|
|||||||
|
"""kernbench.tp — Megatron-style Tensor Parallelism (ADR-0027).
|
||||||
|
|
||||||
|
Public API re-exports.
|
||||||
|
"""
|
||||||
|
from kernbench.tp.layers import (
|
||||||
|
ColumnParallelLinear,
|
||||||
|
RowParallelLinear,
|
||||||
|
)
|
||||||
|
from kernbench.tp.parallel_state import (
|
||||||
|
get_tensor_model_parallel_rank,
|
||||||
|
get_tensor_model_parallel_world_size,
|
||||||
|
initialize_model_parallel,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ColumnParallelLinear",
|
||||||
|
"RowParallelLinear",
|
||||||
|
"get_tensor_model_parallel_rank",
|
||||||
|
"get_tensor_model_parallel_world_size",
|
||||||
|
"initialize_model_parallel",
|
||||||
|
]
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
"""Kernel used by ``kernbench.tp`` layers (ADR-0027 D4/D5).
|
||||||
|
|
||||||
|
Intentionally self-contained inside the ``tp`` package — the ``tp`` package
|
||||||
|
must not import from ``benches/``. Future work: move to a shared
|
||||||
|
``kernbench.kernels`` module so benches and TP can share.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE: str = "f16") -> None:
    """Single-PE GEMM: ``out = a @ b`` via load → dot → store.

    Uses the ``tl.load`` + ``tl.dot`` + ``tl.store`` path. Unlike
    ``tl.composite`` (which is absorbed by the PE scheduler into TileTokens
    that don't reach the op_log), this path emits explicit ``DmaReadCmd`` /
    ``GemmCmd`` / ``DmaWriteCmd`` records, which DataExecutor replays
    numerically in Phase 2.

    Args:
        a_ptr: Address of the left operand; loaded as an ``(M, K)`` tile.
        b_ptr: Address of the right operand; loaded as a ``(K, N)`` tile.
        out_ptr: Address the product is stored to.
        M: Row count of ``a`` (coerced with ``int()``).
        K: Shared inner dimension (coerced with ``int()``).
        N: Column count of ``b`` (coerced with ``int()``).
        tl: Kernel-language handle providing ``load`` / ``dot`` / ``store``.
        DTYPE: Element dtype passed to both loads.
    """
    # Launch arguments may arrive as non-int scalars; normalise once.
    M, K, N = int(M), int(K), int(N)
    a = tl.load(int(a_ptr), shape=(M, K), dtype=DTYPE)
    b = tl.load(int(b_ptr), shape=(K, N), dtype=DTYPE)
    out = tl.dot(a, b)
    tl.store(int(out_ptr), out)
|
||||||
@@ -0,0 +1,150 @@
|
|||||||
|
"""Megatron-style parallel layers (ADR-0027 D4/D5).
|
||||||
|
|
||||||
|
- ``ColumnParallelLinear``: weight's out_features axis split across TP ranks.
|
||||||
|
forward(x) is local gemm; no collective.
|
||||||
|
- ``RowParallelLinear``: weight's in_features axis split across TP ranks.
|
||||||
|
forward(x) ends with ``dist.all_reduce`` to sum partial products.
|
||||||
|
|
||||||
|
Both layers use the intra-device ``DPPolicy`` (ADR-0026). TP shard
|
||||||
|
ownership is determined by ``torch.ahbm.set_device(rank)`` (ADR-0024 D10).
|
||||||
|
|
||||||
|
Yield-safety contract (ADR-0027 D4/D5): every forward path contains at
|
||||||
|
least one ``ctx.wait`` (via ``torch.launch``) or one collective; this
|
||||||
|
keeps the scheduler loop making progress.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
from kernbench.tp.kernels import _gemm_kernel
|
||||||
|
from kernbench.tp.parallel_state import (
|
||||||
|
get_tensor_model_parallel_world_size,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ColumnParallelLinear:
    """Linear layer whose weight is split along ``out_features`` across TP ranks.

    With ``ws`` the TP world size, each rank holds one column slice::

        x:   (M, in_features)                 — replicated across ranks
        W_k: (in_features, out_features / ws) — this rank's slice
        y_k = x @ W_k → (M, out_features / ws)

    ``forward`` launches a local GEMM only; it issues no collective.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype: str = "f16",
        torch: Any = None,
    ) -> None:
        """Validate the split and allocate this rank's weight slice.

        Args:
            in_features: Size of the input feature axis (not split).
            out_features: Size of the output feature axis (split across ranks).
            bias: Must be ``False``; bias support is deferred.
            dtype: Element dtype of the weight.
            torch: Runtime context used for allocation and kernel launch.

        Raises:
            TypeError: ``torch`` was not supplied.
            ValueError: ``out_features`` is not divisible by the TP world size.
            NotImplementedError: ``bias=True`` (ADR-0027 D9 initial scope).
        """
        if torch is None:
            raise TypeError("ColumnParallelLinear requires torch=<RuntimeContext>")
        ws = get_tensor_model_parallel_world_size()
        if out_features % ws != 0:
            raise ValueError(
                f"out_features ({out_features}) must be divisible by TP world "
                f"size ({ws})"
            )
        # Reject unsupported options before allocating anything on the device.
        if bias:
            raise NotImplementedError(
                "bias=True is deferred (ADR-0027 D9 initial scope)"
            )
        self.in_features = in_features
        self.out_features = out_features
        self.k_local = out_features // ws
        self.dtype = dtype
        self._torch = torch
        # Bias omitted in initial scope (ADR-0027 D9).
        self.bias = None
        # Per-rank weight slice. Which SIP it lands on is expected to follow
        # ``set_device(rank)`` (ADR-0024 D10); the intra-SIP layout comes from
        # the DPPolicy below (ADR-0026).
        self.weight = torch.zeros(
            (in_features, self.k_local),
            dtype=dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="col_parallel_w",
        )

    def forward(self, x):
        """Return ``x @ W_k`` as a freshly allocated ``(M, k_local)`` tensor.

        ``M`` is taken from ``x.shape[0]`` and the output dtype from
        ``x.dtype``. The GEMM is issued through ``torch.launch``.
        """
        M = int(x.shape[0])
        out = self._torch.empty(
            (M, self.k_local),
            dtype=x.dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="col_parallel_out",
        )
        self._torch.launch(
            "col_parallel_gemm",
            _gemm_kernel,
            x, self.weight, out,
            M, self.in_features, self.k_local,
        )
        return out
|
||||||
|
|
||||||
|
|
||||||
|
class RowParallelLinear:
    """Linear layer whose weight is split along ``in_features`` across TP ranks.

    With ``ws`` the TP world size::

        x:   (M, in_features / ws)            — rank-local slice
        W_k: (in_features / ws, out_features) — this rank's slice
        y_k = x @ W_k → (M, out_features)     — partial sum
        y   = all_reduce(y_k, op="sum")       — (M, out_features) on every rank

    The rank-local input is typically a ``ColumnParallelLinear`` output.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        dtype: str = "f16",
        torch: Any = None,
    ) -> None:
        """Validate the split and allocate this rank's weight slice.

        Args:
            in_features: Size of the input feature axis (split across ranks).
            out_features: Size of the output feature axis (not split).
            bias: Must be ``False``; bias support is deferred.
            dtype: Element dtype of the weight.
            torch: Runtime context used for allocation, launch and collectives.

        Raises:
            TypeError: ``torch`` was not supplied.
            ValueError: ``in_features`` is not divisible by the TP world size.
            NotImplementedError: ``bias=True`` (ADR-0027 D9 initial scope).
        """
        if torch is None:
            raise TypeError("RowParallelLinear requires torch=<RuntimeContext>")
        ws = get_tensor_model_parallel_world_size()
        if in_features % ws != 0:
            raise ValueError(
                f"in_features ({in_features}) must be divisible by TP world "
                f"size ({ws})"
            )
        # Reject unsupported options before allocating anything on the device.
        if bias:
            raise NotImplementedError(
                "bias=True is deferred (ADR-0027 D9 initial scope)"
            )
        self.in_features = in_features
        self.out_features = out_features
        self.n_local = in_features // ws
        self.dtype = dtype
        self._torch = torch
        self.bias = None
        self.weight = torch.zeros(
            (self.n_local, out_features),
            dtype=dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="row_parallel_w",
        )

    def forward(self, x):
        """Compute the local partial product, then sum it across ranks.

        ``M`` is taken from ``x.shape[0]`` and the output dtype from
        ``x.dtype``. The tensor handed to ``distributed.all_reduce`` is the
        same object that is returned.
        """
        M = int(x.shape[0])
        y_partial = self._torch.empty(
            (M, self.out_features),
            dtype=x.dtype,
            dp=DPPolicy(cube="replicate", pe="replicate",
                        num_cubes=1, num_pes=1),
            name="row_parallel_partial",
        )
        self._torch.launch(
            "row_parallel_gemm",
            _gemm_kernel,
            x, self.weight, y_partial,
            M, self.n_local, self.out_features,
        )
        # Partial sums from every rank are combined after the local GEMM.
        self._torch.distributed.all_reduce(y_partial, op="sum")
        return y_partial
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
"""Forward/backward mappings stub (ADR-0027 — future backward work).
|
||||||
|
|
||||||
|
Inference-only initial scope. Backward hooks land when training simulation
|
||||||
|
arrives.
|
||||||
|
"""
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
"""TP group state (ADR-0027 D3).
|
||||||
|
|
||||||
|
Single global TP group. Initial scope: TP size == world_size (pure TP;
|
||||||
|
mixed DP+TP is future work).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
_TP_WORLD_SIZE: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_model_parallel(tensor_model_parallel_size: int) -> None:
    """Initialize the TP process group.

    Must be called after ``torch.distributed.init_process_group``.
    Only ``tensor_model_parallel_size == world_size`` is supported in the
    initial scope.

    Args:
        tensor_model_parallel_size: Requested TP group size.

    Raises:
        NotImplementedError: The requested size differs from the current
            world size.
    """
    global _TP_WORLD_SIZE
    # The world size is resolved lazily, at call time, from the active
    # context rather than at import time.
    _ws = _current_world_size()
    if tensor_model_parallel_size != _ws:
        raise NotImplementedError(
            f"Only TP == world_size supported; got TP={tensor_model_parallel_size}, "
            f"world_size={_ws}"
        )
    _TP_WORLD_SIZE = tensor_model_parallel_size
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_model_parallel_world_size() -> int:
    """Return the TP group's world size.

    Raises:
        RuntimeError: The TP group has not been initialised — callers must
            call :func:`initialize_model_parallel` first.
    """
    if _TP_WORLD_SIZE is None:
        raise RuntimeError(
            "TP group not initialised; call initialize_model_parallel() first"
        )
    return _TP_WORLD_SIZE
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_model_parallel_rank() -> int:
    """Return this worker's rank within the TP group.

    Delegates to the greenlet-local rank registered by the spawn launcher
    (ADR-0024 D9 via ``torch.distributed.get_rank``).
    """
    # Resolve via the torch.distributed facade on the active ctx.
    return _current_rank()
|
||||||
|
|
||||||
|
|
||||||
|
def _reset_for_tests() -> None:
    """Clear ``_TP_WORLD_SIZE`` so ordering-sensitive tests can re-init."""
    global _TP_WORLD_SIZE
    _TP_WORLD_SIZE = None
|
||||||
|
|
||||||
|
|
||||||
|
# ── helpers (resolve current ctx) ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _current_ctx():
    """Return the currently-active RuntimeContext.

    In KernBench, the ``ctx`` is passed as the ``torch`` positional in
    bench/worker code. This module has no such argument, so the context is
    looked up through ``_get_active_context`` from
    ``kernbench.runtime_api.context``.

    Raises:
        RuntimeError: No context is currently active.
    """
    # Imported at call time, not at module import time.
    from kernbench.runtime_api.context import _get_active_context

    ctx = _get_active_context()
    if ctx is None:
        raise RuntimeError(
            "No active RuntimeContext; kernbench.tp requires one "
            "(call init_process_group / spawn under a live ctx)"
        )
    return ctx
|
||||||
|
|
||||||
|
|
||||||
|
def _current_world_size() -> int:
    """Return ``distributed.get_world_size()`` of the active context."""
    return _current_ctx().distributed.get_world_size()
|
||||||
|
|
||||||
|
|
||||||
|
def _current_rank() -> int:
    """Return ``distributed.get_rank()`` of the active context."""
    return _current_ctx().distributed.get_rank()
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
"""TP primitive ops (ADR-0027 D6).
|
||||||
|
|
||||||
|
``copy_to_tp_region`` / ``reduce_from_tp_region`` are forward-only in the
|
||||||
|
initial scope (backward pass is future work). ``scatter`` / ``gather`` are
|
||||||
|
not implemented — they require an all-gather kernel that is not yet
|
||||||
|
available in KernBench (see ADR-0027 D9).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def copy_to_tp_region(x: Any) -> Any:
    """Forward: identity. Backward: all-reduce. (Training is future.)

    Only the forward direction is implemented: ``x`` is returned unchanged.
    """
    return x
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_from_tp_region(x: Any, torch: Any) -> Any:
    """Forward: all-reduce. Backward: identity.

    Calls ``torch.distributed.all_reduce(x, op="sum")`` and returns the same
    ``x`` object that was passed in.
    """
    torch.distributed.all_reduce(x, op="sum")
    return x
|
||||||
|
|
||||||
|
|
||||||
|
def scatter_to_tp_region(x: Any) -> Any:
    """Not implemented in the initial scope; always raises.

    Raises:
        NotImplementedError: Always. Callers should create the sharded
            tensor directly instead (ADR-0027 D9).
    """
    raise NotImplementedError(
        "scatter_to_tp_region deferred — caller should create the sharded "
        "tensor directly (ADR-0027 D9)"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def gather_from_tp_region(x: Any) -> Any:
    """Not implemented in the initial scope; always raises.

    Raises:
        NotImplementedError: Always. An all-gather kernel is required first
            (ADR-0027 D9).
    """
    raise NotImplementedError(
        "gather_from_tp_region deferred — requires all-gather kernel (ADR-0027 D9)"
    )
|
||||||
@@ -0,0 +1,239 @@
|
|||||||
|
"""ADR-0026 Phase 1 tests: DPPolicy intra-device only + ShardSpec structural.
|
||||||
|
|
||||||
|
These tests encode the contract from ADR-0026:
|
||||||
|
|
||||||
|
- DPPolicy no longer accepts ``sip`` or ``num_sips`` kwargs (TypeError).
|
||||||
|
- ShardSpec carries structural ``(sip, cube, pe)`` coordinates; the old flat
|
||||||
|
``pe_index`` field/property is fully removed (AttributeError).
|
||||||
|
- ``resolve_dp_policy(..., target_sip=N)`` stamps every returned ShardSpec
|
||||||
|
with ``sip=N``; cube and pe fields are local.
|
||||||
|
- ``RuntimeContext._allocators`` is keyed by ``(sip, cube, pe)`` tuples.
|
||||||
|
|
||||||
|
Phase 1: production code is unchanged → these tests SHOULD FAIL until the
|
||||||
|
Phase 2 diff lands. Phase 2 makes all of them pass.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
|
||||||
|
from kernbench.runtime_api.tensor import deploy_tensor
|
||||||
|
|
||||||
|
|
||||||
|
# ── D1: DPPolicy no longer accepts sip / num_sips ─────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_dppolicy_rejects_sip_kwarg():
    """DPPolicy(sip=...) must raise TypeError after field removal."""
    with pytest.raises(TypeError):
        DPPolicy(sip="column_wise", cube="replicate", pe="replicate")
|
||||||
|
|
||||||
|
|
||||||
|
def test_dppolicy_rejects_num_sips_kwarg():
    """DPPolicy(num_sips=...) must raise TypeError after field removal."""
    with pytest.raises(TypeError):
        DPPolicy(cube="replicate", pe="replicate", num_sips=2)
|
||||||
|
|
||||||
|
|
||||||
|
def test_dppolicy_accepts_only_intra_device_fields():
    """Intra-device fields still work: cube, pe, num_cubes, num_pes."""
    dp = DPPolicy(cube="column_wise", pe="column_wise",
                  num_cubes=2, num_pes=4)
    assert dp.cube == "column_wise"
    assert dp.pe == "column_wise"
    assert dp.num_cubes == 2
    assert dp.num_pes == 4
    # No sip / num_sips attributes — even reading them must fail.
    assert not hasattr(dp, "sip"), "DPPolicy.sip must be removed"
    assert not hasattr(dp, "num_sips"), "DPPolicy.num_sips must be removed"
|
||||||
|
|
||||||
|
|
||||||
|
# ── D2: ShardSpec structural coords, no pe_index ──────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_shardspec_has_structural_coords():
    """ShardSpec constructs from (sip, cube, pe, offset_bytes, nbytes)."""
    s = ShardSpec(sip=1, cube=2, pe=3, offset_bytes=128, nbytes=64)
    assert s.sip == 1
    assert s.cube == 2
    assert s.pe == 3
    assert s.offset_bytes == 128
    assert s.nbytes == 64
|
||||||
|
|
||||||
|
|
||||||
|
def test_shardspec_has_no_pe_index_attr():
    """Flat pe_index must be fully removed — no field, no property."""
    s = ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=8)
    with pytest.raises(AttributeError):
        _ = s.pe_index  # noqa: F841
|
||||||
|
|
||||||
|
|
||||||
|
def test_shardspec_rejects_pe_index_kwarg():
    """ShardSpec(pe_index=...) must raise TypeError."""
    with pytest.raises(TypeError):
        ShardSpec(pe_index=0, offset_bytes=0, nbytes=8)  # type: ignore[call-arg]
|
||||||
|
|
||||||
|
|
||||||
|
# ── D3: resolve_dp_policy(target_sip=...) structural semantics ────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_dp_policy_target_sip_stamps_shards():
    """All returned shards must carry sip == target_sip; cube/pe local."""
    dp = DPPolicy(cube="column_wise", pe="column_wise")
    shards = resolve_dp_policy(
        dp, shape=(4, 32), itemsize=2,
        num_pe=4, num_cubes=2, target_sip=1,
    )
    # One shard per (cube, pe) pair.
    assert len(shards) == 2 * 4
    assert all(s.sip == 1 for s in shards)
    assert all(0 <= s.cube < 2 for s in shards)
    assert all(0 <= s.pe < 4 for s in shards)
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_dp_policy_target_sip_differ_only_in_sip():
    """Same policy + dims on two SIPs → shards identical except .sip."""
    dp = DPPolicy(cube="replicate", pe="column_wise")
    shards_0 = resolve_dp_policy(
        dp, shape=(4, 32), itemsize=2,
        num_pe=4, num_cubes=1, target_sip=0,
    )
    shards_1 = resolve_dp_policy(
        dp, shape=(4, 32), itemsize=2,
        num_pe=4, num_cubes=1, target_sip=1,
    )
    # Lengths are checked first so the pairwise zip below cannot silently
    # drop trailing shards.
    assert len(shards_0) == len(shards_1)
    for a, b in zip(shards_0, shards_1):
        assert a.sip == 0 and b.sip == 1
        assert a.cube == b.cube
        assert a.pe == b.pe
        assert a.offset_bytes == b.offset_bytes
        assert a.nbytes == b.nbytes
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_dp_policy_no_num_sips_param():
    """resolve_dp_policy must not accept num_sips anymore.

    Post-Phase-2 signature drops ``num_sips`` (DPPolicy no longer crosses
    SIP boundaries) and adds required ``target_sip``. Calling with
    ``num_sips=...`` must raise TypeError (unexpected keyword argument).
    """
    dp = DPPolicy(cube="replicate", pe="replicate")
    with pytest.raises(TypeError, match="num_sips"):
        # ``target_sip`` is supplied so that the only possible TypeError is
        # the one caused by the removed ``num_sips`` keyword.
        resolve_dp_policy(
            dp, shape=(4, 8), itemsize=2,
            num_pe=1, num_cubes=1, target_sip=0,
            num_sips=2,  # type: ignore[call-arg]
        )
|
||||||
|
|
||||||
|
|
||||||
|
# ── D5: Allocator dict keyed by (sip, cube, pe) tuples ────────────────
|
||||||
|
|
||||||
|
|
||||||
|
# Byte-size helpers for the allocator fixture below.
_MB = 1 << 20
_GB = 1 << 30

# Shared address configuration used by the tuple-keyed allocator tests.
_CFG = AddressConfig(
    sip_count=2,
    cubes_per_sip=2,
    pes_per_cube=4,
    hbm_bytes_per_cube=_GB,
    hbm_slices_per_cube=4,
    tcm_bytes_per_pe=_MB,
    tcm_scheduler_reserved_bytes=0,
    sram_bytes_per_cube=_MB,
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_tuple_allocators(
    num_sips: int = 1, num_cubes: int = 1, num_pe: int = 4,
) -> dict[tuple[int, int, int], PEMemAllocator]:
    """Build one ``PEMemAllocator`` per ``(sip, cube, pe)`` coordinate.

    Keys are ``(sip, cube, pe)`` tuples, iterated sip-major, then cube,
    then pe. Every allocator uses ``rack_id=0`` and the shared ``_CFG``.
    """
    return {
        (s, c, p): PEMemAllocator(
            rack_id=0, sip_id=s, cube_id=c, pe_id=p, cfg=_CFG,
        )
        for s in range(num_sips)
        for c in range(num_cubes)
        for p in range(num_pe)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_deploy_tensor_uses_tuple_lookup():
    """deploy_tensor(allocators={(sip,cube,pe): alloc, ...}) succeeds."""
    dp = DPPolicy(cube="replicate", pe="column_wise")
    placement = resolve_dp_policy(
        dp, shape=(4, 16), itemsize=2,
        num_pe=4, num_cubes=1, target_sip=0,
    )
    allocators = _make_tuple_allocators(num_sips=1, num_cubes=1, num_pe=4)
    handle = deploy_tensor(
        name="t", shape=(4, 16), dtype="f16",
        placement=placement, allocators=allocators,
    )
    assert len(handle.shards) == 4
    # Each shard's TensorShard carries structural coords; those coords
    # must match the shard's ShardSpec (sip, cube, pe).
    for spec, shard in zip(placement, handle.shards):
        assert shard.sip == spec.sip
        assert shard.cube == spec.cube
        assert shard.pe == spec.pe
|
||||||
|
|
||||||
|
|
||||||
|
def test_runtime_context_allocator_keys_are_tuples(topology):
    """After ctx tensor op, ctx._allocators keys are (sip, cube, pe) tuples.

    Ensures D5 migration landed (allocator population + lookup).
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    engine = GraphEngine(topology.topology_obj, enable_data=True)
    ctx = RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("sip:0"),
        correlation_id="test_adr0026_tuple_keys",
        spec=topology.topology_obj.spec,
    )
    dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
    # The returned tensor is unused; the allocation is made only so that the
    # allocator dict can be inspected afterwards.
    _ = ctx.zeros((1, 16), dtype="f16", dp=dp)

    assert ctx._allocators, "allocators dict should be populated"
    keys = list(ctx._allocators.keys())
    assert all(isinstance(k, tuple) and len(k) == 3 for k in keys), (
        f"_allocators keys must be (sip, cube, pe) tuples; got {keys[:5]}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── D4 (via regression): no SIP-crossing tensor without set_device ────
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_tensor_on_target_sip_via_set_device(topology):
    """A tensor created after ``set_device(1)`` has every shard on SIP 1.

    The context targets ``sip:1``, ``ctx.ahbm.set_device(1)`` is called, and a
    fully replicated tensor is allocated. Each shard of the resulting handle
    must report ``sip == 1``. The test is skipped when the topology declares
    fewer than two SIPs.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    # With a single SIP there is no second device to select.
    sips_section = (
        topology.topology_obj.spec.get("system", {}).get("sips", {})
    )
    sip_count = int(sips_section.get("count", 1))
    if sip_count < 2:
        pytest.skip("topology has <2 SIPs; set_device(1) not meaningful")

    runtime = RuntimeContext(
        engine=GraphEngine(topology.topology_obj, enable_data=True),
        target_device=DeviceSelector("sip:1"),
        correlation_id="test_adr0026_set_device",
        spec=topology.topology_obj.spec,
    )
    runtime.ahbm.set_device(1)

    policy = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
    tensor = runtime.zeros((1, 16), dtype="f16", dp=policy)

    assert tensor._handle is not None
    off_target = [shard for shard in tensor._handle.shards if shard.sip != 1]
    assert not off_target, (
        f"expected all shards on SIP 1; got {[s.sip for s in tensor._handle.shards]}"
    )
|
||||||
@@ -0,0 +1,222 @@
|
|||||||
|
"""Config-driven multi-device allreduce test application.
|
||||||
|
|
||||||
|
Reads ``ccl.yaml`` + ``topology.yaml``, dynamically loads the kernel
|
||||||
|
module from ``ccl.yaml → module``, and picks the inter-SIP exchange
|
||||||
|
pattern from ``topology.yaml → system.sips.topology``.
|
||||||
|
|
||||||
|
Run directly::
|
||||||
|
|
||||||
|
python -m pytest tests/allreduce_app.py -v -s
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||||
|
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
|
|
||||||
|
def _sip_topo_dims(sip_topo: str, n_sips: int) -> tuple[int, int]:
    """Return the ``(width, height)`` of the SIP grid for ``sip_topo``.

    ``"ring_1d"`` has no grid, so its dimensions are reported as ``(0, 0)``
    regardless of ``n_sips``. Every other topology name is treated as a square
    grid: ``n_sips`` must be a perfect square and the result is
    ``(side, side)``.

    Raises:
        ValueError: if the topology is not ``"ring_1d"`` and ``n_sips`` is not
            a perfect square (including negative values).
    """
    if sip_topo == "ring_1d":
        return (0, 0)

    # math.isqrt is exact for arbitrarily large ints; a float sqrt + round
    # would mis-classify very large perfect squares.
    side = math.isqrt(n_sips)
    if side * side != n_sips:
        raise ValueError(
            f"SIP topology '{sip_topo}' requires square n_sips, got {n_sips}"
        )
    return (side, side)
|
||||||
|
|
||||||
|
|
||||||
|
def run_allreduce(
    ctx: Any,
    engine: Any,
    spec: dict,
    *,
    algorithm: str | None = None,
    ccl_yaml: str | None = None,
) -> dict:
    """Run one config-driven all-reduce over every SIP and verify the result.

    The algorithm entry is resolved from ``ccl.yaml``; its ``module`` is
    imported dynamically and must expose ``kernel`` and ``TOPO_NAME_TO_KIND``.
    SIP count/topology and the cube mesh are read from ``spec``. One
    ``(n_cubes, n_elem)`` f16 tensor per SIP is filled with ``sip + 1``, the
    kernel is launched once per SIP with deferred waits, and after all waits
    complete every cube row on every SIP is compared against
    ``n_cubes * sum(1..n_sips)``.

    Args:
        ctx: runtime context used to create tensors and launch/wait kernels.
        engine: simulation engine; its clock and per-launch results are read
            for the latency report.
        spec: topology spec dictionary.
        algorithm: algorithm name passed to ``resolve_algorithm_config``.
        ccl_yaml: path passed to ``load_ccl_config``.

    Returns:
        A dict with ``expected`` (the reference value), ``latency_ns``
        (simulated time between first launch and last wait) and ``ok_cubes``
        (number of verified ``(sip, cube)`` rows).

    Raises:
        AssertionError: if any cube row differs from the expected value.
        ValueError: from ``_sip_topo_dims`` for a non-square grid topology.
    """
    cfg_all = load_ccl_config(ccl_yaml)
    cfg = resolve_algorithm_config(cfg_all, algorithm)

    # Dynamic import from ccl.yaml → module
    algo_module = importlib.import_module(cfg["module"])
    kernel_fn = algo_module.kernel
    topo_name_to_kind = algo_module.TOPO_NAME_TO_KIND

    n_elem = int(cfg.get("n_elem", 8))
    sips_cfg = spec.get("system", {}).get("sips", {})
    n_sips = int(sips_cfg.get("count", 1))
    sip_topo = str(sips_cfg.get("topology", "ring_1d"))

    cm = spec["sip"]["cube_mesh"]
    cube_w = int(cm["w"])
    cube_h = int(cm["h"])
    n_cubes = cube_w * cube_h

    # Unknown topology names fall back to kind 0.
    sip_topo_kind = topo_name_to_kind.get(sip_topo, 0)
    sip_topo_w, sip_topo_h = _sip_topo_dims(sip_topo, n_sips)

    algo_name = cfg.get("algorithm", "allreduce")
    print(f"\n{'=' * 60}")
    print(f"algorithm: {algo_name}")
    print(f"module: {cfg['module']}")
    print(f"sip_topology: {sip_topo}")
    print(f"kernel: {kernel_fn.__name__}")
    print(f"n_sips: {n_sips}")
    print(f"n_cubes: {n_cubes}")
    print(f"n_elem: {n_elem}")
    print(f"{'=' * 60}")

    configure_sfr_intercube_multisip(engine, spec, cfg)

    dp = DPPolicy(
        cube="row_wise", pe="replicate",
        num_pes=1, num_cubes=n_cubes,
    )

    # One input tensor per SIP, filled with the constant (sip + 1).
    tensors = []
    for sip in range(n_sips):
        ctx.ahbm.set_device(sip)
        t = ctx.zeros(
            (n_cubes, n_elem), dtype="f16", dp=dp,
            name=f"sip{sip}",
        )
        t.copy_(ctx.from_numpy(
            np.full((n_cubes, n_elem), float(sip + 1), dtype=np.float16)
        ))
        tensors.append(t)

    for sip in range(n_sips):
        arr = tensors[sip].numpy()
        print(f"[SIP {sip}] input cube0[:4] = {arr[0][:4].tolist()} "
              f"cube{n_cubes - 1}[:4] = {arr[-1][:4].tolist()}")

    t_start = engine._env.now

    # Launch on every SIP first and wait afterwards, so the launches are all
    # issued before any of them is waited on.
    all_pending = []
    for sip_rank, t in enumerate(tensors):
        pending = ctx.launch(
            algo_name, kernel_fn, t,
            n_elem, cube_w, cube_h, n_sips, sip_rank,
            sip_topo_kind, sip_topo_w, sip_topo_h,
            _defer_wait=True,
        )
        all_pending.extend(pending)

    for h, _sip_id, meta in all_pending:
        ctx.wait(h, _meta=meta)

    t_end = engine._env.now
    latency_ns = t_end - t_start
    print(f"\n[{algo_name} ws={n_sips}] sim latency = "
          f"{latency_ns:.1f} ns ({latency_ns / 1000:.3f} us)")

    for key, (_, trace) in engine._results.items():
        if not isinstance(trace, dict):
            continue
        # Both fields are treated as 0.0 when missing or None so the
        # subtraction and the formatting below cannot fail.
        total = trace.get("total_ns", 0.0) or 0.0
        pe_exec = trace.get("pe_exec_ns", 0.0) or 0.0
        network = total - pe_exec
        print(f" [{key}] total={total:.1f} ns "
              f"pe_exec={pe_exec:.1f} ns network={network:.1f} ns")

    expected = float(n_cubes * sum(range(1, n_sips + 1)))

    print()
    for sip in range(n_sips):
        arr = tensors[sip].numpy()
        print(f"[SIP {sip}] output cube0[:4] = {arr[0][:4].tolist()}")
        print(f"[SIP {sip}] output cube{n_cubes - 1}[:4] = {arr[-1][:4].tolist()}")

    ok_cubes = 0
    for sip in range(n_sips):
        arr = tensors[sip].numpy()
        for cube_id in range(n_cubes):
            assert np.allclose(
                arr[cube_id], expected, rtol=1e-1, atol=1e-1,
            ), (
                f"SIP{sip} cube {cube_id}: "
                f"got {arr[cube_id][:4]}, expected {expected}"
            )
            ok_cubes += 1

    print(f"\n {algo_name} (ws={n_sips}): {ok_cubes} OK")

    return {
        "expected": expected,
        "latency_ns": latency_ns,
        "ok_cubes": ok_cubes,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ── pytest entry point ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from kernbench.runtime_api.context import RuntimeContext
|
||||||
|
from kernbench.runtime_api.types import DeviceSelector
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import resolve_topology
|
||||||
|
|
||||||
|
# topology.yaml one directory above this file; used as the template that each
# parametrized case overrides.
TOPOLOGY_PATH = Path(__file__).parents[1] / "topology.yaml"

# (algorithm, sip_topology, n_sips) triples, one pytest case per entry.
CONFIGS = [
    pytest.param("intercube_allreduce", sip_topology, sip_count, id=case_id)
    for case_id, sip_topology, sip_count in (
        ("ring_2sip", "ring_1d", 2),
        ("torus_4sip", "torus_2d", 4),
        ("mesh_4sip", "mesh_2d_no_wrap", 4),
    )
]
|
||||||
|
|
||||||
|
|
||||||
|
def _write_temp_configs(tmp_path, sip_topology, n_sips, algorithm):
    """Write temp topology.yaml and ccl.yaml with the given overrides.

    The project-level ``topology.yaml`` and ``ccl.yaml`` are loaded, the SIP
    count/topology and the default algorithm are overridden, and both files
    are written into ``tmp_path``.

    Args:
        tmp_path: directory (a ``pathlib.Path``) that receives the two files.
        sip_topology: value stored at ``system.sips.topology``.
        n_sips: value stored at ``system.sips.count``.
        algorithm: value stored at ``defaults.algorithm`` in ccl.yaml.

    Returns:
        ``(topology_path, ccl_path)`` as strings.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding.
    with open(TOPOLOGY_PATH, encoding="utf-8") as f:
        topo_cfg = yaml.safe_load(f)
    topo_cfg["system"]["sips"]["count"] = n_sips
    topo_cfg["system"]["sips"]["topology"] = sip_topology
    topo_path = tmp_path / "topology.yaml"
    with open(topo_path, "w", encoding="utf-8") as f:
        yaml.dump(topo_cfg, f, default_flow_style=False)

    ccl_path = Path(__file__).parent.parent / "ccl.yaml"
    with open(ccl_path, encoding="utf-8") as f:
        ccl_cfg = yaml.safe_load(f)
    ccl_cfg["defaults"]["algorithm"] = algorithm
    tmp_ccl = tmp_path / "ccl.yaml"
    with open(tmp_ccl, "w", encoding="utf-8") as f:
        yaml.dump(ccl_cfg, f, default_flow_style=False)

    return str(topo_path), str(tmp_ccl)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("algorithm,sip_topology,n_sips", CONFIGS)
def test_allreduce(tmp_path, algorithm, sip_topology, n_sips):
    """Run ``run_allreduce`` end to end for one (algorithm, topology, SIPs) case.

    Temporary topology/ccl YAML files carrying the case's overrides are
    written, an engine is built from the resolved topology, and the all-reduce
    is executed inside a ``RuntimeContext`` spanning all devices. Per-cube
    verification happens inside ``run_allreduce``; this test additionally
    requires that at least one cube was checked.
    """
    topology_file, ccl_file = _write_temp_configs(
        tmp_path, sip_topology, n_sips, algorithm,
    )
    resolved = resolve_topology(topology_file)
    sim_engine = GraphEngine(resolved.topology_obj, enable_data=True)
    topo_spec = resolved.topology_obj.spec

    runtime = RuntimeContext(
        engine=sim_engine,
        target_device=DeviceSelector("all"),
        correlation_id=f"test_{algorithm}_{sip_topology}",
        spec=topo_spec,
    )
    with runtime as ctx:
        outcome = run_allreduce(
            ctx, sim_engine, topo_spec,
            algorithm=algorithm, ccl_yaml=ccl_file,
        )

    assert outcome["ok_cubes"] > 0
|
||||||
@@ -1,150 +0,0 @@
|
|||||||
"""End-to-end matrix tests for the unified ``ccl_allreduce`` bench.
|
|
||||||
|
|
||||||
Each parametrized case writes a tmp ``ccl.yaml`` overlay that selects a
|
|
||||||
specific (algorithm, world_size, buffer_kind, n_elem) combination, then
|
|
||||||
runs the bench via the CLI and asserts the printed line reports all
|
|
||||||
ranks OK.
|
|
||||||
|
|
||||||
This single test file replaces the per-variant bench tests
|
|
||||||
(test_ccl_allreduce_e2e, test_ccl_mesh_allreduce, test_ccl_tree_allreduce,
|
|
||||||
test_ccl_multicube, test_ccl_multisip).
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import textwrap
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
import kernbench.cli.main as cli_main
|
|
||||||
|
|
||||||
|
|
||||||
CCL_YAML_TEMPLATE = textwrap.dedent("""\
|
|
||||||
defaults:
|
|
||||||
algorithm: {algorithm}
|
|
||||||
buffer_kind: {buffer_kind}
|
|
||||||
backpressure: sleep
|
|
||||||
n_slots: 4
|
|
||||||
slot_size: 4096
|
|
||||||
vc_chunk_size: 256
|
|
||||||
ipcq_credit_size_bytes: 16
|
|
||||||
|
|
||||||
algorithms:
|
|
||||||
{algorithm}:
|
|
||||||
module: {module}
|
|
||||||
topology: {topology}
|
|
||||||
buffer_kind: {buffer_kind}
|
|
||||||
{world_size_line}{n_elem_line}
|
|
||||||
""")
|
|
||||||
|
|
||||||
|
|
||||||
def _write_ccl_yaml(
|
|
||||||
tmp_path,
|
|
||||||
*,
|
|
||||||
algorithm: str,
|
|
||||||
module: str,
|
|
||||||
topology: str,
|
|
||||||
buffer_kind: str = "tcm",
|
|
||||||
world_size: int | None = None,
|
|
||||||
n_elem: int | None = None,
|
|
||||||
) -> str:
|
|
||||||
"""Write a tmp ccl.yaml in tmp_path and return its directory."""
|
|
||||||
ws_line = f" world_size: {world_size}\n" if world_size is not None else ""
|
|
||||||
nel_line = f" n_elem: {n_elem}\n" if n_elem is not None else ""
|
|
||||||
body = CCL_YAML_TEMPLATE.format(
|
|
||||||
algorithm=algorithm,
|
|
||||||
module=module,
|
|
||||||
topology=topology,
|
|
||||||
buffer_kind=buffer_kind,
|
|
||||||
world_size_line=ws_line,
|
|
||||||
n_elem_line=nel_line,
|
|
||||||
)
|
|
||||||
yaml_path = tmp_path / "ccl.yaml"
|
|
||||||
yaml_path.write_text(body)
|
|
||||||
return str(tmp_path)
|
|
||||||
|
|
||||||
|
|
||||||
# Kernel modules shared by several cases below.
_RING_MODULE = "kernbench.ccl.algorithms.ring_allreduce"
_MESH_MODULE = "kernbench.ccl.algorithms.mesh_allreduce"
_TREE_MODULE = "kernbench.ccl.algorithms.tree_allreduce"

# Column order:
#   algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws
CASES = [
    # Full-system run (256 ranks, cross-SIP). Executed only once, with the
    # tcm buffer: buffer variants differ purely in IPCQ slot placement while
    # the compute path is identical, so cross-SIP routing is what this case
    # really verifies.
    pytest.param(
        "ring_allreduce_tcm", _RING_MODULE,
        "ring_1d", "tcm", None, 8, 256,
        id="ring_full_system",
        marks=pytest.mark.slow,
    ),
    # Buffer variants at 8 ranks (fast: same kernel, different slot space).
    pytest.param(
        "ring_allreduce_tcm", _RING_MODULE,
        "ring_1d", "tcm", 8, 32, 8,
        id="ring_tcm_8",
    ),
    pytest.param(
        "ring_allreduce_hbm", _RING_MODULE,
        "ring_1d", "hbm", 8, 32, 8,
        id="ring_hbm_8",
    ),
    pytest.param(
        "ring_allreduce_sram", _RING_MODULE,
        "ring_1d", "sram", 8, 32, 8,
        id="ring_sram_8",
    ),
    # Multi-cube: 16 ranks, crossing cubes inside a single SIP.
    pytest.param(
        "ring_allreduce_16", _RING_MODULE,
        "ring_1d", "tcm", 16, 16, 16,
        id="ring_multi_cube",
    ),
    # Mesh and tree algorithms.
    pytest.param(
        "mesh_allreduce_4", _MESH_MODULE,
        "mesh_2d", "tcm", 4, 16, 4,
        id="mesh_2x2",
    ),
    pytest.param(
        "tree_allreduce_7", _TREE_MODULE,
        "tree_binary", "tcm", 7, 16, 7,
        id="tree_binary_7",
    ),
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
    "algorithm,module,topology,buffer_kind,world_size,n_elem,expected_ws",
    CASES,
)
def test_ccl_allreduce_matrix(
    tmp_path, capsys, monkeypatch,
    algorithm, module, topology, buffer_kind, world_size, n_elem, expected_ws,
):
    """Run one matrix case through the CLI bench and require all ranks OK.

    A case-specific ``ccl.yaml`` is written into ``tmp_path`` and the working
    directory is switched there before the ``ccl_allreduce`` bench is invoked
    via ``cli_main.main`` against the project's ``topology.yaml``. The run
    must return 0, print no ``FAIL`` and print the
    ``"<algorithm> (ws=N): N OK"`` line.
    """
    here = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(here, ".."))

    yaml_dir = _write_ccl_yaml(
        tmp_path,
        algorithm=algorithm,
        module=module,
        topology=topology,
        buffer_kind=buffer_kind,
        world_size=world_size,
        n_elem=n_elem,
    )
    monkeypatch.chdir(yaml_dir)

    cli_args = [
        "run",
        "--topology", os.path.join(project_root, "topology.yaml"),
        "--bench", "ccl_allreduce",
        "--verify-data",
    ]
    exit_code = cli_main.main(cli_args)
    assert exit_code == 0

    out = capsys.readouterr().out
    assert "FAIL" not in out, f"unexpected FAIL in output:\n{out}"

    ok_line = f"{algorithm} (ws={expected_ws}): {expected_ws} OK"
    assert ok_line in out, f"expected '{ok_line}' in output:\n{out}"
|
|
||||||
@@ -1,125 +0,0 @@
|
|||||||
"""Tests for IPCQ deadlock detection (ADR-0023 D14 F3)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import simpy
|
|
||||||
|
|
||||||
from kernbench.ccl import diagnostics
|
|
||||||
from kernbench.common.ipcq_types import (
|
|
||||||
IpcqEndpoint,
|
|
||||||
IpcqInitEntry,
|
|
||||||
IpcqRecvCmd,
|
|
||||||
IpcqRequest,
|
|
||||||
)
|
|
||||||
from kernbench.components.builtin.pe_ipcq import PeIpcqComponent
|
|
||||||
from kernbench.runtime_api.kernel import IpcqInitMsg
|
|
||||||
from kernbench.topology.types import Node
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class _FakeTxn:
    """Minimal transaction stand-in handed to a component's ``host`` port.

    Carries only the three attributes this test module sets or relies on:
    the wrapped request, the event awaited for completion, and a result
    dictionary.
    """

    # Message delivered to the component (e.g. an ``IpcqInitMsg``).
    request: Any
    # Event the caller waits on via ``env.run(until=...)``.
    done: simpy.Event
    # Fresh dict per instance; never shared between transactions.
    result_data: dict[str, Any] = field(default_factory=dict)
|
|
||||||
|
|
||||||
|
|
||||||
def _make_isolated_pe_ipcq(env):
    """Build a stand-alone ``PeIpcqComponent`` with one initialised "W" entry.

    The component is created for node ``sip0.cube0.pe0.pe_ipcq`` without an
    engine, given a ``host`` input store and a ``pe_dma`` output store, and
    started on ``env``. An ``IpcqInitMsg`` describing a single "W" peer
    (sip 0, cube 0, pe 1, tcm buffer) is then pushed through the host port and
    the simulation is run until that init transaction completes.

    Args:
        env: the ``simpy.Environment`` to run on.

    Returns:
        The started, initialised component.
    """
    ipcq_node = Node(
        id="sip0.cube0.pe0.pe_ipcq", kind="pe_ipcq",
        impl="builtin.pe_ipcq", attrs={}, pos_mm=None,
    )
    component = PeIpcqComponent(ipcq_node, ctx=None)
    component.in_ports["host"] = simpy.Store(env)
    component.out_ports["sip0.cube0.pe0.pe_dma"] = simpy.Store(env)
    component.start(env)

    credit_store = simpy.Store(env)
    peer_endpoint = IpcqEndpoint(
        sip=0, cube=0, pe=1, buffer_kind="tcm",
        rx_base_pa=0x10_000, rx_base_va=0,
        n_slots=4, slot_size=4096,
    )
    west_entry = IpcqInitEntry(
        direction="W", peer=peer_endpoint,
        my_rx_base_pa=0x40_000, my_rx_base_va=0,
        n_slots=4, slot_size=4096,
        peer_credit_store=credit_store,
    )
    init_msg = IpcqInitMsg(
        correlation_id="t", request_id="t",
        target_sips=(0,), target_cubes=(0,), target_pe=0,
        entries=(west_entry,),
        backpressure_mode="sleep",
        buffer_kind="tcm",
        credit_size_bytes=16,
    )

    # Deliver the init message and advance the simulation until it is done.
    init_done = env.event()
    component.in_ports["host"].put(_FakeTxn(request=init_msg, done=init_done))
    env.run(until=init_done)
    return component
|
|
||||||
|
|
||||||
|
|
||||||
def test_pointer_dump_includes_blocked_state():
    """A recv that is still blocked must remain visible in the pointer dump."""
    sim_env = simpy.Environment()
    ipcq = _make_isolated_pe_ipcq(sim_env)

    # Nothing has been sent, so this recv cannot complete.
    blocked_recv = IpcqRequest(
        command=IpcqRecvCmd(
            direction="W", shape=(8,), dtype="f16", handle_id="r1",
        ),
        done=sim_env.event(),
    )
    ipcq.in_ports["host"].put(blocked_recv)
    sim_env.run(until=10)
    assert not blocked_recv.done.triggered

    # pointer_dump takes an engine; for an isolated component a tiny stand-in
    # exposing only ``_components`` is passed instead.
    class FakeEngine:
        _components = {"sip0.cube0.pe0.pe_ipcq": ipcq}

    report = diagnostics.pointer_dump(FakeEngine())
    assert "my_tail=0" in report
    assert "peer_head_cache=0" in report
|
|
||||||
|
|
||||||
|
|
||||||
def test_deadlock_detection_recv_without_send():
    """A kernel that only receives must end in ``IpcqDeadlock``.

    Every PE issues a recv on "W" and nobody sends, so no data is ever
    delivered; running the bench is expected to raise ``IpcqDeadlock``.
    """
    from kernbench.ccl.diagnostics import IpcqDeadlock
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    def deadlock_kernel(t_ptr, n_elem, tl):
        # Receive-only: with no matching send this can never complete.
        tl.recv(dir="W", shape=(n_elem,), dtype="f16")

    topo = resolve_topology("topology.yaml")

    def run(torch):
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=8,
        )
        placement = DPPolicy(
            sip="replicate", cube="replicate", pe="column_wise",
            num_sips=1, num_cubes=1,
        )
        dl_input = torch.zeros(
            (1, 8 * 8),
            dtype="f16",
            dp=placement,
            name="dl_in",
        )
        torch.launch("dl", deadlock_kernel, dl_input, 8)

    def make_engine(t, d):
        return GraphEngine(getattr(t, "topology_obj", t), enable_data=True)

    with pytest.raises(IpcqDeadlock):
        run_bench(
            topology=topo, bench_fn=run,
            device=resolve_device("all"),
            engine_factory=make_engine,
        )
|
|
||||||
@@ -1,70 +0,0 @@
|
|||||||
"""Tests for CCL diagnostics: trace + pointer dump (ADR-0023 D14)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
from kernbench.ccl import diagnostics
|
|
||||||
|
|
||||||
|
|
||||||
# ── trace toggle ─────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def test_trace_disabled_by_default(monkeypatch):
    """With ``KERNBENCH_CCL_TRACE`` unset, tracing reports as disabled."""
    # The variable may or may not exist in the caller's environment.
    monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)
    diagnostics.reload_trace_setting()

    enabled = diagnostics.trace_enabled()
    assert enabled is False
|
|
||||||
|
|
||||||
|
|
||||||
def test_trace_enabled_via_env(monkeypatch):
    """Setting ``KERNBENCH_CCL_TRACE=1`` and reloading enables tracing.

    The trace setting is reloaded again with the variable removed before the
    test returns, matching the cleanup done by the send/recv trace tests, so
    the enabled state is not left behind for later tests.
    """
    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
    diagnostics.reload_trace_setting()
    try:
        assert diagnostics.trace_enabled() is True
    finally:
        # NOTE(review): assumes reload_trace_setting() re-reads the
        # environment variable — confirm against kernbench.ccl.diagnostics.
        monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)
        diagnostics.reload_trace_setting()
|
|
||||||
|
|
||||||
|
|
||||||
def test_trace_record_send(monkeypatch, capsys):
    """``log_send`` prints a line naming the event, sender and direction.

    Tracing is switched on through ``KERNBENCH_CCL_TRACE`` for the duration of
    the test and switched off again in a ``finally`` block, so a failing
    assertion cannot leave tracing enabled for subsequent tests.
    """
    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
    diagnostics.reload_trace_setting()
    try:
        diagnostics.log_send(t_ns=100.0, sender="sip0.cube0.pe0",
                             direction="E", nbytes=64, sender_seq=0)
        out = capsys.readouterr().out
        assert "send" in out
        assert "sip0.cube0.pe0" in out
        assert "dir=E" in out
    finally:
        # Always restore the disabled state, even when an assert above fails.
        monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)
        diagnostics.reload_trace_setting()
|
|
||||||
|
|
||||||
|
|
||||||
def test_trace_record_recv(monkeypatch, capsys):
    """``log_recv`` prints a line naming the event and the receiver.

    Tracing is switched on through ``KERNBENCH_CCL_TRACE`` for the duration of
    the test and switched off again in a ``finally`` block, so a failing
    assertion cannot leave tracing enabled for subsequent tests.
    """
    monkeypatch.setenv("KERNBENCH_CCL_TRACE", "1")
    diagnostics.reload_trace_setting()
    try:
        diagnostics.log_recv(t_ns=200.0, receiver="sip0.cube0.pe1",
                             direction="W", nbytes=64)
        out = capsys.readouterr().out
        assert "recv" in out
        assert "sip0.cube0.pe1" in out
    finally:
        # Always restore the disabled state, even when an assert above fails.
        monkeypatch.delenv("KERNBENCH_CCL_TRACE", raising=False)
        diagnostics.reload_trace_setting()
|
|
||||||
|
|
||||||
|
|
||||||
# ── pointer dump ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def test_pointer_dump_format():
    """After installing IPCQ, the pointer dump names a PE, both ring
    directions and the head / peer-tail pointer fields."""
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology
    from kernbench.ccl.install import (
        install_ipcq, load_ccl_config, resolve_algorithm_config,
    )

    topology_obj = resolve_topology("topology.yaml").topology_obj
    sim_engine = GraphEngine(topology_obj, enable_data=True)
    algo_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm",
    )
    install_ipcq(sim_engine, topology_obj.spec, algo_cfg)

    report = diagnostics.pointer_dump(sim_engine)

    # 8 ranks × 2 directions = 16 lines (plus 8 PE headers)
    for fragment in (
        "sip0.cube0.pe0",
        "E:",
        "W:",
        "my_head=",
        "peer_tail_cache=",
    ):
        assert fragment in report
|
|
||||||
@@ -1,81 +0,0 @@
|
|||||||
"""Validate the hello-world example from docs/ccl-author-guide.md.
|
|
||||||
|
|
||||||
This is the simplest possible CCL kernel — each PE sends its tile E
|
|
||||||
and receives a tile from W. After running, each rank's slice should
|
|
||||||
contain the data of the previous rank.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from kernbench.ccl.algorithms import hello_send
|
|
||||||
from kernbench.ccl.testing import run_kernel_in_mock
|
|
||||||
|
|
||||||
|
|
||||||
def test_hello_send_4_ranks_mock():
    """Under the mock runtime, each of 4 ranks ends with its predecessor's tile."""
    n_ranks = 4
    tile_len = 8

    # Rank r starts with a tile filled with the constant r + 1.
    inputs = []
    for rank in range(n_ranks):
        inputs.append(np.full((tile_len,), float(rank + 1), dtype=np.float16))

    outputs = run_kernel_in_mock(
        kernel_fn=hello_send.kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(tile_len,),
    )

    # rank r should have rank (r-1) % 4's data
    for r in range(n_ranks):
        expected_tile = inputs[(r - 1) % 4]
        assert np.array_equal(outputs[r], expected_tile), f"rank {r}: got {outputs[r]}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_hello_send_via_simpy_runner():
    """Run the hello-send kernel through ``run_bench`` with a real GraphEngine.

    Eight ranks each own an ``n_elem``-wide f16 slice of one tensor,
    pre-filled with ``rank + 1`` by writing directly into the engine's memory
    store. After the kernel runs, every slice must hold the previous rank's
    value, and the bench must report successful completion.
    """
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    topo = resolve_topology("topology.yaml")
    n_elem = 8
    world_size = 8

    def run(torch):
        # World size for this hello test is 8 (one cube). ccl.yaml no
        # longer carries a default world_size — pass it explicitly.
        # The value returned by install_ipcq is not needed here.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * n_elem), dtype="f16",
            dp=DPPolicy(
                sip="replicate", cube="replicate", pe="column_wise",
                num_sips=1, num_cubes=1,
            ),
            name="hello_in",
        )
        store = torch.engine.memory_store
        base = a._handle.va_base or a._handle.shards[0].pa
        nbytes = n_elem * 2  # f16 is two bytes per element
        for r in range(world_size):
            store.write("hbm", base + r * nbytes,
                        np.full((n_elem,), float(r + 1), dtype=np.float16))

        torch.launch("hello_send", hello_send.kernel, a, n_elem)

        # Each rank should hold the previous rank's data after the round
        for r in range(world_size):
            arr = store.read("hbm", base + r * nbytes, shape=(n_elem,), dtype="f16")
            prev_value = float(((r - 1) % world_size) + 1)
            assert np.allclose(arr, prev_value), f"rank {r}: got {arr}, expected {prev_value}"

    result = run_bench(
        topology=topo, bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok
|
|
||||||
@@ -2,7 +2,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from kernbench.ccl.install import (
|
from kernbench.ccl.install import (
|
||||||
install_ipcq,
|
|
||||||
linear_rank_to_pe,
|
linear_rank_to_pe,
|
||||||
load_ccl_config,
|
load_ccl_config,
|
||||||
resolve_algorithm_config,
|
resolve_algorithm_config,
|
||||||
@@ -26,28 +25,14 @@ def test_resolve_algorithm_config_default():
|
|||||||
cfg = load_ccl_config()
|
cfg = load_ccl_config()
|
||||||
merged = resolve_algorithm_config(cfg)
|
merged = resolve_algorithm_config(cfg)
|
||||||
assert merged["algorithm"] == cfg["defaults"]["algorithm"]
|
assert merged["algorithm"] == cfg["defaults"]["algorithm"]
|
||||||
# ccl.yaml no longer carries defaults.world_size — backend derives
|
|
||||||
# it from topology.yaml at install time. Just check the field is
|
|
||||||
# absent here (verified per-test where install_ipcq is called).
|
|
||||||
assert "world_size" not in merged or merged["world_size"] >= 1
|
assert "world_size" not in merged or merged["world_size"] >= 1
|
||||||
|
|
||||||
|
|
||||||
def test_resolve_algorithm_config_override():
|
|
||||||
cfg = load_ccl_config()
|
|
||||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_hbm")
|
|
||||||
assert merged["algorithm"] == "ring_allreduce_hbm"
|
|
||||||
assert merged["buffer_kind"] == "hbm" # algo override
|
|
||||||
# defaults still apply
|
|
||||||
assert merged["n_slots"] == cfg["defaults"]["n_slots"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_linear_rank_to_pe():
|
def test_linear_rank_to_pe():
|
||||||
engine, topo = _engine()
|
engine, topo = _engine()
|
||||||
spec = topo.spec
|
spec = topo.spec
|
||||||
# Cube 0 of SIP 0
|
|
||||||
assert linear_rank_to_pe(0, spec) == (0, 0, 0)
|
assert linear_rank_to_pe(0, spec) == (0, 0, 0)
|
||||||
assert linear_rank_to_pe(7, spec) == (0, 0, 7)
|
assert linear_rank_to_pe(7, spec) == (0, 0, 7)
|
||||||
# Should not exceed total PE count
|
|
||||||
pes_per_sip = (
|
pes_per_sip = (
|
||||||
spec["sip"]["cube_mesh"]["w"] * spec["sip"]["cube_mesh"]["h"]
|
spec["sip"]["cube_mesh"]["w"] * spec["sip"]["cube_mesh"]["h"]
|
||||||
* spec["cube"]["pe_layout"]["pe_per_corner"]
|
* spec["cube"]["pe_layout"]["pe_per_corner"]
|
||||||
@@ -56,45 +41,3 @@ def test_linear_rank_to_pe():
|
|||||||
sips = spec["system"]["sips"]["count"]
|
sips = spec["system"]["sips"]["count"]
|
||||||
total = sips * pes_per_sip
|
total = sips * pes_per_sip
|
||||||
assert total >= 8
|
assert total >= 8
|
||||||
|
|
||||||
|
|
||||||
def test_install_ipcq_neighbors_correct():
|
|
||||||
engine, topo = _engine()
|
|
||||||
cfg = load_ccl_config()
|
|
||||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
|
|
||||||
# Force a single-cube 8-rank install for the assertions below.
|
|
||||||
merged["world_size"] = 8
|
|
||||||
plan = install_ipcq(engine, topo.spec, merged)
|
|
||||||
|
|
||||||
assert plan["world_size"] == 8
|
|
||||||
assert plan["buffer_kind"] == "tcm"
|
|
||||||
|
|
||||||
# Each rank should have E and W entries
|
|
||||||
for r, nbrs in plan["neighbor_table"].items():
|
|
||||||
assert "E" in nbrs
|
|
||||||
assert "W" in nbrs
|
|
||||||
|
|
||||||
# Inspect installed PE_IPCQ for rank 0
|
|
||||||
ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
|
||||||
qp_e = ipcq.queue_pairs["E"]
|
|
||||||
qp_w = ipcq.queue_pairs["W"]
|
|
||||||
assert qp_e["peer"].pe == 1 # rank 0's E neighbor is rank 1
|
|
||||||
assert qp_w["peer"].pe == 7 # rank 0's W neighbor is rank 7
|
|
||||||
# rx_base addresses should be unique
|
|
||||||
assert qp_e["my_rx_base_pa"] != qp_w["my_rx_base_pa"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_ipcq_credit_stores_wired():
|
|
||||||
engine, topo = _engine()
|
|
||||||
cfg = load_ccl_config()
|
|
||||||
merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
|
|
||||||
merged["world_size"] = 8
|
|
||||||
install_ipcq(engine, topo.spec, merged)
|
|
||||||
|
|
||||||
# rank 0 (pe0) sending E goes to rank 1 (pe1)
|
|
||||||
# rank 0's peer_credit_store on E direction should equal rank 1's credit_inbox
|
|
||||||
pe0 = engine._components["sip0.cube0.pe0.pe_ipcq"]
|
|
||||||
pe1 = engine._components["sip0.cube0.pe1.pe_ipcq"]
|
|
||||||
|
|
||||||
qp_e = pe0.queue_pairs["E"]
|
|
||||||
assert qp_e["peer_credit_store"] is pe1.credit_inbox
|
|
||||||
|
|||||||
@@ -1,83 +0,0 @@
|
|||||||
"""Tests for the mock CCL runtime (ADR-0023 D15)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from kernbench.ccl.algorithms import ring_allreduce
|
|
||||||
from kernbench.ccl.testing import run_kernel_in_mock
|
|
||||||
|
|
||||||
|
|
||||||
def test_ring_allreduce_4_ranks():
    """Ring all-reduce over 4 ranks under the mock runtime (no SimPy).

    Rank r contributes a constant tile of ``r + 1``; every rank must end with
    the element-wise sum of all inputs.
    """
    tile_len = 8
    inputs = []
    for rank in range(4):
        inputs.append(np.full((tile_len,), float(rank + 1), dtype=np.float16))
    reference = sum(inputs)  # [10, 10, ..., 10]

    outputs = run_kernel_in_mock(
        kernel_fn=ring_allreduce.kernel,
        world_size=4,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(tile_len, 4),
    )

    assert len(outputs) == 4
    for rank in range(4):
        assert np.allclose(outputs[rank], reference)
|
|
||||||
|
|
||||||
|
|
||||||
def test_ring_allreduce_8_ranks():
    """Ring all-reduce over 8 ranks in the mock runtime.

    Rank r contributes a tile filled with r + 1, so every reduced element
    is 1 + 2 + ... + 8 = 36 on every rank.
    """
    world_size = 8
    n_elem = 16
    inputs = [
        np.full((n_elem,), float(r + 1), dtype=np.float16)
        for r in range(world_size)
    ]
    expected = sum(inputs)  # [36, 36, ...]

    outputs = run_kernel_in_mock(
        kernel_fn=ring_allreduce.kernel,
        world_size=world_size,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, world_size),
    )

    # Same output-count check as the 4-rank test: without it, extra outputs
    # beyond the indices probed below would go unnoticed.
    assert len(outputs) == world_size
    for r in range(world_size):
        assert np.allclose(outputs[r], expected)
|
|
||||||
|
|
||||||
|
|
||||||
def test_ring_allreduce_random_data():
    """Ring all-reduce over 4 ranks with seeded random float16 inputs."""
    world_size = 4
    n_elem = 32
    rng = np.random.default_rng(42)

    # Draw the per-rank tiles in rank order so the seeded stream is consumed
    # deterministically.
    inputs = []
    for _ in range(world_size):
        inputs.append(rng.standard_normal(n_elem).astype(np.float16))
    expected = sum(inputs)

    outputs = run_kernel_in_mock(
        kernel_fn=ring_allreduce.kernel,
        world_size=world_size,
        topology="ring_1d",
        inputs=inputs,
        kernel_args=(n_elem, world_size),
    )

    # Loose tolerances for the float16 comparison.
    for rank in range(world_size):
        assert np.allclose(outputs[rank], expected, rtol=1e-2, atol=1e-2)
|
|
||||||
|
|
||||||
|
|
||||||
def test_mock_runtime_invalid_direction_raises():
    """Running a kernel that sends in direction "N" on ring_1d must raise."""
    import pytest

    def bad_kernel(t_ptr, n_elem, tl):
        tl.send(dir="N", src_addr=0, nbytes=2, shape=(1,), dtype="f16", space="hbm")

    world_size = 2
    inputs = []
    for _ in range(world_size):
        inputs.append(np.array([1.0], dtype=np.float16))

    # NOTE(review): pytest.raises(Exception) accepts any failure, including an
    # unrelated TypeError from the call itself; narrow it once the mock
    # runtime's error type for an unsupported direction is confirmed.
    with pytest.raises(Exception):
        run_kernel_in_mock(
            kernel_fn=bad_kernel,
            world_size=world_size,
            topology="ring_1d",
            inputs=inputs,
            kernel_args=(1,),
        )
|
|
||||||
@@ -1,87 +0,0 @@
|
|||||||
"""CCL performance validation tests (ADR-0023 D13 T5).
|
|
||||||
|
|
||||||
Sanity-checks the simulated latency of the unified ``ccl_allreduce`` bench.
|
|
||||||
|
|
||||||
Uses 8-rank (single cube) for all buffer variants — the latency model
|
|
||||||
is topology-aware, so buffer_kind differences are visible even at small
|
|
||||||
scale. Full-system (256-rank) cross-SIP latency is covered by the
|
|
||||||
``test_ccl_allreduce_matrix[ring_full_system]`` slow test.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import importlib
|
|
||||||
import os
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from kernbench.runtime_api.bench_runner import run_bench
|
|
||||||
from kernbench.runtime_api.types import resolve_device
|
|
||||||
from kernbench.sim_engine.engine import GraphEngine
|
|
||||||
from kernbench.topology.builder import resolve_topology
|
|
||||||
|
|
||||||
|
|
||||||
def _engine_factory(topology, device):
    """Build a data-enabled GraphEngine for ``run_bench``.

    ``topology`` may be a wrapper exposing ``topology_obj`` or the topology
    itself. ``device`` is accepted but not used here.
    """
    graph = getattr(topology, "topology_obj", topology)
    return GraphEngine(graph, enable_data=True)
|
|
||||||
|
|
||||||
|
|
||||||
def _run_8rank(algorithm: str, buffer_kind: str = "tcm") -> float:
    """Run an 8-rank ring through the unified bench and return its latency.

    A temporary ``ccl.yaml`` selecting *algorithm* and *buffer_kind* is
    written to a scratch directory, which becomes the working directory for
    the duration of the run. Returns ``total_ns`` of the last trace whose
    phase is ``"kernel"``.
    """
    import tempfile

    body = f"""\
defaults:
  algorithm: {algorithm}
  buffer_kind: {buffer_kind}
  backpressure: sleep
  n_slots: 4
  slot_size: 4096
  vc_chunk_size: 256
  ipcq_credit_size_bytes: 16

algorithms:
  {algorithm}:
    module: kernbench.ccl.algorithms.ring_allreduce
    topology: ring_1d
    buffer_kind: {buffer_kind}
    world_size: 8
    n_elem: 32
"""
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    with tempfile.TemporaryDirectory() as scratch_dir:
        with open(os.path.join(scratch_dir, "ccl.yaml"), "w") as overlay:
            overlay.write(body)
        previous_cwd = os.getcwd()
        os.chdir(scratch_dir)
        try:
            topo = resolve_topology(os.path.join(project_root, "topology.yaml"))
            bench_mod = importlib.import_module("benches.ccl_allreduce")
            result = run_bench(
                topology=topo,
                bench_fn=bench_mod.run,
                device=resolve_device("all"),
                engine_factory=_engine_factory,
            )
        finally:
            # Restore the cwd before the scratch directory is removed.
            os.chdir(previous_cwd)

    assert result.completion.ok, f"{algorithm} did not complete"
    kernel_traces = [
        trace for trace in (result.traces or [])
        if trace.get("phase") == "kernel"
    ]
    assert kernel_traces, f"{algorithm} produced no kernel trace"
    return float(kernel_traces[-1].get("total_ns", 0.0))
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("buffer_kind", ["tcm", "hbm", "sram"])
def test_ccl_latency_positive(buffer_kind):
    """The simulated latency must be strictly positive for each buffer kind."""
    latency_ns = _run_8rank(f"ring_allreduce_{buffer_kind}", buffer_kind)
    assert latency_ns > 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_ccl_latency_under_reasonable_bound():
    """The 8-rank TCM ring all-reduce must finish in under 1 ms of sim time."""
    one_ms_in_ns = 1_000_000
    latency_ns = _run_8rank("ring_allreduce_tcm", "tcm")
    assert latency_ns < one_ms_in_ns
|
|
||||||
@@ -0,0 +1,119 @@
|
|||||||
|
"""End-to-end distributed test for intercube allreduce.
|
||||||
|
|
||||||
|
Exercises the full process-group path:
|
||||||
|
dist.init_process_group(backend="ahbm")
|
||||||
|
→ mp.spawn(nprocs=n_sips)
|
||||||
|
→ each worker: set_device → allocate → fill → dist.all_reduce → verify
|
||||||
|
|
||||||
|
This is the same flow a real DDP training script would use.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import textwrap
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||||
|
|
||||||
|
N_CUBES = 16
|
||||||
|
N_ELEM = 8
|
||||||
|
|
||||||
|
|
||||||
|
def _write_ccl_yaml(tmp_path) -> str:
|
||||||
|
body = textwrap.dedent("""\
|
||||||
|
defaults:
|
||||||
|
algorithm: intercube_allreduce
|
||||||
|
buffer_kind: tcm
|
||||||
|
backpressure: sleep
|
||||||
|
n_slots: 4
|
||||||
|
slot_size: 4096
|
||||||
|
vc_chunk_size: 256
|
||||||
|
ipcq_credit_size_bytes: 16
|
||||||
|
|
||||||
|
algorithms:
|
||||||
|
intercube_allreduce:
|
||||||
|
module: kernbench.ccl.algorithms.intercube_allreduce
|
||||||
|
topology: none
|
||||||
|
buffer_kind: tcm
|
||||||
|
n_elem: 8
|
||||||
|
root_cube: 15
|
||||||
|
""")
|
||||||
|
(tmp_path / "ccl.yaml").write_text(body)
|
||||||
|
return str(tmp_path)
|
||||||
|
|
||||||
|
|
||||||
|
def _worker(rank: int, n_sips: int, torch) -> None:
    """Per-SIP worker: allocate, fill, all_reduce, verify.

    Args:
        rank: Index of this worker; also passed to ``torch.ahbm.set_device``.
        n_sips: Number of workers taking part in the all-reduce.
        torch: Runtime object providing ``ahbm``, ``zeros``, ``from_numpy``
            and ``distributed``.

    Raises:
        AssertionError: If any cube row differs from the expected sum.
    """
    from kernbench.policy.placement.dp import DPPolicy

    torch.ahbm.set_device(rank)

    dp = DPPolicy(
        cube="row_wise", pe="replicate",
        num_pes=1, num_cubes=N_CUBES,
    )
    tensor = torch.zeros(
        (N_CUBES, N_ELEM), dtype="f16", dp=dp,
        name=f"sip{rank}",
    )

    # Every element on this rank starts as rank + 1.
    init_arr = np.full((N_CUBES, N_ELEM), float(rank + 1), dtype=np.float16)
    tensor.copy_(torch.from_numpy(init_arr))

    print(f"[SIP {rank}] input cube0[:4] = {tensor.numpy()[0][:4].tolist()}")

    torch.distributed.all_reduce(tensor, op="sum")

    arr = tensor.numpy()
    expected = float(N_CUBES * sum(range(1, n_sips + 1)))

    # Derive the last cube index from N_CUBES instead of hard-coding 15, so
    # the debug print stays valid if the cube count changes.
    last_cube = N_CUBES - 1
    print(f"[SIP {rank}] output cube0[:4] = {arr[0][:4].tolist()}")
    print(f"[SIP {rank}] output cube{last_cube}[:4] = {arr[last_cube][:4].tolist()}")

    for cube_id in range(N_CUBES):
        assert np.allclose(arr[cube_id], expected, rtol=1e-1, atol=1e-1), (
            f"SIP{rank} cube {cube_id}: "
            f"got {arr[cube_id][:4]}, expected {expected}"
        )

    if rank == 0:
        print(f"\n intercube_allreduce (ws={n_sips}): "
              f"{n_sips * N_CUBES} OK")
|
||||||
|
|
||||||
|
|
||||||
|
def test_distributed_intercube_allreduce(tmp_path, monkeypatch):
    """Drive init_process_group, mp.spawn and all_reduce end to end.

    One ``_worker`` is spawned per SIP listed in the topology spec; the
    simulated time spent inside ``spawn`` is printed at the end.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology

    # Run from the directory that holds the generated ccl.yaml.
    config_dir = _write_ccl_yaml(tmp_path)
    monkeypatch.chdir(config_dir)

    topo = resolve_topology(str(TOPOLOGY_PATH))
    engine = GraphEngine(topo.topology_obj, enable_data=True)
    spec = topo.topology_obj.spec
    n_sips = int(spec["system"]["sips"]["count"])

    runtime = RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("all"),
        correlation_id="dist_intercube_ar",
        spec=spec,
    )
    with runtime as ctx:
        ctx.distributed.init_process_group(backend="ahbm")

        assert ctx.distributed.get_world_size() == n_sips

        t_start = engine._env.now

        ctx.multiprocessing.spawn(
            _worker, args=(n_sips, ctx), nprocs=n_sips,
        )

        elapsed_ns = engine._env.now - t_start
        print(f"\n[distributed] sim latency = "
              f"{elapsed_ns:.1f} ns ({elapsed_ns / 1000:.3f} us)")
|
||||||
@@ -0,0 +1,113 @@
|
|||||||
|
"""Tests for configure_sfr_intercube_multisip neighbor table wiring.
|
||||||
|
|
||||||
|
Verifies that IPCQ neighbor tables are correctly installed for
|
||||||
|
intercube (pe0, 4×4 mesh N/S/E/W) + inter-SIP (pe0, all cubes,
|
||||||
|
global_E/global_W) communication.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from kernbench.ccl.install import load_ccl_config, resolve_algorithm_config
|
||||||
|
from kernbench.ccl.sfr_config import configure_sfr_intercube_multisip
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import resolve_topology
|
||||||
|
|
||||||
|
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||||
|
|
||||||
|
N_CUBES = 16
|
||||||
|
|
||||||
|
|
||||||
|
def _engine_and_spec():
    """Return a fresh data-enabled GraphEngine and the topology spec."""
    topology = resolve_topology(str(TOPOLOGY_PATH)).topology_obj
    engine = GraphEngine(topology, enable_data=True)
    return engine, topology.spec
|
||||||
|
|
||||||
|
|
||||||
|
def _merged_cfg():
    """Load the CCL config and resolve the ``intercube_allreduce`` entry."""
    return resolve_algorithm_config(load_ccl_config(), name="intercube_allreduce")
|
||||||
|
|
||||||
|
|
||||||
|
class TestConfigureSfrNeighborTables:
    """Neighbor-table checks for ``configure_sfr_intercube_multisip``."""

    @staticmethod
    def _configure():
        """Build a fresh engine, run the SFR configuration, return all parts."""
        engine, spec = _engine_and_spec()
        cfg = _merged_cfg()
        plan = configure_sfr_intercube_multisip(engine, spec, cfg)
        return engine, spec, cfg, plan

    @staticmethod
    def _pe0_queue_pairs(engine, sip, cube):
        """Return the queue-pair table of pe0's IPCQ on the given SIP/cube."""
        return engine._components[f"sip{sip}.cube{cube}.pe0.pe_ipcq"].queue_pairs

    def test_world_size_and_rank_to_pe(self):
        """world_size is n_sips * N_CUBES and every mapped rank sits on pe 0."""
        _engine, spec, _cfg, plan = self._configure()

        n_sips = int(spec["system"]["sips"]["count"])
        expected_ranks = n_sips * N_CUBES
        assert plan["world_size"] == expected_ranks
        assert len(plan["rank_to_pe"]) == expected_ranks
        for pe_idx, (_sip, _cube, pe) in enumerate(plan["rank_to_pe"]):
            assert pe == 0, f"pe_idx {pe_idx}: pe must be 0, got {pe}"

    def test_corner_cube0_has_E_and_S_only(self):
        """Cube 0 (row=0, col=0) is the NW corner: only E and S neighbors."""
        engine, _spec, _cfg, _plan = self._configure()

        qp = self._pe0_queue_pairs(engine, sip=0, cube=0)
        assert "E" in qp, "cube 0 must have E neighbor"
        assert "S" in qp, "cube 0 must have S neighbor"
        assert "W" not in qp, "cube 0 (col=0) must NOT have W neighbor"
        assert "N" not in qp, "cube 0 (row=0) must NOT have N neighbor"
        assert qp["E"]["peer"].cube == 1
        assert qp["S"]["peer"].cube == 4

    def test_interior_cube5_has_all_four(self):
        """Cube 5 (row=1, col=1) is interior: N/S/E/W are all present."""
        engine, _spec, _cfg, _plan = self._configure()

        qp = self._pe0_queue_pairs(engine, sip=0, cube=5)
        assert qp["N"]["peer"].cube == 1
        assert qp["S"]["peer"].cube == 9
        assert qp["E"]["peer"].cube == 6
        assert qp["W"]["peer"].cube == 4

    def test_root_cube15_has_inter_sip(self):
        """Cube 15 (root, SE corner) has N and W plus global_E/global_W."""
        engine, _spec, _cfg, _plan = self._configure()

        qp0 = self._pe0_queue_pairs(engine, sip=0, cube=15)
        assert "N" in qp0
        assert "W" in qp0
        assert "E" not in qp0, "cube 15 (col=3) must NOT have E"
        assert "S" not in qp0, "cube 15 (row=3) must NOT have S"
        assert "global_E" in qp0, "root cube must have global_E"
        assert "global_W" in qp0, "root cube must have global_W"
        assert qp0["global_E"]["peer"].sip == 1
        assert qp0["global_E"]["peer"].cube == 15

        # NOTE(review): expecting sip1's global_E to point back at sip 0
        # presumably relies on a 2-SIP topology — confirm against topology.yaml.
        qp1 = self._pe0_queue_pairs(engine, sip=1, cube=15)
        assert qp1["global_E"]["peer"].sip == 0
        assert qp1["global_E"]["peer"].cube == 15

    def test_all_cubes_have_inter_sip(self):
        """ALL cubes (not just root) are wired for inter-SIP."""
        engine, _spec, cfg, _plan = self._configure()

        root_cube = int(cfg.get("root_cube", N_CUBES - 1))
        for cube_id in range(N_CUBES):
            qp = self._pe0_queue_pairs(engine, sip=0, cube=cube_id)
            assert "global_E" in qp, (
                f"sip0.cube{cube_id}.pe0 missing global_E"
            )
            assert "global_W" in qp, (
                f"sip0.cube{cube_id}.pe0 missing global_W"
            )
            if cube_id != root_cube:
                continue
            assert qp["global_E"]["peer"].sip != 0, (
                f"root cube {root_cube} global_E must point to another SIP"
            )
|
||||||
@@ -63,7 +63,8 @@ def test_ipcq_dma_token():
|
|||||||
|
|
||||||
def test_ipcq_credit_metadata():
|
def test_ipcq_credit_metadata():
|
||||||
cm = IpcqCreditMetadata(
|
cm = IpcqCreditMetadata(
|
||||||
consumer_seq=3, src_sip=0, src_cube=0, src_pe=1, src_direction="W",
|
consumer_seq=3, dst_rx_base_pa=0x1000,
|
||||||
|
src_sip=0, src_cube=0, src_pe=1, src_direction="W",
|
||||||
)
|
)
|
||||||
assert cm.consumer_seq == 3
|
assert cm.consumer_seq == 3
|
||||||
assert cm.src_direction == "W"
|
assert cm.src_direction == "W"
|
||||||
@@ -71,7 +72,8 @@ def test_ipcq_credit_metadata():
|
|||||||
|
|
||||||
def test_ipcq_credit_metadata_frozen():
|
def test_ipcq_credit_metadata_frozen():
|
||||||
cm = IpcqCreditMetadata(
|
cm = IpcqCreditMetadata(
|
||||||
consumer_seq=3, src_sip=0, src_cube=0, src_pe=1, src_direction="W",
|
consumer_seq=3, dst_rx_base_pa=0x1000,
|
||||||
|
src_sip=0, src_cube=0, src_pe=1, src_direction="W",
|
||||||
)
|
)
|
||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
cm.consumer_seq = 99 # type: ignore
|
cm.consumer_seq = 99 # type: ignore
|
||||||
|
|||||||
+157
-1
@@ -291,9 +291,12 @@ def test_send_blocks_when_peer_slot_full():
|
|||||||
env.run(until=20)
|
env.run(until=20)
|
||||||
assert not req5.done.triggered
|
assert not req5.done.triggered
|
||||||
|
|
||||||
# Send a credit return: peer (E direction, pe=1) consumed slot 0
|
# Send a credit return: peer (E direction, pe=1) consumed slot 0.
|
||||||
|
# dst_rx_base_pa is the peer-side rx buffer — which equals my qp_E's
|
||||||
|
# peer.rx_base_pa (0x10_000 from _install_two_neighbors).
|
||||||
credit = IpcqCreditMetadata(
|
credit = IpcqCreditMetadata(
|
||||||
consumer_seq=1, # peer consumed up to my_tail=1
|
consumer_seq=1, # peer consumed up to my_tail=1
|
||||||
|
dst_rx_base_pa=0x10_000, # E's peer.rx_base_pa (ADR-0025 D3)
|
||||||
src_sip=0, src_cube=0, src_pe=1, src_direction="W", # peer's view
|
src_sip=0, src_cube=0, src_pe=1, src_direction="W", # peer's view
|
||||||
)
|
)
|
||||||
comp.credit_inbox.put(credit)
|
comp.credit_inbox.put(credit)
|
||||||
@@ -315,3 +318,156 @@ def test_init_installs_neighbors():
|
|||||||
assert comp._queue_pairs["W"]["peer"].pe == 2
|
assert comp._queue_pairs["W"]["peer"].pe == 2
|
||||||
assert comp._queue_pairs["E"]["my_head"] == 0
|
assert comp._queue_pairs["E"]["my_head"] == 0
|
||||||
assert comp._queue_pairs["E"]["peer_tail_cache"] == 0
|
assert comp._queue_pairs["E"]["peer_tail_cache"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── ADR-0025: address-based matching in meta arrival / credit ────────
|
||||||
|
|
||||||
|
|
||||||
|
def _install_same_peer_neighbors(
    env: simpy.Environment, comp: PeIpcqComponent,
) -> tuple[simpy.Store, simpy.Store]:
    """Install E and W neighbors that BOTH resolve to the same peer (pe=1).

    Models the 2-rank bidirectional ring from the ADR-0025 motivation: the
    E and W neighbors are one and the same peer rank, but each direction
    targets a different rx buffer on it.

    - E's peer.rx_base_pa = 0x10_000 (peer's W-rx buffer)
    - W's peer.rx_base_pa = 0x20_000 (peer's E-rx buffer)
    - my_rx_base_pa: E=0x30_000, W=0x40_000 (local rx for each dir)

    Returns the (E, W) peer credit stores.
    """
    n_slots, slot_size = 4, 4096

    def same_peer_endpoint(rx_base_pa):
        # Identical (sip, cube, pe) for both directions; only the rx base differs.
        return IpcqEndpoint(
            sip=0, cube=0, pe=1,
            buffer_kind="tcm",
            rx_base_pa=rx_base_pa, rx_base_va=0,
            n_slots=n_slots, slot_size=slot_size,
        )

    def init_entry(direction, peer, my_rx_base_pa, credit_store):
        return IpcqInitEntry(
            direction=direction, peer=peer,
            my_rx_base_pa=my_rx_base_pa, my_rx_base_va=0,
            n_slots=n_slots, slot_size=slot_size,
            peer_credit_store=credit_store,
        )

    peer_e_credit = simpy.Store(env)
    peer_w_credit = simpy.Store(env)
    ep_e = same_peer_endpoint(0x10_000)
    ep_w = same_peer_endpoint(0x20_000)

    init_msg = IpcqInitMsg(
        correlation_id="t", request_id="t",
        target_sips=(0,), target_cubes=(0,), target_pe=0,
        entries=(
            init_entry("E", ep_e, 0x30_000, peer_e_credit),
            init_entry("W", ep_w, 0x40_000, peer_w_credit),
        ),
        backpressure_mode="sleep",
        buffer_kind="tcm",
        credit_size_bytes=16,
    )

    # Deliver the init message on the host port and run until it is handled.
    done = env.event()
    comp.in_ports["host"].put(_FakeTxn(request=init_msg, done=done))
    env.run(until=done)
    return peer_e_credit, peer_w_credit
|
||||||
|
|
||||||
|
|
||||||
|
def test_meta_arrival_matches_by_dst_addr_same_peer():
    """ADR-0025 D2: dst_addr selects the queue pair when E and W share a peer.

    In a 2-rank ring both directions point at the same peer, so matching an
    arrival by sender key would let the first matching direction (E) win
    regardless of which rx buffer was written. With address-based matching,
    a dst_addr inside W's rx range (my_rx_base_pa_W .. +n_slots*slot_size)
    must update W, and a dst_addr inside E's rx range must update E.
    """
    env = simpy.Environment()
    comp = _make_pe_ipcq(env)
    _install_same_peer_neighbors(env, comp)

    def deliver(direction, dst_addr, handle_id, src_direction):
        # Post a meta arrival that lands in `direction`'s local rx buffer.
        token = IpcqDmaToken(
            src_addr=0, src_space="tcm",
            dst_addr=dst_addr, dst_endpoint=comp._queue_pairs[direction]["peer"],
            nbytes=64, handle_id=handle_id,
            shape=(8,), dtype="f16",
            sender_seq=0,
            src_sip=0, src_cube=0, src_pe=1, src_direction=src_direction,
        )
        comp.in_ports["host"].put(IpcqMetaArrival(token=token))

    # First arrival: into W's rx buffer (my_rx_base_pa=0x40_000).
    deliver("W", 0x40_000, "w1", "E")
    env.run(until=5)

    # Only W's peer_head_cache may move; E's stays at 0.
    assert comp._queue_pairs["W"]["peer_head_cache"] == 1, (
        "W qp should have been updated because dst_addr is in W's rx range"
    )
    assert comp._queue_pairs["E"]["peer_head_cache"] == 0, (
        "E qp should NOT be updated; current sender-key matching wrongly "
        "picks the first direction with a matching peer"
    )

    # Second arrival: into E's rx buffer (my_rx_base_pa=0x30_000).
    deliver("E", 0x30_000, "e1", "W")
    env.run(until=10)

    assert comp._queue_pairs["E"]["peer_head_cache"] == 1
    assert comp._queue_pairs["W"]["peer_head_cache"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_credit_matches_by_dst_rx_base_pa_same_peer():
    """ADR-0025 D3: a credit is routed by its dst_rx_base_pa.

    The credit carries the receiver-side rx buffer base so the original
    sender can compare it with each qp's peer.rx_base_pa and pick the right
    direction. With sender-key matching, first-match-wins would always
    choose E when E and W share the same peer.
    """
    env = simpy.Environment()
    comp = _make_pe_ipcq(env)
    _install_same_peer_neighbors(env, comp)

    def post_credit(consumer_seq, dst_rx_base_pa, src_direction):
        comp.credit_inbox.put(IpcqCreditMetadata(
            consumer_seq=consumer_seq,
            dst_rx_base_pa=dst_rx_base_pa,
            src_sip=0, src_cube=0, src_pe=1, src_direction=src_direction,
        ))

    # Credit for a send through W: W writes to the peer's rx at 0x20_000
    # (qp_w["peer"].rx_base_pa), so the returning credit names that base and
    # must advance W's peer_tail_cache.
    post_credit(1, 0x20_000, "E")
    env.run(until=5)

    assert comp._queue_pairs["W"]["peer_tail_cache"] == 1, (
        "W's peer_tail_cache should update — credit.dst_rx_base_pa matches "
        "W qp's peer.rx_base_pa"
    )
    assert comp._queue_pairs["E"]["peer_tail_cache"] == 0, (
        "E's peer_tail_cache should NOT update"
    )

    # Credit for the E direction: 0x10_000 is E's peer.rx_base_pa.
    post_credit(2, 0x10_000, "W")
    env.run(until=10)

    assert comp._queue_pairs["E"]["peer_tail_cache"] == 2
    assert comp._queue_pairs["W"]["peer_tail_cache"] == 1
|
||||||
|
|||||||
@@ -1,80 +0,0 @@
|
|||||||
"""Tests for recv_mode='copy_to_dst' (ADR-0023 D9.5)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def test_recv_copy_to_dst_via_simpy_runner():
    """Run a kernel that uses tl.recv(..., dst_addr=..., dst_space=...).

    Each of the 8 ranks sends its own tile E and receives from W into a
    per-rank destination address. Afterwards the destination region for
    rank r must hold the tile that rank (r - 1) % 8 started with.
    """
    from kernbench.policy.placement.dp import DPPolicy
    from kernbench.runtime_api.bench_runner import run_bench
    from kernbench.runtime_api.types import resolve_device
    from kernbench.sim_engine.engine import GraphEngine
    from kernbench.topology.builder import resolve_topology
    from kernbench.common.pe_commands import TensorHandle

    world_size = 8
    n_elem = 8
    tile_nbytes = n_elem * 2  # f16: 2 bytes per element

    def kernel(t_ptr, n_elem, dst_buf_addr, tl):
        rank = tl.program_id(axis=0)
        # NOTE(review): ws is never read; kept in case num_programs has
        # simulator-side effects — confirm before removing.
        ws = tl.num_programs(axis=0)
        nbytes = n_elem * 2
        # Each PE sends own data, then recv into a custom dst slot
        current = TensorHandle(
            id="loc", addr=t_ptr + rank * nbytes,
            shape=(n_elem,), dtype="f16",
            nbytes=nbytes, data=None, space="hbm",
        )
        tl.send(dir="E", src=current)
        # copy_to_dst: move into a per-rank scratch HBM addr
        recv = tl.recv(
            dir="W", shape=(n_elem,), dtype="f16",
            dst_addr=dst_buf_addr + rank * nbytes,
            dst_space="hbm",
        )
        # Sanity: recv handle should now point to our dst addr
        assert recv.addr == dst_buf_addr + rank * nbytes
        assert recv.space == "hbm"

    topo = resolve_topology("topology.yaml")

    def run(torch):
        # The returned plan is not needed here; the call is made for its
        # installation effect only.
        torch.install_ipcq(
            algorithm="ring_allreduce_tcm", world_size_override=world_size,
        )
        a = torch.zeros(
            (1, world_size * n_elem),
            dtype="f16",
            dp=DPPolicy(
                sip="replicate", cube="replicate", pe="column_wise",
                num_sips=1, num_cubes=1,
            ),
            name="copy_in",
        )
        store = torch.engine.memory_store
        base = a._handle.va_base or a._handle.shards[0].pa
        # Seed rank r's tile with the value r + 1.
        for r in range(world_size):
            store.write("hbm", base + r * tile_nbytes,
                        np.full((n_elem,), float(r + 1), dtype=np.float16))

        # Use a separate dst region (synthetic addresses)
        dst_buf = 0xC0FFEE_0000
        torch.launch("ring_allreduce_tcm", kernel, a, n_elem, dst_buf)

        # After the kernel, dst_buf + r*16 should contain rank (r-1)%8's data
        for r in range(world_size):
            arr = store.read("hbm", dst_buf + r * tile_nbytes,
                             shape=(n_elem,), dtype="f16")
            expected = float(((r - 1) % world_size) + 1)
            assert np.allclose(arr, expected), f"rank {r}: got {arr}, expected {expected}"

    result = run_bench(
        topology=topo, bench_fn=run,
        device=resolve_device("all"),
        engine_factory=lambda t, d: GraphEngine(
            getattr(t, "topology_obj", t), enable_data=True
        ),
    )
    assert result.completion.ok
|
|
||||||
@@ -48,8 +48,8 @@ def test_from_numpy_creates_host_tensor():
|
|||||||
assert h._handle is None
|
assert h._handle is None
|
||||||
# Submit a no-op so run_bench has at least one handle.
|
# Submit a no-op so run_bench has at least one handle.
|
||||||
torch.zeros((1, 8), dtype="f16",
|
torch.zeros((1, 8), dtype="f16",
|
||||||
dp=DPPolicy(sip="replicate", cube="replicate", pe="replicate",
|
dp=DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=1, num_cubes=1, num_pes=1),
|
num_cubes=1, num_pes=1),
|
||||||
name="dummy")
|
name="dummy")
|
||||||
|
|
||||||
_run_with(body)
|
_run_with(body)
|
||||||
@@ -63,8 +63,8 @@ def test_copy_and_numpy_single_pe():
|
|||||||
a single-PE (no real sharding) tensor."""
|
a single-PE (no real sharding) tensor."""
|
||||||
|
|
||||||
def body(torch):
|
def body(torch):
|
||||||
dp = DPPolicy(sip="replicate", cube="replicate", pe="replicate",
|
dp = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=1, num_cubes=1, num_pes=1)
|
num_cubes=1, num_pes=1)
|
||||||
t = torch.zeros((1, 16), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, 16), dtype="f16", dp=dp, name="t")
|
||||||
src = np.arange(16, dtype=np.float16).reshape(1, 16)
|
src = np.arange(16, dtype=np.float16).reshape(1, 16)
|
||||||
t.copy_(torch.from_numpy(src))
|
t.copy_(torch.from_numpy(src))
|
||||||
@@ -83,8 +83,8 @@ def test_copy_and_numpy_multi_pe_column_wise():
|
|||||||
|
|
||||||
def body(torch):
|
def body(torch):
|
||||||
n_pe = 8
|
n_pe = 8
|
||||||
dp = DPPolicy(sip="replicate", cube="replicate", pe="column_wise",
|
dp = DPPolicy(cube="replicate", pe="column_wise",
|
||||||
num_sips=1, num_cubes=1, num_pes=n_pe)
|
num_cubes=1, num_pes=n_pe)
|
||||||
t = torch.zeros((1, n_pe * 4), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, n_pe * 4), dtype="f16", dp=dp, name="t")
|
||||||
src = np.arange(n_pe * 4, dtype=np.float16).reshape(1, n_pe * 4)
|
src = np.arange(n_pe * 4, dtype=np.float16).reshape(1, n_pe * 4)
|
||||||
t.copy_(torch.from_numpy(src))
|
t.copy_(torch.from_numpy(src))
|
||||||
@@ -107,8 +107,8 @@ def test_copy_and_numpy_multi_cube():
|
|||||||
n_pe_per_cube = 8
|
n_pe_per_cube = 8
|
||||||
n_cubes = 2
|
n_cubes = 2
|
||||||
total = n_cubes * n_pe_per_cube # 16
|
total = n_cubes * n_pe_per_cube # 16
|
||||||
dp = DPPolicy(sip="replicate", cube="column_wise", pe="column_wise",
|
dp = DPPolicy(cube="column_wise", pe="column_wise",
|
||||||
num_sips=1, num_cubes=n_cubes)
|
num_cubes=n_cubes)
|
||||||
t = torch.zeros((1, total * 4), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, total * 4), dtype="f16", dp=dp, name="t")
|
||||||
src = np.arange(total * 4, dtype=np.float16).reshape(1, total * 4)
|
src = np.arange(total * 4, dtype=np.float16).reshape(1, total * 4)
|
||||||
t.copy_(torch.from_numpy(src))
|
t.copy_(torch.from_numpy(src))
|
||||||
@@ -126,8 +126,8 @@ def test_copy_shape_mismatch_raises():
|
|||||||
"""copy_ with mismatched shapes raises ValueError."""
|
"""copy_ with mismatched shapes raises ValueError."""
|
||||||
|
|
||||||
def body(torch):
|
def body(torch):
|
||||||
dp = DPPolicy(sip="replicate", cube="replicate", pe="replicate",
|
dp = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=1, num_cubes=1, num_pes=1)
|
num_cubes=1, num_pes=1)
|
||||||
t = torch.zeros((1, 8), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, 8), dtype="f16", dp=dp, name="t")
|
||||||
src = np.zeros((1, 16), dtype=np.float16)
|
src = np.zeros((1, 16), dtype=np.float16)
|
||||||
with pytest.raises(ValueError, match="copy_ shape mismatch"):
|
with pytest.raises(ValueError, match="copy_ shape mismatch"):
|
||||||
@@ -143,8 +143,8 @@ def test_setitem_getitem_single_pe():
|
|||||||
"""Scalar and slice assignment on a single-PE tensor round-trips."""
|
"""Scalar and slice assignment on a single-PE tensor round-trips."""
|
||||||
|
|
||||||
def body(torch):
|
def body(torch):
|
||||||
dp = DPPolicy(sip="replicate", cube="replicate", pe="replicate",
|
dp = DPPolicy(cube="replicate", pe="replicate",
|
||||||
num_sips=1, num_cubes=1, num_pes=1)
|
num_cubes=1, num_pes=1)
|
||||||
t = torch.zeros((1, 8), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, 8), dtype="f16", dp=dp, name="t")
|
||||||
|
|
||||||
# Scalar broadcast
|
# Scalar broadcast
|
||||||
@@ -169,8 +169,8 @@ def test_setitem_getitem_multi_pe_shard_aligned():
|
|||||||
def body(torch):
|
def body(torch):
|
||||||
n_pe = 8
|
n_pe = 8
|
||||||
n_elem = 4 # per shard
|
n_elem = 4 # per shard
|
||||||
dp = DPPolicy(sip="replicate", cube="replicate", pe="column_wise",
|
dp = DPPolicy(cube="replicate", pe="column_wise",
|
||||||
num_sips=1, num_cubes=1, num_pes=n_pe)
|
num_cubes=1, num_pes=n_pe)
|
||||||
t = torch.zeros((1, n_pe * n_elem), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, n_pe * n_elem), dtype="f16", dp=dp, name="t")
|
||||||
|
|
||||||
# Write each shard with its rank value
|
# Write each shard with its rank value
|
||||||
@@ -197,8 +197,8 @@ def test_setitem_cross_shard_raises():
|
|||||||
def body(torch):
|
def body(torch):
|
||||||
n_pe = 4
|
n_pe = 4
|
||||||
n_elem = 4
|
n_elem = 4
|
||||||
dp = DPPolicy(sip="replicate", cube="replicate", pe="column_wise",
|
dp = DPPolicy(cube="replicate", pe="column_wise",
|
||||||
num_sips=1, num_cubes=1, num_pes=n_pe)
|
num_cubes=1, num_pes=n_pe)
|
||||||
t = torch.zeros((1, n_pe * n_elem), dtype="f16", dp=dp, name="t")
|
t = torch.zeros((1, n_pe * n_elem), dtype="f16", dp=dp, name="t")
|
||||||
with pytest.raises(NotImplementedError, match="spans multiple shards"):
|
with pytest.raises(NotImplementedError, match="spans multiple shards"):
|
||||||
t[0, 2:6] = 1.0 # crosses shard 0 (0:4) and shard 1 (4:8)
|
t[0, 2:6] = 1.0 # crosses shard 0 (0:4) and shard 1 (4:8)
|
||||||
|
|||||||
+91
-128
@@ -1,157 +1,120 @@
|
|||||||
"""Tests for SIP-level tensor parallelism.
|
"""Tests for SIP-level tensor parallelism — ADR-0026 structural model.
|
||||||
|
|
||||||
Validates:
|
DPPolicy no longer carries a ``sip`` axis (ADR-0026 D1). SIP placement is
|
||||||
SP1. DPPolicy accepts sip field (default "replicate", backward compat)
|
now expressed structurally: each call to ``resolve_dp_policy(target_sip=N)``
|
||||||
SP2. sip="column_wise": tensor K-axis split across SIPs, each SIP gets K//num_sips
|
emits shards pinned to SIP N. Multi-SIP parallelism is composed by calling
|
||||||
SP3. sip="row_wise": tensor M-axis split across SIPs
|
the resolver once per SIP (typically driven by the ADR-0024 launcher, one
|
||||||
SP4. 3-level resolve: sip × cube × pe produces correct flat indices and offsets
|
worker greenlet per rank, each worker using ``torch.ahbm.set_device(rank)``).
|
||||||
SP5. sip="replicate": all SIPs get full copy (existing behavior)
|
|
||||||
SP6. PE_CPU sets num_programs from shard count per cube
|
Covered here:
|
||||||
SP7. End-to-end: TP kernel with sip="column_wise" completes on multi-SIP topology
|
SP1. ``target_sip`` stamps every shard.
|
||||||
|
SP2. Two-SIP placement: union of two resolver calls covers the whole
|
||||||
|
tensor K-axis when the combined bench treats them as column-split.
|
||||||
|
SP3. Same for row-wise.
|
||||||
|
SP4. Cube + PE sharding within a SIP is identical on every SIP; only the ``sip`` field differs.
|
||||||
|
SP5. PE_CPU num_programs contract (unchanged by ADR-0026).
|
||||||
"""
|
"""
|
||||||
import pytest
|
from __future__ import annotations
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
|
from kernbench.policy.placement.dp import DPPolicy, resolve_dp_policy
|
||||||
|
|
||||||
|
|
||||||
# ── SP1. DPPolicy sip field ──────────────────────────────────────────
|
# ── SP1. target_sip stamps shards ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def test_dp_policy_sip_default_replicate():
|
def test_target_sip_stamps_all_shards():
|
||||||
"""DPPolicy without sip= defaults to 'replicate'."""
|
|
||||||
dp = DPPolicy(cube="replicate", pe="column_wise")
|
dp = DPPolicy(cube="replicate", pe="column_wise")
|
||||||
assert dp.sip == "replicate"
|
|
||||||
|
|
||||||
|
|
||||||
def test_dp_policy_sip_column_wise():
|
|
||||||
"""DPPolicy accepts sip='column_wise'."""
|
|
||||||
dp = DPPolicy(sip="column_wise", cube="replicate", pe="column_wise")
|
|
||||||
assert dp.sip == "column_wise"
|
|
||||||
|
|
||||||
|
|
||||||
# ── SP2. sip="column_wise" ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def test_sip_column_wise_splits_across_sips():
|
|
||||||
"""sip='column_wise' with 2 SIPs: each SIP gets K//2 columns."""
|
|
||||||
dp = DPPolicy(sip="column_wise", cube="replicate", pe="column_wise")
|
|
||||||
shards = resolve_dp_policy(
|
shards = resolve_dp_policy(
|
||||||
dp, shape=(128, 256), itemsize=2,
|
dp, shape=(128, 256), itemsize=2,
|
||||||
num_pe=8, num_cubes=1, num_sips=2,
|
num_pe=8, num_cubes=1, target_sip=3,
|
||||||
)
|
)
|
||||||
# 2 SIPs × 1 cube × 8 PEs = 16 shards
|
assert all(s.sip == 3 for s in shards)
|
||||||
assert len(shards) == 16
|
assert all(0 <= s.pe < 8 for s in shards)
|
||||||
|
assert all(s.cube == 0 for s in shards)
|
||||||
# SIP0 shards: first half of K (0 to K//2)
|
|
||||||
# SIP1 shards: second half of K (K//2 to K)
|
|
||||||
total_bytes = 128 * 256 * 2 # 64KB
|
|
||||||
sip0_shards = [s for s in shards if s.pe_index < 8]
|
|
||||||
sip1_shards = [s for s in shards if s.pe_index >= 8]
|
|
||||||
|
|
||||||
# SIP0 offsets start at 0
|
|
||||||
assert sip0_shards[0].offset_bytes == 0
|
|
||||||
# SIP1 offsets start at half
|
|
||||||
assert sip1_shards[0].offset_bytes == total_bytes // 2
|
|
||||||
|
|
||||||
# Total coverage
|
|
||||||
assert sum(s.nbytes for s in sip0_shards) == total_bytes // 2
|
|
||||||
assert sum(s.nbytes for s in sip1_shards) == total_bytes // 2
|
|
||||||
|
|
||||||
|
|
||||||
# ── SP3. sip="row_wise" ──────────────────────────────────────────────
|
# ── SP2. column-wise placement composed across two SIPs ─────────────
|
||||||
|
|
||||||
|
|
||||||
def test_sip_row_wise_splits_across_sips():
|
def test_compose_two_sips_column_wise_covers_tensor():
|
||||||
"""sip='row_wise' with 2 SIPs: each SIP gets M//2 rows."""
|
"""Bench splits K-axis across 2 SIPs by calling resolve twice and
|
||||||
dp = DPPolicy(sip="row_wise", cube="replicate", pe="column_wise")
|
giving each SIP half of the tensor (half-shape + offset). Shards
|
||||||
shards = resolve_dp_policy(
|
from both SIPs together cover the whole K axis."""
|
||||||
|
full_shape = (128, 256)
|
||||||
|
itemsize = 2
|
||||||
|
# Per-SIP half-shape (K split across SIPs).
|
||||||
|
half_shape = (128, 128)
|
||||||
|
dp = DPPolicy(cube="replicate", pe="column_wise")
|
||||||
|
|
||||||
|
shards_sip0 = resolve_dp_policy(
|
||||||
|
dp, shape=half_shape, itemsize=itemsize,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
|
shards_sip1 = resolve_dp_policy(
|
||||||
|
dp, shape=half_shape, itemsize=itemsize,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
total_bytes = full_shape[0] * full_shape[1] * itemsize
|
||||||
|
sip0_bytes = sum(s.nbytes for s in shards_sip0)
|
||||||
|
sip1_bytes = sum(s.nbytes for s in shards_sip1)
|
||||||
|
assert sip0_bytes + sip1_bytes == total_bytes
|
||||||
|
assert all(s.sip == 0 for s in shards_sip0)
|
||||||
|
assert all(s.sip == 1 for s in shards_sip1)
|
||||||
|
|
||||||
|
|
||||||
|
# ── SP3. row-wise placement composed across two SIPs ────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_compose_two_sips_row_wise_covers_tensor():
|
||||||
|
full_shape = (128, 256)
|
||||||
|
itemsize = 2
|
||||||
|
half_shape = (64, 256) # per-SIP half of M
|
||||||
|
dp = DPPolicy(cube="replicate", pe="column_wise")
|
||||||
|
|
||||||
|
shards_sip0 = resolve_dp_policy(
|
||||||
|
dp, shape=half_shape, itemsize=itemsize,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
|
shards_sip1 = resolve_dp_policy(
|
||||||
|
dp, shape=half_shape, itemsize=itemsize,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
total_bytes = full_shape[0] * full_shape[1] * itemsize
|
||||||
|
assert sum(s.nbytes for s in shards_sip0) + sum(s.nbytes for s in shards_sip1) == total_bytes
|
||||||
|
|
||||||
|
|
||||||
|
# ── SP4. cube × PE sharding is independent per SIP ──────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_cube_pe_sharding_independent_per_sip():
|
||||||
|
"""Intra-SIP cube + PE layout matches across SIPs; only sip field differs."""
|
||||||
|
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||||
|
s0 = resolve_dp_policy(
|
||||||
dp, shape=(128, 256), itemsize=2,
|
dp, shape=(128, 256), itemsize=2,
|
||||||
num_pe=8, num_cubes=1, num_sips=2,
|
num_pe=4, num_cubes=2, target_sip=0,
|
||||||
)
|
)
|
||||||
assert len(shards) == 16
|
s1 = resolve_dp_policy(
|
||||||
|
|
||||||
sip0_shards = [s for s in shards if s.pe_index < 8]
|
|
||||||
sip1_shards = [s for s in shards if s.pe_index >= 8]
|
|
||||||
|
|
||||||
# SIP0: rows 0..63, SIP1: rows 64..127
|
|
||||||
total_bytes = 128 * 256 * 2
|
|
||||||
assert sip0_shards[0].offset_bytes == 0
|
|
||||||
assert sip1_shards[0].offset_bytes == total_bytes // 2
|
|
||||||
|
|
||||||
|
|
||||||
# ── SP4. 3-level resolve ─────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def test_3level_resolve_flat_index():
|
|
||||||
"""3-level: sip × cube × pe produces correct flat indices."""
|
|
||||||
dp = DPPolicy(sip="column_wise", cube="replicate", pe="column_wise")
|
|
||||||
shards = resolve_dp_policy(
|
|
||||||
dp, shape=(128, 256), itemsize=2,
|
dp, shape=(128, 256), itemsize=2,
|
||||||
num_pe=8, num_cubes=2, num_sips=2,
|
num_pe=4, num_cubes=2, target_sip=1,
|
||||||
)
|
)
|
||||||
# 2 SIPs × 2 cubes × 8 PEs = 32 shards
|
assert len(s0) == len(s1) == 2 * 4
|
||||||
assert len(shards) == 32
|
for a, b in zip(s0, s1):
|
||||||
|
assert a.sip == 0 and b.sip == 1
|
||||||
# Flat index: sip_id * cubes_per_sip * num_pe + cube_id * num_pe + pe_id
|
assert (a.cube, a.pe, a.offset_bytes, a.nbytes) == (
|
||||||
indices = [s.pe_index for s in shards]
|
b.cube, b.pe, b.offset_bytes, b.nbytes
|
||||||
# SIP0: 0..15, SIP1: 16..31
|
)
|
||||||
assert min(indices) == 0
|
|
||||||
assert max(indices) == 31
|
|
||||||
assert len(set(indices)) == 32 # all unique
|
|
||||||
|
|
||||||
|
|
||||||
def test_3level_offsets_cover_full_tensor():
|
# ── SP5. PE_CPU num_programs (contract unchanged) ───────────────────
|
||||||
"""3-level sharding covers the entire tensor with no gaps."""
|
|
||||||
dp = DPPolicy(sip="column_wise", cube="replicate", pe="column_wise")
|
|
||||||
shards = resolve_dp_policy(
|
|
||||||
dp, shape=(128, 256), itemsize=2,
|
|
||||||
num_pe=4, num_cubes=1, num_sips=2,
|
|
||||||
)
|
|
||||||
# 2 SIPs × 1 cube × 4 PEs = 8 shards
|
|
||||||
# sip="column_wise": K=128 per SIP, pe="column_wise": 32 cols per PE
|
|
||||||
total = 128 * 256 * 2
|
|
||||||
# For non-replicate, total shard bytes == tensor bytes
|
|
||||||
# (replicate within cube means cube shards overlap, but sip shards don't)
|
|
||||||
sip0_bytes = sum(s.nbytes for s in shards if s.pe_index < 4)
|
|
||||||
sip1_bytes = sum(s.nbytes for s in shards if s.pe_index >= 4)
|
|
||||||
assert sip0_bytes + sip1_bytes == total
|
|
||||||
|
|
||||||
|
|
||||||
# ── SP5. sip="replicate" backward compat ─────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def test_sip_replicate_backward_compat():
|
|
||||||
"""sip='replicate' produces same result as before (2-level)."""
|
|
||||||
dp_old = DPPolicy(cube="replicate", pe="column_wise")
|
|
||||||
dp_new = DPPolicy(sip="replicate", cube="replicate", pe="column_wise")
|
|
||||||
|
|
||||||
shards_old = resolve_dp_policy(
|
|
||||||
dp_old, shape=(128, 256), itemsize=2,
|
|
||||||
num_pe=8, num_cubes=2, num_sips=2,
|
|
||||||
)
|
|
||||||
shards_new = resolve_dp_policy(
|
|
||||||
dp_new, shape=(128, 256), itemsize=2,
|
|
||||||
num_pe=8, num_cubes=2, num_sips=2,
|
|
||||||
)
|
|
||||||
assert len(shards_old) == len(shards_new)
|
|
||||||
for a, b in zip(shards_old, shards_new):
|
|
||||||
assert a.pe_index == b.pe_index
|
|
||||||
assert a.offset_bytes == b.offset_bytes
|
|
||||||
assert a.nbytes == b.nbytes
|
|
||||||
|
|
||||||
|
|
||||||
# ── SP6. PE_CPU num_programs ──────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def test_pe_cpu_sets_num_programs():
|
def test_pe_cpu_sets_num_programs():
|
||||||
"""PE_CPU should create TLContext with num_programs = PEs per cube."""
|
"""TLContext reports num_programs from its initializer — used by PE_CPU
|
||||||
# This test validates the interface contract.
|
when it launches a kernel on behalf of its shards."""
|
||||||
# After implementation, PE_CPU should derive num_programs from the
|
|
||||||
# number of PE shards in the kernel launch's target cube.
|
|
||||||
from kernbench.triton_emu.tl_context import TLContext
|
from kernbench.triton_emu.tl_context import TLContext
|
||||||
|
|
||||||
# With 8 PEs per cube, num_programs should be 8
|
|
||||||
tl = TLContext(pe_id=3, num_programs=8)
|
tl = TLContext(pe_id=3, num_programs=8)
|
||||||
assert tl.program_id(0) == 3
|
assert tl.program_id(0) == 3
|
||||||
assert tl.num_programs(0) == 8
|
assert tl.num_programs(0) == 8
|
||||||
|
|||||||
+23
-17
@@ -2,11 +2,13 @@ import pytest
|
|||||||
|
|
||||||
from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator
|
from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator
|
||||||
from kernbench.policy.placement.dp import (
|
from kernbench.policy.placement.dp import (
|
||||||
|
DPPolicy,
|
||||||
ShardSpec,
|
ShardSpec,
|
||||||
column_wise,
|
column_wise,
|
||||||
tiled_column_major,
|
|
||||||
replicate,
|
replicate,
|
||||||
|
resolve_dp_policy,
|
||||||
row_wise,
|
row_wise,
|
||||||
|
tiled_column_major,
|
||||||
tiled_row_major,
|
tiled_row_major,
|
||||||
)
|
)
|
||||||
from kernbench.runtime_api.kernel import (
|
from kernbench.runtime_api.kernel import (
|
||||||
@@ -40,9 +42,9 @@ _CFG = AddressConfig(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _make_allocators(num_pe: int = 8) -> dict[int, PEMemAllocator]:
|
def _make_allocators(num_pe: int = 8) -> dict[tuple[int, int, int], PEMemAllocator]:
|
||||||
return {
|
return {
|
||||||
i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
|
(0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
|
||||||
for i in range(num_pe)
|
for i in range(num_pe)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -133,7 +135,7 @@ def test_column_wise_placement():
|
|||||||
assert len(shards) == 8
|
assert len(shards) == 8
|
||||||
expected_nbytes = 1024 * 64 * 2 # 128 KB
|
expected_nbytes = 1024 * 64 * 2 # 128 KB
|
||||||
for i, s in enumerate(shards):
|
for i, s in enumerate(shards):
|
||||||
assert s.pe_index == i
|
assert s.local_pe == i
|
||||||
assert s.nbytes == expected_nbytes
|
assert s.nbytes == expected_nbytes
|
||||||
# offsets are contiguous
|
# offsets are contiguous
|
||||||
assert shards[0].offset_bytes == 0
|
assert shards[0].offset_bytes == 0
|
||||||
@@ -151,7 +153,7 @@ def test_row_wise_placement():
|
|||||||
assert len(shards) == 8
|
assert len(shards) == 8
|
||||||
expected_nbytes = 128 * 512 * 2 # 128 KB
|
expected_nbytes = 128 * 512 * 2 # 128 KB
|
||||||
for i, s in enumerate(shards):
|
for i, s in enumerate(shards):
|
||||||
assert s.pe_index == i
|
assert s.local_pe == i
|
||||||
assert s.nbytes == expected_nbytes
|
assert s.nbytes == expected_nbytes
|
||||||
assert shards[0].offset_bytes == 0
|
assert shards[0].offset_bytes == 0
|
||||||
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
||||||
@@ -166,7 +168,7 @@ def test_replicate_placement():
|
|||||||
assert len(shards) == 8
|
assert len(shards) == 8
|
||||||
full_nbytes = 1024 * 512 * 2 # 1 MB
|
full_nbytes = 1024 * 512 * 2 # 1 MB
|
||||||
for i, s in enumerate(shards):
|
for i, s in enumerate(shards):
|
||||||
assert s.pe_index == i
|
assert s.local_pe == i
|
||||||
assert s.nbytes == full_nbytes
|
assert s.nbytes == full_nbytes
|
||||||
assert s.offset_bytes == 0 # each is a full copy
|
assert s.offset_bytes == 0 # each is a full copy
|
||||||
|
|
||||||
@@ -188,10 +190,10 @@ def test_tiled_column_major():
|
|||||||
# tile (m=0,k=0) → PE0, tile (m=0,k=1) → PE1, ..., (m=0,k=3) → PE3
|
# tile (m=0,k=0) → PE0, tile (m=0,k=1) → PE1, ..., (m=0,k=3) → PE3
|
||||||
# tile (m=1,k=0) → PE4, tile (m=1,k=1) → PE5, ..., (m=1,k=3) → PE7
|
# tile (m=1,k=0) → PE4, tile (m=1,k=1) → PE5, ..., (m=1,k=3) → PE7
|
||||||
# tile (m=2,k=0) → PE0, ...
|
# tile (m=2,k=0) → PE0, ...
|
||||||
assert shards[0].pe_index == 0
|
assert shards[0].local_pe == 0
|
||||||
assert shards[1].pe_index == 1
|
assert shards[1].local_pe == 1
|
||||||
assert shards[7].pe_index == 7
|
assert shards[7].local_pe == 7
|
||||||
assert shards[8].pe_index == 0 # wraps around
|
assert shards[8].local_pe == 0 # wraps around
|
||||||
# total coverage
|
# total coverage
|
||||||
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
||||||
|
|
||||||
@@ -212,10 +214,10 @@ def test_tiled_row_major():
|
|||||||
# tile (m=0,k=0) → PE0, tile (m=1,k=0) → PE1, ..., (m=3,k=0) → PE3
|
# tile (m=0,k=0) → PE0, tile (m=1,k=0) → PE1, ..., (m=3,k=0) → PE3
|
||||||
# tile (m=0,k=1) → PE4, tile (m=1,k=1) → PE5, ..., (m=3,k=1) → PE7
|
# tile (m=0,k=1) → PE4, tile (m=1,k=1) → PE5, ..., (m=3,k=1) → PE7
|
||||||
# tile (m=0,k=2) → PE0, ...
|
# tile (m=0,k=2) → PE0, ...
|
||||||
assert shards[0].pe_index == 0
|
assert shards[0].local_pe == 0
|
||||||
assert shards[1].pe_index == 1
|
assert shards[1].local_pe == 1
|
||||||
assert shards[7].pe_index == 7
|
assert shards[7].local_pe == 7
|
||||||
assert shards[8].pe_index == 0 # wraps around
|
assert shards[8].local_pe == 0 # wraps around
|
||||||
# total coverage
|
# total coverage
|
||||||
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
|
||||||
|
|
||||||
@@ -226,7 +228,11 @@ def test_tiled_row_major():
|
|||||||
def test_deploy_tensor_hbm():
|
def test_deploy_tensor_hbm():
|
||||||
"""Deploy with column_wise placement → TensorHandle with valid PA shards."""
|
"""Deploy with column_wise placement → TensorHandle with valid PA shards."""
|
||||||
allocs = _make_allocators()
|
allocs = _make_allocators()
|
||||||
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
|
placement = resolve_dp_policy(
|
||||||
|
DPPolicy(cube="replicate", pe="column_wise"),
|
||||||
|
shape=(1024, 512), itemsize=2,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
th = deploy_tensor(
|
th = deploy_tensor(
|
||||||
name="W",
|
name="W",
|
||||||
shape=(1024, 512),
|
shape=(1024, 512),
|
||||||
@@ -253,7 +259,7 @@ def test_deploy_tensor_hbm():
|
|||||||
def test_deploy_tensor_tcm():
|
def test_deploy_tensor_tcm():
|
||||||
"""Deploy with TCM → uses pe_tcm_addr allocation."""
|
"""Deploy with TCM → uses pe_tcm_addr allocation."""
|
||||||
allocs = _make_allocators()
|
allocs = _make_allocators()
|
||||||
placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=256)]
|
placement = [ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=256)]
|
||||||
th = deploy_tensor(
|
th = deploy_tensor(
|
||||||
name="small",
|
name="small",
|
||||||
shape=(128,),
|
shape=(128,),
|
||||||
@@ -271,7 +277,7 @@ def test_deploy_tensor_overflow():
|
|||||||
"""Allocation exceeding PE HBM capacity raises AllocationError."""
|
"""Allocation exceeding PE HBM capacity raises AllocationError."""
|
||||||
allocs = _make_allocators()
|
allocs = _make_allocators()
|
||||||
# 6 GB per PE slice, try to allocate 7 GB
|
# 6 GB per PE slice, try to allocate 7 GB
|
||||||
big_shard = ShardSpec(pe_index=0, offset_bytes=0, nbytes=7 * _GB)
|
big_shard = ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=7 * _GB)
|
||||||
with pytest.raises(AllocationError):
|
with pytest.raises(AllocationError):
|
||||||
deploy_tensor(
|
deploy_tensor(
|
||||||
name="toobig",
|
name="toobig",
|
||||||
|
|||||||
@@ -1,106 +0,0 @@
|
|||||||
"""Tests for tl.recv_async + tl.wait (ADR-0023 D4)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from kernbench.ccl.testing import run_kernel_in_mock
|
|
||||||
|
|
||||||
|
|
||||||
def kernel_async_recv(t_ptr, n_elem, tl):
|
|
||||||
"""Each PE issues recv_async first, then send, then wait — this exercises
|
|
||||||
the non-blocking path. Uses TensorHandle math (PE_MATH) for accumulation
|
|
||||||
so Phase 2 produces correct final HBM contents."""
|
|
||||||
rank = tl.program_id(axis=0)
|
|
||||||
world_size = tl.num_programs(axis=0)
|
|
||||||
nbytes = n_elem * 2
|
|
||||||
|
|
||||||
pe_addr = t_ptr + rank * nbytes
|
|
||||||
acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16")
|
|
||||||
current = acc
|
|
||||||
|
|
||||||
for _step in range(world_size - 1):
|
|
||||||
future = tl.recv_async(dir="W", shape=(n_elem,), dtype="f16")
|
|
||||||
tl.send(dir="E", src=current)
|
|
||||||
recv = tl.wait(future)
|
|
||||||
acc = acc + recv
|
|
||||||
current = recv # forward W's tile to E next round
|
|
||||||
|
|
||||||
tl.store(pe_addr, acc)
|
|
||||||
|
|
||||||
|
|
||||||
def test_recv_async_mock_runtime():
|
|
||||||
n_elem = 8
|
|
||||||
inputs = [
|
|
||||||
np.full((n_elem,), float(r + 1), dtype=np.float16)
|
|
||||||
for r in range(4)
|
|
||||||
]
|
|
||||||
expected = sum(inputs)
|
|
||||||
|
|
||||||
outputs = run_kernel_in_mock(
|
|
||||||
kernel_fn=kernel_async_recv,
|
|
||||||
world_size=4,
|
|
||||||
topology="ring_1d",
|
|
||||||
inputs=inputs,
|
|
||||||
kernel_args=(n_elem,),
|
|
||||||
)
|
|
||||||
for r in range(4):
|
|
||||||
assert np.allclose(outputs[r], expected)
|
|
||||||
|
|
||||||
|
|
||||||
def test_recv_async_simpy_runner():
|
|
||||||
"""Run the async kernel through the real SimPy stack via the
|
|
||||||
install_ipcq + launch path.
|
|
||||||
"""
|
|
||||||
import importlib
|
|
||||||
|
|
||||||
from kernbench.runtime_api.bench_runner import run_bench
|
|
||||||
from kernbench.runtime_api.types import resolve_device
|
|
||||||
from kernbench.sim_engine.engine import GraphEngine
|
|
||||||
from kernbench.topology.builder import resolve_topology
|
|
||||||
|
|
||||||
# Re-use the standard 8-PE bench skeleton but swap in the async kernel.
|
|
||||||
topo = resolve_topology("topology.yaml")
|
|
||||||
|
|
||||||
# Build a tiny inline bench module
|
|
||||||
import types
|
|
||||||
mod = types.ModuleType("inline_bench_async")
|
|
||||||
|
|
||||||
from kernbench.policy.placement.dp import DPPolicy
|
|
||||||
|
|
||||||
def run(torch):
|
|
||||||
plan = torch.install_ipcq(
|
|
||||||
algorithm="ring_allreduce_tcm", world_size_override=8,
|
|
||||||
)
|
|
||||||
a = torch.zeros(
|
|
||||||
(1, 8 * 8),
|
|
||||||
dtype="f16",
|
|
||||||
dp=DPPolicy(
|
|
||||||
sip="replicate", cube="replicate", pe="column_wise",
|
|
||||||
num_sips=1, num_cubes=1,
|
|
||||||
),
|
|
||||||
name="async_in",
|
|
||||||
)
|
|
||||||
store = torch.engine.memory_store
|
|
||||||
base = a._handle.va_base or a._handle.shards[0].pa
|
|
||||||
nbytes = 8 * 2
|
|
||||||
for r in range(8):
|
|
||||||
store.write("hbm", base + r * nbytes,
|
|
||||||
np.full((8,), float(r + 1), dtype=np.float16))
|
|
||||||
|
|
||||||
torch.launch("ring_allreduce_tcm", kernel_async_recv, a, 8)
|
|
||||||
|
|
||||||
for r in range(8):
|
|
||||||
result = store.read("hbm", base + r * nbytes, shape=(8,), dtype="f16")
|
|
||||||
expected = float(sum(range(1, 9))) # 36
|
|
||||||
assert np.allclose(result, expected, rtol=1e-2, atol=1e-2), \
|
|
||||||
f"rank {r}: got {result}, expected {expected}"
|
|
||||||
|
|
||||||
mod.run = run
|
|
||||||
result = run_bench(
|
|
||||||
topology=topo, bench_fn=mod.run,
|
|
||||||
device=resolve_device("all"),
|
|
||||||
engine_factory=lambda t, d: GraphEngine(
|
|
||||||
getattr(t, "topology_obj", t), enable_data=True
|
|
||||||
),
|
|
||||||
)
|
|
||||||
assert result.completion.ok
|
|
||||||
@@ -0,0 +1,234 @@
|
|||||||
|
"""ADR-0027 T2: TP layer shape + numerical correctness (D4/D5).
|
||||||
|
|
||||||
|
Phase 1: ``kernbench.tp.layers`` doesn't exist → import failure. Phase 2
|
||||||
|
lands D4/D5 and T2 passes with deterministic non-zero weight patterns.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _make_ctx(topology):
|
||||||
|
from kernbench.runtime_api.context import RuntimeContext
|
||||||
|
from kernbench.runtime_api.types import DeviceSelector
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
|
||||||
|
engine = GraphEngine(topology.topology_obj, enable_data=True)
|
||||||
|
return RuntimeContext(
|
||||||
|
engine=engine,
|
||||||
|
target_device=DeviceSelector("all"),
|
||||||
|
correlation_id="test_t2",
|
||||||
|
spec=topology.topology_obj.spec,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Shape / structural ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_column_parallel_weight_shape_per_rank(topology):
|
||||||
|
"""ColumnParallelLinear weight per rank is (in_features, out_features // ws)."""
|
||||||
|
import kernbench.tp as tp
|
||||||
|
from kernbench.runtime_api.tensor import Tensor
|
||||||
|
|
||||||
|
with _make_ctx(topology) as ctx:
|
||||||
|
ctx.distributed.init_process_group(backend="ahbm")
|
||||||
|
ws = ctx.distributed.get_world_size()
|
||||||
|
tp.initialize_model_parallel(ws)
|
||||||
|
|
||||||
|
def _worker(rank: int):
|
||||||
|
ctx.ahbm.set_device(rank)
|
||||||
|
fc = tp.ColumnParallelLinear(
|
||||||
|
in_features=256, out_features=512, torch=ctx,
|
||||||
|
)
|
||||||
|
assert fc.weight.shape == (256, 512 // ws)
|
||||||
|
|
||||||
|
ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
|
|
||||||
|
|
||||||
|
def test_row_parallel_weight_shape_per_rank(topology):
|
||||||
|
"""RowParallelLinear weight per rank is (in_features // ws, out_features)."""
|
||||||
|
import kernbench.tp as tp
|
||||||
|
|
||||||
|
with _make_ctx(topology) as ctx:
|
||||||
|
ctx.distributed.init_process_group(backend="ahbm")
|
||||||
|
ws = ctx.distributed.get_world_size()
|
||||||
|
tp.initialize_model_parallel(ws)
|
||||||
|
|
||||||
|
def _worker(rank: int):
|
||||||
|
ctx.ahbm.set_device(rank)
|
||||||
|
fc = tp.RowParallelLinear(
|
||||||
|
in_features=512, out_features=256, torch=ctx,
|
||||||
|
)
|
||||||
|
assert fc.weight.shape == (512 // ws, 256)
|
||||||
|
|
||||||
|
ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
|
|
||||||
|
|
||||||
|
# ── T2.a: ColumnParallel deterministic numerical ─────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_column_parallel_forward_matches_matmul(topology):
|
||||||
|
"""T2.a: ColumnParallelLinear.forward output == x @ W_rank (rtol 1e-2)."""
|
||||||
|
import kernbench.tp as tp
|
||||||
|
from kernbench.runtime_api.tensor import Tensor
|
||||||
|
|
||||||
|
with _make_ctx(topology) as ctx:
|
||||||
|
ctx.distributed.init_process_group(backend="ahbm")
|
||||||
|
ws = ctx.distributed.get_world_size()
|
||||||
|
tp.initialize_model_parallel(ws)
|
||||||
|
|
||||||
|
M = 4
|
||||||
|
D_in, D_out = 32, 32 * ws
|
||||||
|
|
||||||
|
def _worker(rank: int):
|
||||||
|
ctx.ahbm.set_device(rank)
|
||||||
|
fc = tp.ColumnParallelLinear(
|
||||||
|
in_features=D_in, out_features=D_out, torch=ctx,
|
||||||
|
)
|
||||||
|
# Deterministic non-zero weight: rank-scaled constant.
|
||||||
|
k_local = D_out // ws
|
||||||
|
weight_np = np.full(
|
||||||
|
(D_in, k_local), 0.01 * (rank + 1), dtype=np.float16,
|
||||||
|
)
|
||||||
|
src = Tensor(shape=(D_in, k_local), dtype="f16", name="host_w")
|
||||||
|
src._host_buffer = weight_np
|
||||||
|
fc.weight.copy_(src)
|
||||||
|
|
||||||
|
# Input: full-replicated constant.
|
||||||
|
x_np = np.full((M, D_in), 0.5, dtype=np.float16)
|
||||||
|
x = ctx.zeros(
|
||||||
|
(M, D_in), dtype="f16",
|
||||||
|
dp=_replicate_dp(), name=f"t2a_x_r{rank}",
|
||||||
|
)
|
||||||
|
hx = Tensor(shape=x_np.shape, dtype="f16", name="host_x")
|
||||||
|
hx._host_buffer = x_np
|
||||||
|
x.copy_(hx)
|
||||||
|
|
||||||
|
y = fc.forward(x)
|
||||||
|
out = y.numpy()
|
||||||
|
|
||||||
|
expected = x_np.astype(np.float32) @ weight_np.astype(np.float32)
|
||||||
|
assert out.shape == (M, k_local)
|
||||||
|
assert np.allclose(out.astype(np.float32), expected,
|
||||||
|
rtol=1e-2, atol=1e-2), (
|
||||||
|
f"rank {rank}: output does not match x @ W_local"
|
||||||
|
)
|
||||||
|
|
||||||
|
ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
|
|
||||||
|
|
||||||
|
# ── T2.b: RowParallel observable equality ────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_row_parallel_forward_concat_matmul_equality(topology):
|
||||||
|
"""T2.b (primary): RowParallel output == concat(x) @ concat(W) (all-reduced)."""
|
||||||
|
import kernbench.tp as tp
|
||||||
|
from kernbench.runtime_api.tensor import Tensor
|
||||||
|
|
||||||
|
with _make_ctx(topology) as ctx:
|
||||||
|
ctx.distributed.init_process_group(backend="ahbm")
|
||||||
|
ws = ctx.distributed.get_world_size()
|
||||||
|
tp.initialize_model_parallel(ws)
|
||||||
|
|
||||||
|
M = 4
|
||||||
|
D_in, D_out = 32 * ws, 32 # D_in must be evenly divisible by ws
|
||||||
|
results: dict[int, np.ndarray] = {}
|
||||||
|
|
||||||
|
def _worker(rank: int):
|
||||||
|
ctx.ahbm.set_device(rank)
|
||||||
|
fc = tp.RowParallelLinear(
|
||||||
|
in_features=D_in, out_features=D_out, torch=ctx,
|
||||||
|
)
|
||||||
|
# Per-rank W_k = constant 0.01 * (rank + 1)
|
||||||
|
n_local = D_in // ws
|
||||||
|
weight_np = np.full(
|
||||||
|
(n_local, D_out), 0.01 * (rank + 1), dtype=np.float16,
|
||||||
|
)
|
||||||
|
src = Tensor(shape=weight_np.shape, dtype="f16", name="host_w")
|
||||||
|
src._host_buffer = weight_np
|
||||||
|
fc.weight.copy_(src)
|
||||||
|
|
||||||
|
# Input x_k = constant 0.1 * (rank + 1) (pretending it was
|
||||||
|
# column-sharded from upstream).
|
||||||
|
x_np = np.full((M, n_local), 0.1 * (rank + 1), dtype=np.float16)
|
||||||
|
x = ctx.zeros(
|
||||||
|
(M, n_local), dtype="f16",
|
||||||
|
dp=_replicate_dp(), name=f"t2b_x_r{rank}",
|
||||||
|
)
|
||||||
|
hx = Tensor(shape=x_np.shape, dtype="f16", name="host_x")
|
||||||
|
hx._host_buffer = x_np
|
||||||
|
x.copy_(hx)
|
||||||
|
|
||||||
|
y = fc.forward(x)
|
||||||
|
results[rank] = y.numpy().astype(np.float32)
|
||||||
|
|
||||||
|
ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
|
|
||||||
|
# Host-side reference: compute sum_r (x_r @ W_r) = y (same on all ranks).
|
||||||
|
expected = np.zeros((M, D_out), dtype=np.float32)
|
||||||
|
n_local = D_in // ws
|
||||||
|
for r in range(ws):
|
||||||
|
x_r = np.full((M, n_local), 0.1 * (r + 1), dtype=np.float32)
|
||||||
|
w_r = np.full((n_local, D_out), 0.01 * (r + 1), dtype=np.float32)
|
||||||
|
expected += x_r @ w_r
|
||||||
|
|
||||||
|
for r, out in results.items():
|
||||||
|
assert np.allclose(out, expected, rtol=1e-2, atol=1e-2), (
|
||||||
|
f"rank {r}: all-reduced output != expected partial sum"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── T2.c: rank-consistency post all-reduce ───────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_row_parallel_rank_identity_post_all_reduce(topology):
|
||||||
|
"""T2.c: after all_reduce, all ranks see elementwise-identical output."""
|
||||||
|
import kernbench.tp as tp
|
||||||
|
from kernbench.runtime_api.tensor import Tensor
|
||||||
|
|
||||||
|
with _make_ctx(topology) as ctx:
|
||||||
|
ctx.distributed.init_process_group(backend="ahbm")
|
||||||
|
ws = ctx.distributed.get_world_size()
|
||||||
|
tp.initialize_model_parallel(ws)
|
||||||
|
|
||||||
|
M = 2
|
||||||
|
D_in, D_out = 16 * ws, 16
|
||||||
|
results: dict[int, np.ndarray] = {}
|
||||||
|
|
||||||
|
def _worker(rank: int):
|
||||||
|
ctx.ahbm.set_device(rank)
|
||||||
|
fc = tp.RowParallelLinear(
|
||||||
|
in_features=D_in, out_features=D_out, torch=ctx,
|
||||||
|
)
|
||||||
|
n_local = D_in // ws
|
||||||
|
weight_np = np.full((n_local, D_out), 0.01, dtype=np.float16)
|
||||||
|
src = Tensor(shape=weight_np.shape, dtype="f16", name="host_w")
|
||||||
|
src._host_buffer = weight_np
|
||||||
|
fc.weight.copy_(src)
|
||||||
|
|
||||||
|
x_np = np.full((M, n_local), 0.1, dtype=np.float16)
|
||||||
|
x = ctx.zeros(
|
||||||
|
(M, n_local), dtype="f16",
|
||||||
|
dp=_replicate_dp(), name=f"t2c_x_r{rank}",
|
||||||
|
)
|
||||||
|
hx = Tensor(shape=x_np.shape, dtype="f16", name="host_x")
|
||||||
|
hx._host_buffer = x_np
|
||||||
|
x.copy_(hx)
|
||||||
|
|
||||||
|
y = fc.forward(x)
|
||||||
|
results[rank] = y.numpy()
|
||||||
|
|
||||||
|
ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
|
|
||||||
|
ref = results[0]
|
||||||
|
for r, out in results.items():
|
||||||
|
assert np.allclose(out, ref, rtol=1e-2, atol=1e-2), (
|
||||||
|
f"rank {r} output differs from rank 0 — all_reduce failed to make "
|
||||||
|
f"outputs elementwise identical"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _replicate_dp():
    """Return a ``DPPolicy`` that replicates on both the cube and PE axes.

    The policy is built with ``num_cubes=1`` and ``num_pes=1``.
    """
    from kernbench.policy.placement.dp import DPPolicy

    policy = DPPolicy(
        cube="replicate",
        pe="replicate",
        num_cubes=1,
        num_pes=1,
    )
    return policy
|
||||||
@@ -0,0 +1,238 @@
|
|||||||
|
"""ADR-0027 T6: End-to-end 2-layer MLP with TP.
|
||||||
|
|
||||||
|
Phase 1: fails at imports. Phase 2 lands the TP package + D7 bench pattern
|
||||||
|
and these pass with numerical-correctness checks.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _make_ctx(topology):
    """Build the ``RuntimeContext`` used by the T6 tests.

    A ``GraphEngine`` is created from ``topology.topology_obj`` with
    ``enable_data=True`` and wrapped in a context that targets all devices.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    sim_engine = GraphEngine(topology.topology_obj, enable_data=True)
    ctx_kwargs = {
        "engine": sim_engine,
        "target_device": DeviceSelector("all"),
        "correlation_id": "test_t6",
        "spec": topology.topology_obj.spec,
    }
    return RuntimeContext(**ctx_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _replicate_dp():
    """Return a ``DPPolicy`` that replicates on both the cube and PE axes.

    The policy is built with ``num_cubes=1`` and ``num_pes=1``.
    """
    from kernbench.policy.placement.dp import DPPolicy

    policy = DPPolicy(
        cube="replicate",
        pe="replicate",
        num_cubes=1,
        num_pes=1,
    )
    return policy
|
||||||
|
|
||||||
|
|
||||||
|
# ── T6.a: zero-weight smoke ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_mlp_zero_weight_produces_zero_output(topology):
    """T6.a: zero-init weight → output ≈ 0 for every rank.

    Each rank builds a ColumnParallel → RowParallel MLP without loading any
    weights, feeds a constant 0.1 input and records the final output.
    NOTE(review): this relies on the layers' default weight
    initialisation being zero — confirm against ``kernbench.tp``.

    The test first checks that one output per rank was recorded, so the
    verification loop cannot pass vacuously on an empty result set.
    """
    import kernbench.tp as tp
    from kernbench.runtime_api.tensor import Tensor

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        ws = ctx.distributed.get_world_size()
        tp.initialize_model_parallel(ws)

        B, D_in, D_hidden, D_out = 1, 32, 32 * ws, 32
        outputs: dict[int, np.ndarray] = {}

        def _worker(rank: int):
            ctx.ahbm.set_device(rank)
            fc1 = tp.ColumnParallelLinear(D_in, D_hidden, torch=ctx)
            fc2 = tp.RowParallelLinear(D_hidden, D_out, torch=ctx)

            # Non-zero input, staged through a host-side tensor.
            x = ctx.zeros((B, D_in), dtype="f16",
                          dp=_replicate_dp(), name=f"t6a_x_r{rank}")
            hx = Tensor(shape=(B, D_in), dtype="f16", name="host_x")
            hx._host_buffer = np.full((B, D_in), 0.1, dtype=np.float16)
            x.copy_(hx)

            h = fc1.forward(x)
            y = fc2.forward(h)
            outputs[rank] = y.numpy()

        ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)

        # Guard against a vacuous pass: every rank must have reported.
        assert len(outputs) == ws, (
            f"expected one output per rank ({ws}); got ranks {sorted(outputs)}"
        )

        for r, out in outputs.items():
            assert np.allclose(out, 0.0, atol=1e-2), (
                f"rank {r}: zero-weight output should be ~0; got mean={out.mean()}"
            )
|
||||||
|
|
||||||
|
|
||||||
|
# ── T6.b: deterministic weight + numerical check ─────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_mlp_deterministic_weight_matches_reference(topology):
    """T6.b: non-zero deterministic weights → output matches numpy reference.

    Each rank loads constant weight shards into a ColumnParallel →
    RowParallel MLP, feeds a constant input and records the final output
    as float32. The outputs are compared with ``x @ W1 @ W2`` computed on
    the host in float32 from the full (unsharded) constant matrices.

    The test first checks that one output per rank was recorded, so the
    verification loop cannot pass vacuously on an empty result set.
    """
    import kernbench.tp as tp
    from kernbench.runtime_api.tensor import Tensor

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        ws = ctx.distributed.get_world_size()
        tp.initialize_model_parallel(ws)

        B, D_in, D_hidden, D_out = 1, 16, 16 * ws, 16
        # W1 (D_in, D_hidden) — column-sharded; per rank: (D_in, D_hidden/ws)
        # W2 (D_hidden, D_out) — row-sharded; per rank: (D_hidden/ws, D_out)
        # Constant values: W1 = 0.02, W2 = 0.03, x = 0.1 (all fp16).
        X_VAL, W1_VAL, W2_VAL = 0.1, 0.02, 0.03

        outputs: dict[int, np.ndarray] = {}

        def _load(dst, values: np.ndarray, host_name: str) -> None:
            # Stage ``values`` in a host-side tensor and copy it into ``dst``.
            host = Tensor(shape=values.shape, dtype="f16", name=host_name)
            host._host_buffer = values
            dst.copy_(host)

        def _worker(rank: int):
            ctx.ahbm.set_device(rank)
            fc1 = tp.ColumnParallelLinear(D_in, D_hidden, torch=ctx)
            fc2 = tp.RowParallelLinear(D_hidden, D_out, torch=ctx)

            # W1 slice (per rank column slice)
            k_local_1 = D_hidden // ws
            _load(
                fc1.weight,
                np.full((D_in, k_local_1), W1_VAL, dtype=np.float16),
                "host_w1",
            )

            # W2 slice (per rank row slice)
            n_local_2 = D_hidden // ws
            _load(
                fc2.weight,
                np.full((n_local_2, D_out), W2_VAL, dtype=np.float16),
                "host_w2",
            )

            # Input x (full-replicated constant)
            x = ctx.zeros((B, D_in), dtype="f16",
                          dp=_replicate_dp(), name=f"t6b_x_r{rank}")
            _load(x, np.full((B, D_in), X_VAL, dtype=np.float16), "host_x")

            h = fc1.forward(x)
            y = fc2.forward(h)
            outputs[rank] = y.numpy().astype(np.float32)

        ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)

        # Guard against a vacuous pass: every rank must have reported.
        assert len(outputs) == ws, (
            f"expected one output per rank ({ws}); got ranks {sorted(outputs)}"
        )

        # Host reference: y = x @ W1_full @ W2_full
        w1_full = np.full((D_in, D_hidden), W1_VAL, dtype=np.float32)
        w2_full = np.full((D_hidden, D_out), W2_VAL, dtype=np.float32)
        x_full = np.full((B, D_in), X_VAL, dtype=np.float32)
        expected = x_full @ w1_full @ w2_full

        for r, out in outputs.items():
            assert out.shape == (B, D_out)
            assert np.allclose(out, expected, rtol=1e-2, atol=1e-2), (
                f"rank {r}: MLP output != reference "
                f"(got mean={out.mean():.4f}, expected={expected.mean():.4f})"
            )
|
||||||
|
|
||||||
|
|
||||||
|
# ── T6.c: rank-consistency after final all_reduce ────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_mlp_rank_consistency_after_all_reduce(topology):
    """T6.c: all ranks see elementwise-identical final output.

    Each rank runs the ColumnParallel → RowParallel MLP on a constant 0.1
    input without loading weights and records the final output. Every
    recorded output must be exactly equal to rank 0's.

    The test first checks that one output per rank was recorded; without
    that check the comparison holds trivially if only rank 0 reported.
    """
    import kernbench.tp as tp
    from kernbench.runtime_api.tensor import Tensor

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        ws = ctx.distributed.get_world_size()
        tp.initialize_model_parallel(ws)

        B, D_in, D_hidden, D_out = 1, 16, 16 * ws, 16
        outputs: dict[int, np.ndarray] = {}

        def _worker(rank: int):
            ctx.ahbm.set_device(rank)
            fc1 = tp.ColumnParallelLinear(D_in, D_hidden, torch=ctx)
            fc2 = tp.RowParallelLinear(D_hidden, D_out, torch=ctx)

            # Zero weights OK for this check — just need all_reduce to run.
            x = ctx.zeros((B, D_in), dtype="f16",
                          dp=_replicate_dp(), name=f"t6c_x_r{rank}")
            hx = Tensor(shape=(B, D_in), dtype="f16", name="host_x")
            hx._host_buffer = np.full((B, D_in), 0.1, dtype=np.float16)
            x.copy_(hx)

            h = fc1.forward(x)
            y = fc2.forward(h)
            outputs[rank] = y.numpy()

        ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)

        # Guard against a vacuous pass: every rank must have reported.
        assert len(outputs) == ws, (
            f"expected one output per rank ({ws}); got ranks {sorted(outputs)}"
        )

        ref = outputs[0]
        for r, out in outputs.items():
            assert np.array_equal(out, ref), (
                f"rank {r} output differs from rank 0 — all-reduce should "
                f"make every rank see the same final tensor"
            )
|
||||||
|
|
||||||
|
|
||||||
|
# ── T6.d: shape contract ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_mlp_shape_contract(topology):
    """T6.d: ColumnParallel → (B, D_hidden/ws); RowParallel → (B, D_out).

    The shape assertions run inside each rank's worker, directly after the
    corresponding ``forward`` call.
    """
    import kernbench.tp as tp

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        ws = ctx.distributed.get_world_size()
        tp.initialize_model_parallel(ws)

        B = 1
        D_in = 16
        D_hidden = 16 * ws
        D_out = 16

        def _worker(rank: int):
            ctx.ahbm.set_device(rank)
            col_layer = tp.ColumnParallelLinear(D_in, D_hidden, torch=ctx)
            row_layer = tp.RowParallelLinear(D_hidden, D_out, torch=ctx)

            x = ctx.zeros(
                (B, D_in),
                dtype="f16",
                dp=_replicate_dp(),
                name=f"t6d_x_r{rank}",
            )

            # First layer: each rank holds its own slice of the hidden dim.
            h = col_layer.forward(x)
            assert h.shape == (B, D_hidden // ws), (
                f"ColumnParallel output shape: {h.shape} != (B, D_hidden/ws)"
            )

            # Second layer: output has the full D_out width.
            y = row_layer.forward(h)
            assert y.shape == (B, D_out), (
                f"RowParallel output shape: {y.shape} != (B, D_out)"
            )

        ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
|
|
||||||
|
|
||||||
|
# ── liveness: no deadlock (verified indirectly via pytest timeout) ───
|
||||||
|
|
||||||
|
|
||||||
|
def test_mlp_completes_without_deadlock(topology):
    """Structural: the full E2E spawn returns within a reasonable wall-clock.

    There is no explicit assertion; the test passes when ``spawn`` returns.
    It relies on the test suite's overall timeout harness: a hang beyond
    ~60s would surface as a pytest timeout, which is how a deadlock
    regression in the scheduler loop would manifest here.
    """
    import kernbench.tp as tp

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        ws = ctx.distributed.get_world_size()
        tp.initialize_model_parallel(ws)

        hidden = 16 * ws

        def _worker(rank: int):
            ctx.ahbm.set_device(rank)
            col_layer = tp.ColumnParallelLinear(16, hidden, torch=ctx)
            row_layer = tp.RowParallelLinear(hidden, 16, torch=ctx)
            x = ctx.zeros(
                (1, 16),
                dtype="f16",
                dp=_replicate_dp(),
                name=f"t6live_r{rank}",
            )
            y = row_layer.forward(col_layer.forward(x))
            # Read the result back to the host; the value itself is unused.
            y.numpy()

        ctx.multiprocessing.spawn(_worker, args=(), nprocs=ws)
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
"""ADR-0027 T1: TP parallel_state (D3).
|
||||||
|
|
||||||
|
Phase 1: ``kernbench.tp`` module does not exist yet — tests fail at import.
|
||||||
|
Phase 2 (D2/D3) lands the package and these pass.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _make_ctx(topology):
    """Build the ``RuntimeContext`` used by the T1 tests.

    A ``GraphEngine`` is created from ``topology.topology_obj`` with
    ``enable_data=True`` and wrapped in a context that targets all devices.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    sim_engine = GraphEngine(topology.topology_obj, enable_data=True)
    ctx_kwargs = {
        "engine": sim_engine,
        "target_device": DeviceSelector("all"),
        "correlation_id": "test_t1",
        "spec": topology.topology_obj.spec,
    }
    return RuntimeContext(**ctx_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tp_package_importable():
    """D2: ``kernbench.tp`` imports and exposes the parallel-state entry points."""
    import kernbench.tp as tp

    required_names = (
        "initialize_model_parallel",
        "get_tensor_model_parallel_world_size",
        "get_tensor_model_parallel_rank",
    )
    for attr_name in required_names:
        assert hasattr(tp, attr_name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_initialize_model_parallel_matches_world_size(topology, tmp_path, monkeypatch):
    """D3: initialising TP with the dist world size is accepted and reported back.

    Only the matching-size path is exercised here; ``tmp_path`` and
    ``monkeypatch`` are requested but not used by the body.
    """
    import kernbench.tp as tp

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        world_size = ctx.distributed.get_world_size()

        tp.initialize_model_parallel(world_size)
        reported = tp.get_tensor_model_parallel_world_size()
        assert reported == world_size
|
||||||
|
|
||||||
|
|
||||||
|
def test_initialize_mismatched_ws_raises(topology):
    """D3: a TP size different from the dist world size raises NotImplementedError."""
    import kernbench.tp as tp

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        world_size = ctx.distributed.get_world_size()

        # One more than the world size is guaranteed to be a mismatch.
        mismatched = world_size + 1
        with pytest.raises(NotImplementedError):
            tp.initialize_model_parallel(mismatched)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_tp_rank_is_greenlet_local(topology):
    """D3: ``get_tensor_model_parallel_rank`` returns the greenlet-local rank
    (delegates to torch.distributed.get_rank, ADR-0024 D9).

    Every spawned worker records the TP rank it observes; together the
    recorded ranks must be exactly ``0 .. world_size - 1``.
    """
    import kernbench.tp as tp

    with _make_ctx(topology) as ctx:
        ctx.distributed.init_process_group(backend="ahbm")
        ws = ctx.distributed.get_world_size()
        tp.initialize_model_parallel(ws)

        seen_ranks: list[int] = []

        def _record_rank(rank: int):
            # The spawn-provided ``rank`` is ignored; the TP API is queried.
            tp_rank = tp.get_tensor_model_parallel_rank()
            seen_ranks.append(tp_rank)

        ctx.multiprocessing.spawn(_record_rank, args=(), nprocs=ws)

        expected_ranks = list(range(ws))
        assert sorted(seen_ranks) == expected_ranks
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_world_size_before_init_raises():
    """D3: uninitialised TP group → accessing world_size fails informatively."""
    from kernbench.tp import parallel_state

    # Clear any state left behind by earlier tests (or parallel workers).
    parallel_state._reset_for_tests()

    acceptable_errors = (RuntimeError, AssertionError, TypeError)
    with pytest.raises(acceptable_errors):
        # Presumably the "+ 0" turns a non-numeric return (e.g. None) into a
        # TypeError so that case is caught as well — TODO confirm.
        world_size = parallel_state.get_tensor_model_parallel_world_size()
        _ = world_size + 0
|
||||||
@@ -12,7 +12,7 @@ import pytest
|
|||||||
from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
|
from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
|
||||||
from kernbench.policy.address.pe_mmu import PeMMU
|
from kernbench.policy.address.pe_mmu import PeMMU
|
||||||
from kernbench.policy.address.va_allocator import VirtualAllocator
|
from kernbench.policy.address.va_allocator import VirtualAllocator
|
||||||
from kernbench.policy.placement.dp import column_wise, ShardSpec
|
from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
|
||||||
from kernbench.runtime_api.tensor import (
|
from kernbench.runtime_api.tensor import (
|
||||||
TensorHandle,
|
TensorHandle,
|
||||||
TensorShard,
|
TensorShard,
|
||||||
@@ -37,9 +37,9 @@ _CFG = AddressConfig(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _make_allocators(num_pe: int = 8) -> dict[int, PEMemAllocator]:
|
def _make_allocators(num_pe: int = 8) -> dict[tuple[int, int, int], PEMemAllocator]:
|
||||||
return {
|
return {
|
||||||
i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
|
(0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
|
||||||
for i in range(num_pe)
|
for i in range(num_pe)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,7 +88,11 @@ def test_deploy_tensor_assigns_va_base():
|
|||||||
"""deploy_tensor with VA allocator assigns va_base to TensorHandle."""
|
"""deploy_tensor with VA allocator assigns va_base to TensorHandle."""
|
||||||
allocs = _make_allocators()
|
allocs = _make_allocators()
|
||||||
va_alloc = _make_va_allocator()
|
va_alloc = _make_va_allocator()
|
||||||
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
|
placement = resolve_dp_policy(
|
||||||
|
DPPolicy(cube="replicate", pe="column_wise"),
|
||||||
|
shape=(1024, 512), itemsize=2,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
|
|
||||||
th = deploy_tensor(
|
th = deploy_tensor(
|
||||||
name="W",
|
name="W",
|
||||||
@@ -107,7 +111,11 @@ def test_deploy_tensor_va_covers_all_shards():
|
|||||||
"""VA allocation covers the entire tensor; each shard is at va_base + offset."""
|
"""VA allocation covers the entire tensor; each shard is at va_base + offset."""
|
||||||
allocs = _make_allocators()
|
allocs = _make_allocators()
|
||||||
va_alloc = _make_va_allocator()
|
va_alloc = _make_va_allocator()
|
||||||
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
|
placement = resolve_dp_policy(
|
||||||
|
DPPolicy(cube="replicate", pe="column_wise"),
|
||||||
|
shape=(1024, 512), itemsize=2,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
|
|
||||||
th = deploy_tensor(
|
th = deploy_tensor(
|
||||||
name="W",
|
name="W",
|
||||||
@@ -128,7 +136,11 @@ def test_deploy_tensor_does_not_install_mmu_mappings():
|
|||||||
allocs = _make_allocators()
|
allocs = _make_allocators()
|
||||||
va_alloc = _make_va_allocator()
|
va_alloc = _make_va_allocator()
|
||||||
mmus = _make_mmus()
|
mmus = _make_mmus()
|
||||||
placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
|
placement = resolve_dp_policy(
|
||||||
|
DPPolicy(cube="replicate", pe="column_wise"),
|
||||||
|
shape=(1024, 512), itemsize=2,
|
||||||
|
num_pe=8, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
|
|
||||||
deploy_tensor(
|
deploy_tensor(
|
||||||
name="W",
|
name="W",
|
||||||
@@ -153,7 +165,7 @@ def test_tensor_va_property():
|
|||||||
|
|
||||||
allocs = _make_allocators(1)
|
allocs = _make_allocators(1)
|
||||||
va_alloc = _make_va_allocator()
|
va_alloc = _make_va_allocator()
|
||||||
placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=4096)]
|
placement = [ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=4096)]
|
||||||
|
|
||||||
t = Tensor(shape=(2048,), dtype="f16", name="test")
|
t = Tensor(shape=(2048,), dtype="f16", name="test")
|
||||||
t._handle = deploy_tensor(
|
t._handle = deploy_tensor(
|
||||||
|
|||||||
+15
-5
@@ -20,7 +20,7 @@ from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
|
|||||||
from kernbench.policy.address.pe_mmu import PeMMU
|
from kernbench.policy.address.pe_mmu import PeMMU
|
||||||
from kernbench.policy.address.phyaddr import PhysAddr
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
from kernbench.policy.address.va_allocator import VirtualAllocator
|
from kernbench.policy.address.va_allocator import VirtualAllocator
|
||||||
from kernbench.policy.placement.dp import DPPolicy, column_wise
|
from kernbench.policy.placement.dp import DPPolicy, resolve_dp_policy
|
||||||
from kernbench.runtime_api.tensor import deploy_tensor
|
from kernbench.runtime_api.tensor import deploy_tensor
|
||||||
from kernbench.sim_engine.engine import GraphEngine
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
from kernbench.runtime_api.context import RuntimeContext
|
from kernbench.runtime_api.context import RuntimeContext
|
||||||
@@ -70,7 +70,7 @@ def _make_standalone(shape, num_pe=NUM_PE):
|
|||||||
sram_bytes_per_cube=32 * _MB,
|
sram_bytes_per_cube=32 * _MB,
|
||||||
)
|
)
|
||||||
allocators = {
|
allocators = {
|
||||||
i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=cfg)
|
(0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=cfg)
|
||||||
for i in range(num_pe)
|
for i in range(num_pe)
|
||||||
}
|
}
|
||||||
va_alloc = VirtualAllocator(va_base=0x1_0000_0000, va_size=64 * _GB, page_size=4096)
|
va_alloc = VirtualAllocator(va_base=0x1_0000_0000, va_size=64 * _GB, page_size=4096)
|
||||||
@@ -110,7 +110,11 @@ def test_2d_va_translates_to_local_hbm():
|
|||||||
cols_per_pe = K // NUM_PE
|
cols_per_pe = K // NUM_PE
|
||||||
block_bytes = M * cols_per_pe * ELEM_BYTES
|
block_bytes = M * cols_per_pe * ELEM_BYTES
|
||||||
|
|
||||||
placement = column_wise(shape=(M, K), itemsize=ELEM_BYTES, num_pe=NUM_PE)
|
placement = resolve_dp_policy(
|
||||||
|
DPPolicy(cube="replicate", pe="column_wise"),
|
||||||
|
shape=(M, K), itemsize=ELEM_BYTES,
|
||||||
|
num_pe=NUM_PE, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
handle = deploy_tensor(
|
handle = deploy_tensor(
|
||||||
name="src", shape=(M, K), dtype="fp16",
|
name="src", shape=(M, K), dtype="fp16",
|
||||||
placement=placement, allocators=allocators, va_allocator=va_alloc,
|
placement=placement, allocators=allocators, va_allocator=va_alloc,
|
||||||
@@ -178,7 +182,11 @@ def test_1d_va_translates_to_local_hbm():
|
|||||||
elems_per_pe = N_1D // NUM_PE
|
elems_per_pe = N_1D // NUM_PE
|
||||||
block_bytes = elems_per_pe * ELEM_BYTES
|
block_bytes = elems_per_pe * ELEM_BYTES
|
||||||
|
|
||||||
placement = column_wise(shape=(1, N_1D), itemsize=ELEM_BYTES, num_pe=NUM_PE)
|
placement = resolve_dp_policy(
|
||||||
|
DPPolicy(cube="replicate", pe="column_wise"),
|
||||||
|
shape=(1, N_1D), itemsize=ELEM_BYTES,
|
||||||
|
num_pe=NUM_PE, num_cubes=1, target_sip=0,
|
||||||
|
)
|
||||||
handle = deploy_tensor(
|
handle = deploy_tensor(
|
||||||
name="src_1d", shape=(N_1D,), dtype="fp16",
|
name="src_1d", shape=(N_1D,), dtype="fp16",
|
||||||
placement=placement, allocators=allocators, va_allocator=va_alloc,
|
placement=placement, allocators=allocators, va_allocator=va_alloc,
|
||||||
@@ -207,7 +215,9 @@ def test_1d_e2e_completes():
|
|||||||
correlation_id="vo6", spec=graph.spec,
|
correlation_id="vo6", spec=graph.spec,
|
||||||
)
|
)
|
||||||
|
|
||||||
dp = DPPolicy(sip="column_wise", cube="column_wise", pe="column_wise")
|
# ADR-0026: DPPolicy is intra-device only; SIP scoping comes from the
|
||||||
|
# RuntimeContext's target_device. This 1D e2e runs on a single SIP.
|
||||||
|
dp = DPPolicy(cube="column_wise", pe="column_wise")
|
||||||
src = ctx.zeros((N_1D,), dtype=DTYPE, dp=dp, name="src_1d")
|
src = ctx.zeros((N_1D,), dtype=DTYPE, dp=dp, name="src_1d")
|
||||||
dst = ctx.empty((N_1D,), dtype=DTYPE, dp=dp, name="dst_1d")
|
dst = ctx.empty((N_1D,), dtype=DTYPE, dp=dp, name="dst_1d")
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ system:
|
|||||||
|
|
||||||
sips:
|
sips:
|
||||||
count: 2
|
count: 2
|
||||||
|
topology: ring_1d
|
||||||
|
|
||||||
components:
|
components:
|
||||||
switch: { kind: switch, impl: builtin.switch, attrs: { overhead_ns: 5.0 } }
|
switch: { kind: switch, impl: builtin.switch, attrs: { overhead_ns: 5.0 } }
|
||||||
|
|||||||
Reference in New Issue
Block a user