357cab525b
DPPolicy no longer carries a cross-SIP axis. SIP-level placement is solely controlled by torch.ahbm.set_device(rank) (ADR-0024); DPPolicy itself describes only the cube × PE layout within one SIP. ShardSpec switches to structural (sip, cube, pe) coordinates; the flat pe_index field/property is fully removed — silent drift between global-flat and SIP-local interpretations was a foot-gun flagged by ADR-0024 D11. Breaking API (explicit TypeError / AttributeError): - DPPolicy(sip=...) / DPPolicy(num_sips=...) -> TypeError - ShardSpec.pe_index -> AttributeError - ShardSpec(pe_index=...) -> TypeError - resolve_dp_policy now takes target_sip= (required), no num_sips. Downstream migration: - PE allocator dict keyed by (sip, cube, pe) tuples, in both _ensure_allocators and _free_tensor. deploy_tensor uses tuple lookup. - _create_tensor passes target_sip=current_sip; post-hoc pe_index shifting removed entirely. - launch._compute_local_shape drops the dp.sip branch. - Internal resolvers (column_wise / row_wise / replicate / tiled_*) return _LocalPeShard (cube-local identifier) instead of ShardSpec — resolve_dp_policy lifts them to full structural coords. Tests: - New tests/test_adr0026_dppolicy_intra_device.py (12 tests) pins the contract end-to-end. - test_sip_parallel.py rewritten: SIP composition now modeled as two resolve_dp_policy(target_sip=...) calls (ADR-0024 launcher style). - Call-site migration: test_tensor, test_va_integration, test_va_offset, test_runtime_api_tensor, test_tl_recv_async, test_ccl_* and benches gemm_single_pe, gpt3_qkv, va_offset_verify, ccl_allreduce (legacy branch) all use intra-device DPPolicy and structural ShardSpec. Result: 523 passed, 1 strict xfail (ring_default_ws — unchanged ADR-0024 Phase B blocker; architectural fix deferred to ADR-0027). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
419 lines
15 KiB
Python
419 lines
15 KiB
Python
from __future__ import annotations
|
|
|
|
import math
|
|
import weakref
|
|
from dataclasses import dataclass
|
|
from typing import Literal
|
|
|
|
import numpy as np
|
|
|
|
from kernbench.policy.address.allocator import PEMemAllocator
|
|
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
|
|
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
|
|
|
|
|
|
@dataclass(frozen=True)
class TensorShard:
    """One deployed piece of a tensor, pinned to a single PE.

    Instances are immutable. ``deploy_tensor`` builds one per placement
    entry and stores them on the resulting ``TensorHandle``.
    """

    # Structural coordinates of the PE that owns this shard.
    sip: int
    cube: int
    pe: int
    # Encoded physical address returned by the PE allocator.
    pa: int
    # Size of the shard in bytes.
    nbytes: int
    # Byte offset of this shard within the flat layout of the full tensor.
    offset_bytes: int
|
|
|
|
|
|
@dataclass(frozen=True)
class TensorHandle:
    """Immutable description of a deployed tensor and all of its shards."""

    name: str
    shape: tuple[int, ...]
    dtype: str
    # Size of one element in bytes.
    itemsize: int
    shards: tuple[TensorShard, ...]
    va_base: int = 0  # VA base address for the entire tensor

    @property
    def nbytes(self) -> int:
        """Total size of the tensor in bytes (element count x itemsize)."""
        element_count = 1
        for extent in self.shape:
            element_count *= extent
        return element_count * self.itemsize
|
|
|
|
|
|
# Element size in bytes for every dtype spelling accepted by this module.
_DTYPE_ITEMSIZE = {
    "fp16": 2,
    "float16": 2,
    "f16": 2,
    "fp32": 4,
    "float32": 4,
    "f32": 4,
    "bf16": 2,
    "int8": 1,
    "i8": 1,
    "int16": 2,
    "i16": 2,
    "int32": 4,
    "i32": 4,
}


def dtype_itemsize(dtype: str) -> int:
    """Return the size in bytes of one element of ``dtype``.

    Raises:
        ValueError: if ``dtype`` is not one of the supported spellings.
    """
    size = _DTYPE_ITEMSIZE.get(dtype)
    if size is None:
        raise ValueError(f"unsupported dtype: {dtype}")
    return size
|
|
|
|
|
|
# NumPy scalar type used to hold each dtype spelling on the host.
# Note that "bf16" is stored as float16 here.
_NUMPY_DTYPE = {
    "f16": np.float16,
    "fp16": np.float16,
    "float16": np.float16,
    "f32": np.float32,
    "fp32": np.float32,
    "float32": np.float32,
    "bf16": np.float16,
    "i8": np.int8,
    "int8": np.int8,
    "i16": np.int16,
    "int16": np.int16,
    "i32": np.int32,
    "int32": np.int32,
}


def _numpy_dtype(dtype: str) -> np.dtype:
    """Return the ``np.dtype`` used for host-side arrays of ``dtype``.

    Unknown dtype strings do not raise; they fall back to ``np.float16``.
    """
    try:
        scalar_type = _NUMPY_DTYPE[dtype]
    except KeyError:
        scalar_type = np.float16
    return np.dtype(scalar_type)
|
|
|
|
|
|
def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
    allocators: dict[tuple[int, int, int], PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
    va_allocator=None,
) -> TensorHandle:
    """Allocate memory for every shard of a tensor and return its handle.

    Args:
        name: Tensor name recorded on the handle.
        shape: Full (unsharded) tensor shape.
        dtype: Dtype string; must be accepted by ``dtype_itemsize``.
        placement: One ``ShardSpec`` per shard to allocate.
        allocators: PE allocators keyed by ``(sip, cube, pe)``.
        mem_kind: ``"hbm"`` or ``"tcm"`` — which allocator pool to draw from.
        va_allocator: Optional VA allocator. When given, a VA range covering
            the whole tensor is reserved and stored as ``va_base``;
            otherwise ``va_base`` is 0.

    Returns:
        A ``TensorHandle`` whose shards follow the order of ``placement``.

    Raises:
        ValueError: if ``mem_kind`` or ``dtype`` is not supported.
        KeyError: if ``allocators`` has no entry for a shard's coordinates.
    """
    # Reject unknown memory kinds up front. Previously any value other than
    # "hbm" silently allocated from TCM, and the check must run before any
    # VA / PE memory is reserved so a bad call leaves no allocations behind.
    if mem_kind not in ("hbm", "tcm"):
        raise ValueError(
            f"unsupported mem_kind: {mem_kind!r} (expected 'hbm' or 'tcm')"
        )

    isize = dtype_itemsize(dtype)
    total_nbytes = math.prod(shape) * isize

    # Allocate VA range for the entire tensor (if VA allocator provided)
    va_base = 0
    if va_allocator is not None:
        va_base = va_allocator.alloc(total_nbytes)

    shards: list[TensorShard] = []
    for spec in placement:
        alloc = allocators[(spec.sip, spec.cube, spec.pe)]
        if mem_kind == "hbm":
            pa = alloc.alloc_hbm(spec.nbytes)
        else:
            pa = alloc.alloc_tcm(spec.nbytes)
        shards.append(TensorShard(
            sip=spec.sip,
            cube=spec.cube,
            pe=spec.pe,
            pa=pa.encode(),
            nbytes=spec.nbytes,
            offset_bytes=spec.offset_bytes,
        ))

    return TensorHandle(
        name=name,
        shape=shape,
        dtype=dtype,
        itemsize=isize,
        shards=tuple(shards),
        va_base=va_base,
    )
|
|
|
|
|
|
# ── PyTorch-like Tensor API ──────────────────────────────────────────
|
|
|
|
|
|
@dataclass(frozen=True)
class DPMetadata:
    """Data-parallel placement record kept on a tensor as ``Tensor._dp_metadata``."""

    # Shard specs describing where each piece of the tensor goes.
    placement: list[ShardSpec]
    # Policy attached alongside the placement, if any.
    dp_policy: DPPolicy | None = None
    sip: int = 0
    cube: int = 0
    # int → single PE, tuple → specific PEs, "all" → all PEs
    target_pe: int | tuple[int, ...] | str = 0
|
|
|
|
|
|
class Tensor:
    """PyTorch-like tensor for benchmark code.

    Usage::

        a = ctx.zeros((M, K), dtype="f16", dp=DPPolicy(cube="replicate", pe="replicate"))
        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
    """

    def __init__(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        name: str = "",
    ) -> None:
        """Create an undeployed tensor description.

        Args:
            shape: Full tensor shape.
            dtype: Dtype string (see ``dtype_itemsize`` for accepted values).
            name: Name used in error messages and ``repr``.
        """
        self.shape = shape
        self.dtype = dtype
        self.name = name
        self._dp_metadata: DPMetadata | None = None
        self._handle: TensorHandle | None = None
        self._ctx_ref: weakref.ref | None = None  # set by RuntimeContext
        self._memory_store = None  # set by RuntimeContext when enable_data=True
        # Host-side staging buffer for torch.from_numpy() results. A tensor
        # with a non-None _host_buffer is NOT deployed to any PE — it lives
        # only on the host. Use `target.copy_(host_tensor)` to scatter the
        # data into a deployed, sharded target tensor.
        self._host_buffer: np.ndarray | None = None

    def __del__(self) -> None:
        """Release the tensor's allocations through its owning context, if any."""
        # Use getattr: __del__ also runs on instances whose __init__ never
        # completed (or that were built with __new__), where these attributes
        # do not exist.
        ctx_ref = getattr(self, "_ctx_ref", None)
        if ctx_ref is None or getattr(self, "_handle", None) is None:
            return
        ctx = ctx_ref()
        if ctx is not None:
            ctx._free_tensor(self)

    # ── Indexing (shard-aligned slices) ────────────────────────────

    def _checked_index(self, idx: int, axis: int) -> int:
        """Normalize a possibly negative integer index along ``axis``.

        Raises:
            IndexError: if ``idx`` is outside ``[-extent, extent)``.
        """
        extent = self.shape[axis]
        normalized = idx + extent if idx < 0 else idx
        if not 0 <= normalized < extent:
            raise IndexError(
                f"index {idx} is out of bounds for axis {axis} with size {extent}"
            )
        return normalized

    def _resolve_shard_index(self, key) -> tuple[int, int]:
        """Map a numpy-style index key to (flat_start_elem, flat_stop_elem).

        Only integer indices on the leading dimensions combined with an
        integer or a unit-step slice on the last dimension are supported.
        Returns (start, stop) in element units from the flat row-major
        layout of the full tensor.

        Raises:
            RuntimeError: if the tensor is not deployed.
            IndexError: if the number of indices does not match the tensor
                rank, or an integer index is out of bounds.
            NotImplementedError: for non-integer leading indices, slices
                with ``step != 1`` or any other index type.
        """
        if self._handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed")
        ndim = len(self.shape)
        if not isinstance(key, tuple):
            key = (key,)
        if len(key) != ndim:
            raise IndexError(
                f"expected {ndim} indices, got {len(key)}"
            )
        if ndim == 0:
            raise IndexError("cannot index a 0-dimensional tensor")

        # All leading dims must be int (selecting a single row/plane).
        # Fold them into one row number (Horner form of the row-major
        # index) so the returned range is an offset into the flat layout.
        row = 0
        for axis, k in enumerate(key[:-1]):
            if not isinstance(k, int):
                raise NotImplementedError(
                    "only integer indices are supported for leading dims"
                )
            row = row * self.shape[axis] + self._checked_index(k, axis)
        base = row * self.shape[-1]

        last = key[-1]
        if isinstance(last, int):
            # Single element
            col = self._checked_index(last, ndim - 1)
            return (base + col, base + col + 1)
        if isinstance(last, slice):
            start, stop, step = last.indices(self.shape[-1])
            if step != 1:
                raise NotImplementedError("step != 1 not supported")
            # A reversed slice (e.g. 5:2) selects nothing; clamp so callers
            # never see a negative length.
            stop = max(start, stop)
            return (base + start, base + stop)
        raise NotImplementedError(f"unsupported index type: {type(last)}")

    def _shard_for_range(self, start_elem: int, stop_elem: int) -> TensorShard:
        """Return the single shard that fully covers [start_elem, stop_elem).

        Raises NotImplementedError if the range spans multiple shards.
        """
        isize = self.itemsize
        start_byte = start_elem * isize
        stop_byte = stop_elem * isize
        for shard in self._handle.shards:
            s_start = shard.offset_bytes
            s_end = shard.offset_bytes + shard.nbytes
            if start_byte >= s_start and stop_byte <= s_end:
                return shard
        raise NotImplementedError(
            f"slice [{start_elem}:{stop_elem}] spans multiple shards "
            f"(only shard-aligned slices are supported)"
        )

    def __getitem__(self, key):
        """Read a shard-aligned slice. Returns a numpy array.

        Mirrors ``torch.Tensor.__getitem__`` for the shard-aligned case.
        Returns zeros when no memory store is attached or when the shard
        has never been written.
        """
        start, stop = self._resolve_shard_index(key)
        shard = self._shard_for_range(start, stop)
        if self._memory_store is None:
            return np.zeros(stop - start, dtype=_numpy_dtype(self.dtype))
        isize = self.itemsize
        # Element offset of the requested range inside the owning shard.
        local_start = (start * isize - shard.offset_bytes) // isize
        local_count = stop - start
        try:
            arr = self._memory_store.read(
                "hbm", self._shard_store_addr(shard),
            )
        except KeyError:
            # Shard was never written: behave like an all-zero tensor.
            return np.zeros(local_count, dtype=_numpy_dtype(self.dtype))
        flat = np.asarray(arr, dtype=_numpy_dtype(self.dtype)).reshape(-1)
        return flat[local_start : local_start + local_count]

    def __setitem__(self, key, value):
        """Write a shard-aligned slice.

        Mirrors ``torch.Tensor.__setitem__``. Scalar broadcast and
        numpy array assignment are both supported.

        Raises:
            RuntimeError: if the tensor is not deployed or has no memory
                store attached.
        """
        if self._handle is None or self._memory_store is None:
            raise RuntimeError(
                f"Tensor '{self.name}' must be deployed before assignment"
            )
        start, stop = self._resolve_shard_index(key)
        shard = self._shard_for_range(start, stop)
        np_dtype = _numpy_dtype(self.dtype)
        isize = self.itemsize
        local_start = (start * isize - shard.offset_bytes) // isize
        local_count = stop - start
        shard_elems = shard.nbytes // isize
        addr = self._shard_store_addr(shard)

        # Read current shard data (or zeros if uninitialized)
        try:
            arr = self._memory_store.read("hbm", addr)
            arr = np.array(arr, dtype=np_dtype).reshape(-1).copy()
        except KeyError:
            arr = np.zeros(shard_elems, dtype=np_dtype)

        # Write the slice
        if isinstance(value, (int, float)):
            arr[local_start : local_start + local_count] = np_dtype.type(value)
        else:
            v = np.asarray(value, dtype=np_dtype).reshape(-1)
            arr[local_start : local_start + local_count] = v[:local_count]

        # The whole shard is written back, not just the modified range.
        self._memory_store.write("hbm", addr, arr)

    def __repr__(self) -> str:
        """Summarize the tensor; includes mean/norm when data is available."""
        parts = [f"tensor(name={self.name}, shape={self.shape}, dtype={self.dtype}"]
        if self._memory_store is not None and self._handle is not None:
            arr = self.data
            parts.append(f", mean={float(arr.mean()):.4g}, norm={float(np.linalg.norm(arr)):.4g}")
        else:
            parts.append(", data=N/A (placeholder)")
        parts.append(")")
        return "".join(parts)

    @property
    def data(self) -> np.ndarray:
        """Tensor data as numpy array.

        Gathers all shards into a single full-shape array. Returns actual
        values when enable_data=True, zeros placeholder otherwise (like an
        uninitialized tensor). Alias of ``numpy()``.
        """
        return self.numpy()

    def _shard_store_addr(self, shard: TensorShard) -> int:
        """MemoryStore key for a shard.

        Kernels read tensors via VA (translated to PA by PE_DMA's MMU when
        a mapping exists, otherwise the addr is treated as a PA-equivalent
        key). Tensor I/O therefore writes/reads at ``va_base + offset_bytes``
        when ``va_base`` is set, falling back to ``shard.pa`` for the
        VA-less mode used by some legacy paths.
        """
        # A va_base of 0 is treated the same as "no VA assigned".
        if self._handle and self._handle.va_base:
            return self._handle.va_base + shard.offset_bytes
        return shard.pa

    def numpy(self) -> np.ndarray:
        """Return a single numpy array gathered from all shards.

        Mirrors ``torch.Tensor.numpy()``. In kernbench, sharded tensors are
        gathered into a single full-shape ndarray according to each shard's
        ``offset_bytes`` / ``nbytes`` range. Shards that were never written
        stay zero.
        """
        np_dtype = _numpy_dtype(self.dtype)
        # Host-side tensor (created via torch.from_numpy) has no shards.
        if self._host_buffer is not None:
            return self._host_buffer.copy()
        if self._handle is None or self._memory_store is None:
            return np.zeros(self.shape, dtype=np_dtype)
        flat = np.zeros(math.prod(self.shape), dtype=np_dtype)
        for shard in self._handle.shards:
            start = shard.offset_bytes // self.itemsize
            count = shard.nbytes // self.itemsize
            try:
                piece = self._memory_store.read(
                    "hbm", self._shard_store_addr(shard),
                )
            except KeyError:
                continue
            flat[start : start + count] = (
                np.asarray(piece, dtype=np_dtype).reshape(-1)[:count]
            )
        return flat.reshape(self.shape)

    def copy_(self, source: "Tensor") -> "Tensor":
        """In-place copy from another tensor into self.

        Mirrors ``torch.Tensor.copy_()``. If ``source`` is a host tensor
        (from ``torch.from_numpy``), its ndarray is split across self's
        shards using each shard's byte range. If ``source`` is a deployed
        (sharded) tensor, its contents are gathered first and then
        re-scattered into self's shard layout.

        Shapes must match. Returns self.

        Raises:
            RuntimeError: if self is not deployed or has no memory store.
            ValueError: if the shapes differ.
        """
        if self._handle is None or self._memory_store is None:
            raise RuntimeError(
                f"Tensor '{self.name}' must be deployed before copy_()"
            )
        if source.shape != self.shape:
            raise ValueError(
                f"copy_ shape mismatch: self={self.shape} source={source.shape}"
            )
        np_dtype = _numpy_dtype(self.dtype)
        arr = source.numpy().astype(np_dtype, copy=False)
        flat = np.ascontiguousarray(arr).reshape(-1)
        for shard in self._handle.shards:
            start = shard.offset_bytes // self.itemsize
            count = shard.nbytes // self.itemsize
            # Copy so the store never aliases the source array.
            piece = flat[start : start + count].copy()
            self._memory_store.write(
                "hbm", self._shard_store_addr(shard), piece,
            )
        return self

    @property
    def itemsize(self) -> int:
        """Size in bytes of one element of this tensor's dtype."""
        return dtype_itemsize(self.dtype)

    @property
    def nbytes(self) -> int:
        """Total size of the full tensor in bytes."""
        return math.prod(self.shape) * self.itemsize

    @property
    def pa(self) -> int:
        """Primary PA (first shard). Used as kernel pointer argument."""
        if self._handle is None or not self._handle.shards:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        return self._handle.shards[0].pa

    @property
    def va(self) -> int:
        """VA base address for the entire tensor."""
        if self._handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        return self._handle.va_base

    def to(
        self,
        placement: list[ShardSpec] | None = None,
        *,
        dp_policy: DPPolicy | None = None,
        sip: int = 0,
        cube: int = 0,
        target_pe: int | tuple[int, ...] | str = 0,
    ) -> Tensor:
        """Set DP placement metadata (like torch.Tensor.to()).

        When ``placement`` is omitted, a single shard covering the whole
        tensor is used. Returns self so calls can be chained.
        """
        if placement is None:
            # NOTE(review): the default shard is always placed at
            # (sip=0, cube=0, pe=0), independent of the sip / cube /
            # target_pe arguments — confirm this is intended.
            placement = [ShardSpec(sip=0, cube=0, pe=0,
                                   offset_bytes=0, nbytes=self.nbytes)]
        self._dp_metadata = DPMetadata(
            placement=placement, dp_policy=dp_policy,
            sip=sip, cube=cube, target_pe=target_pe,
        )
        return self

    def to_tensor_arg(self) -> TensorArg:
        """Convert deployed shards to KernelLaunchMsg TensorArg."""
        if self._handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        return TensorArg(
            shards=tuple(
                TensorArgShard(
                    sip=s.sip, cube=s.cube, pe=s.pe,
                    pa=s.pa, nbytes=s.nbytes, offset_bytes=s.offset_bytes,
                )
                for s in self._handle.shards
            ),
            va_base=self._handle.va_base,
        )
|