357cab525b
DPPolicy no longer carries a cross-SIP axis. SIP-level placement is solely controlled by torch.ahbm.set_device(rank) (ADR-0024); DPPolicy itself describes only the cube × PE layout within one SIP. ShardSpec switches to structural (sip, cube, pe) coordinates; the flat pe_index field/property is fully removed — silent drift between global-flat and SIP-local interpretations was a foot-gun flagged by ADR-0024 D11. Breaking API (explicit TypeError / AttributeError): - DPPolicy(sip=...) / DPPolicy(num_sips=...) -> TypeError - ShardSpec.pe_index -> AttributeError - ShardSpec(pe_index=...) -> TypeError - resolve_dp_policy now takes target_sip= (required), no num_sips. Downstream migration: - PE allocator dict keyed by (sip, cube, pe) tuples, in both _ensure_allocators and _free_tensor. deploy_tensor uses tuple lookup. - _create_tensor passes target_sip=current_sip; post-hoc pe_index shifting removed entirely. - launch._compute_local_shape drops the dp.sip branch. - Internal resolvers (column_wise / row_wise / replicate / tiled_*) return _LocalPeShard (cube-local identifier) instead of ShardSpec — resolve_dp_policy lifts them to full structural coords. Tests: - New tests/test_adr0026_dppolicy_intra_device.py (12 tests) pins the contract end-to-end. - test_sip_parallel.py rewritten: SIP composition now modeled as two resolve_dp_policy(target_sip=...) calls (ADR-0024 launcher style). - Call-site migration: test_tensor, test_va_integration, test_va_offset, test_runtime_api_tensor, test_tl_recv_async, test_ccl_* and benches gemm_single_pe, gpt3_qkv, va_offset_verify, ccl_allreduce (legacy branch) all use intra-device DPPolicy and structural ShardSpec. Result: 523 passed, 1 strict xfail (ring_default_ws — unchanged ADR-0024 Phase B blocker; architectural fix deferred to ADR-0027). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
240 lines
8.6 KiB
Python
240 lines
8.6 KiB
Python
"""ADR-0026 Phase 1 tests: DPPolicy intra-device only + ShardSpec structural.
|
|
|
|
These tests encode the contract from ADR-0026:
|
|
|
|
- DPPolicy no longer accepts ``sip`` or ``num_sips`` kwargs (TypeError).
|
|
- ShardSpec carries structural ``(sip, cube, pe)`` coordinates; the old flat
|
|
``pe_index`` field/property is fully removed (AttributeError).
|
|
- ``resolve_dp_policy(..., target_sip=N)`` stamps every returned ShardSpec
|
|
with ``sip=N``; cube and pe fields are local.
|
|
- ``RuntimeContext._allocators`` is keyed by ``(sip, cube, pe)`` tuples.
|
|
|
|
Phase 1: production code is unchanged → these tests SHOULD FAIL until the
|
|
Phase 2 diff lands. Phase 2 makes all of them pass.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
|
|
from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
|
|
from kernbench.runtime_api.tensor import deploy_tensor
|
|
|
|
|
|
# ── D1: DPPolicy no longer accepts sip / num_sips ─────────────────────
|
|
|
|
|
|
def test_dppolicy_rejects_sip_kwarg():
    """Passing the removed ``sip`` keyword to DPPolicy raises TypeError."""
    legacy_kwargs = {
        "sip": "column_wise",
        "cube": "replicate",
        "pe": "replicate",
    }
    with pytest.raises(TypeError):
        DPPolicy(**legacy_kwargs)
|
|
|
|
|
|
def test_dppolicy_rejects_num_sips_kwarg():
    """Passing the removed ``num_sips`` keyword to DPPolicy raises TypeError."""
    legacy_kwargs = {
        "cube": "replicate",
        "pe": "replicate",
        "num_sips": 2,
    }
    with pytest.raises(TypeError):
        DPPolicy(**legacy_kwargs)
|
|
|
|
|
|
def test_dppolicy_accepts_only_intra_device_fields():
    """The cube / pe / num_cubes / num_pes fields round-trip; SIP fields are gone."""
    policy = DPPolicy(
        cube="column_wise",
        pe="column_wise",
        num_cubes=2,
        num_pes=4,
    )

    observed = (policy.cube, policy.pe, policy.num_cubes, policy.num_pes)
    assert observed == ("column_wise", "column_wise", 2, 4)

    # The cross-SIP attributes must not exist at all, not merely be None.
    assert not hasattr(policy, "sip"), "DPPolicy.sip must be removed"
    assert not hasattr(policy, "num_sips"), "DPPolicy.num_sips must be removed"
|
|
|
|
|
|
# ── D2: ShardSpec structural coords, no pe_index ──────────────────────
|
|
|
|
|
|
def test_shardspec_has_structural_coords():
    """A ShardSpec built from (sip, cube, pe, offset_bytes, nbytes) exposes each field."""
    expected = {
        "sip": 1,
        "cube": 2,
        "pe": 3,
        "offset_bytes": 128,
        "nbytes": 64,
    }
    spec = ShardSpec(**expected)
    for field_name, value in expected.items():
        assert getattr(spec, field_name) == value
|
|
|
|
|
|
def test_shardspec_has_no_pe_index_attr():
    """Reading the removed flat ``pe_index`` attribute raises AttributeError."""
    spec = ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=8)
    with pytest.raises(AttributeError):
        getattr(spec, "pe_index")  # noqa: B009
|
|
|
|
|
|
def test_shardspec_rejects_pe_index_kwarg():
    """ShardSpec(pe_index=...) must raise TypeError because of ``pe_index``.

    The structural coordinates (sip, cube, pe) are supplied alongside the
    legacy keyword so the only possible cause of the TypeError is the
    removed ``pe_index`` argument. Without them the call would also fail
    for missing required arguments, and the test would pass even if
    ``pe_index`` were still accepted.
    """
    with pytest.raises(TypeError, match="pe_index"):
        ShardSpec(  # type: ignore[call-arg]
            sip=0, cube=0, pe=0, pe_index=0, offset_bytes=0, nbytes=8,
        )
|
|
|
|
|
|
# ── D3: resolve_dp_policy(target_sip=...) structural semantics ────────
|
|
|
|
|
|
def test_resolve_dp_policy_target_sip_stamps_shards():
    """Every resolved shard carries the requested sip and in-range cube / pe."""
    cube_count, pe_count, wanted_sip = 2, 4, 1
    policy = DPPolicy(cube="column_wise", pe="column_wise")

    resolved = resolve_dp_policy(
        policy,
        shape=(4, 32),
        itemsize=2,
        num_pe=pe_count,
        num_cubes=cube_count,
        target_sip=wanted_sip,
    )

    assert len(resolved) == cube_count * pe_count
    for spec in resolved:
        assert spec.sip == wanted_sip
        assert 0 <= spec.cube < cube_count
        assert 0 <= spec.pe < pe_count
|
|
|
|
|
|
def test_resolve_dp_policy_target_sip_differ_only_in_sip():
    """Resolving the same policy for SIP 0 and SIP 1 differs only in ``.sip``."""
    policy = DPPolicy(cube="replicate", pe="column_wise")

    # Resolve once per SIP with otherwise identical arguments.
    on_sip0, on_sip1 = (
        resolve_dp_policy(
            policy, shape=(4, 32), itemsize=2,
            num_pe=4, num_cubes=1, target_sip=sip,
        )
        for sip in (0, 1)
    )

    assert len(on_sip0) == len(on_sip1)
    for left, right in zip(on_sip0, on_sip1):
        assert (left.sip, right.sip) == (0, 1)
        for field_name in ("cube", "pe", "offset_bytes", "nbytes"):
            assert getattr(left, field_name) == getattr(right, field_name)
|
|
|
|
|
|
def test_resolve_dp_policy_no_num_sips_param():
    """resolve_dp_policy rejects the legacy ``num_sips`` keyword.

    The post-Phase-2 signature drops ``num_sips`` (DPPolicy no longer
    crosses SIP boundaries) and adds a required ``target_sip``. A call
    that still passes ``num_sips=...`` must raise TypeError (unexpected
    keyword argument).
    """
    policy = DPPolicy(cube="replicate", pe="replicate")
    legacy_call = {
        "shape": (4, 8),
        "itemsize": 2,
        "num_pe": 1,
        "num_cubes": 1,
        "num_sips": 2,
    }
    with pytest.raises(TypeError, match="num_sips"):
        resolve_dp_policy(policy, **legacy_call)  # type: ignore[call-arg]
|
|
|
|
|
|
# ── D5: Allocator dict keyed by (sip, cube, pe) tuples ────────────────
|
|
|
|
|
|
# Byte-size units used by the address configuration below.
_MB = 1024 * 1024
_GB = 1024 * _MB

# Shared address configuration for the allocator tests:
# 2 SIPs, 2 cubes per SIP, 4 PEs per cube.
_CFG = AddressConfig(
    sip_count=2,
    cubes_per_sip=2,
    pes_per_cube=4,
    hbm_bytes_per_cube=_GB,
    hbm_slices_per_cube=4,
    tcm_bytes_per_pe=_MB,
    tcm_scheduler_reserved_bytes=0,
    sram_bytes_per_cube=_MB,
)
|
|
|
|
|
|
def _make_tuple_allocators(
    num_sips: int = 1, num_cubes: int = 1, num_pe: int = 4,
) -> dict[tuple[int, int, int], PEMemAllocator]:
    """Build one PEMemAllocator per (sip, cube, pe) coordinate.

    Returns a dict keyed by ``(sip, cube, pe)`` tuples, filled in
    sip-major, then cube, then pe order. Every allocator uses rack 0 and
    the module-level ``_CFG``.
    """
    allocators: dict[tuple[int, int, int], PEMemAllocator] = {}
    for sip in range(num_sips):
        for cube in range(num_cubes):
            for pe in range(num_pe):
                allocators[(sip, cube, pe)] = PEMemAllocator(
                    rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe, cfg=_CFG,
                )
    return allocators
|
|
|
|
|
|
def test_deploy_tensor_uses_tuple_lookup():
    """deploy_tensor works with allocators keyed by (sip, cube, pe) tuples."""
    tensor_shape = (4, 16)
    policy = DPPolicy(cube="replicate", pe="column_wise")
    specs = resolve_dp_policy(
        policy,
        shape=tensor_shape,
        itemsize=2,
        num_pe=4,
        num_cubes=1,
        target_sip=0,
    )

    handle = deploy_tensor(
        name="t",
        shape=tensor_shape,
        dtype="f16",
        placement=specs,
        allocators=_make_tuple_allocators(num_sips=1, num_cubes=1, num_pe=4),
    )

    assert len(handle.shards) == 4
    # Each deployed shard must report the same structural coordinates as
    # the ShardSpec it was placed from.
    for spec, shard in zip(specs, handle.shards):
        assert (shard.sip, shard.cube, shard.pe) == (spec.sip, spec.cube, spec.pe)
|
|
|
|
|
|
def test_runtime_context_allocator_keys_are_tuples(topology):
    """After a tensor op on the context, ``_allocators`` is keyed by 3-tuples.

    Guards the D5 migration: allocator population and lookup both use
    ``(sip, cube, pe)`` keys.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    topo = topology.topology_obj
    ctx = RuntimeContext(
        engine=GraphEngine(topo, enable_data=True),
        target_device=DeviceSelector("sip:0"),
        correlation_id="test_adr0026_tuple_keys",
        spec=topo.spec,
    )

    # Creating any tensor is what populates the allocator dict.
    ctx.zeros(
        (1, 16),
        dtype="f16",
        dp=DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1),
    )

    assert ctx._allocators, "allocators dict should be populated"
    keys = list(ctx._allocators)
    malformed = [k for k in keys if not (isinstance(k, tuple) and len(k) == 3)]
    assert not malformed, (
        f"_allocators keys must be (sip, cube, pe) tuples; got {keys[:5]}"
    )
|
|
|
|
|
|
# ── D4 (via regression): no SIP-crossing tensor without set_device ────
|
|
|
|
|
|
def test_create_tensor_on_target_sip_via_set_device(topology):
    """set_device(1) + an intra-device DPPolicy places every shard on SIP 1.

    torch.ahbm.set_device(1) + DPPolicy(cube=replicate, pe=replicate)
    → all shards land on SIP 1 structurally (no post-hoc shifting needed).

    Skipped when the topology spec reports fewer than two SIPs.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    # Skip the test if topology has only 1 SIP (nothing to verify).
    n_sips = int(
        topology.topology_obj.spec.get("system", {})
        .get("sips", {}).get("count", 1)
    )
    if n_sips < 2:
        pytest.skip("topology has <2 SIPs; set_device(1) not meaningful")

    engine = GraphEngine(topology.topology_obj, enable_data=True)
    ctx = RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("sip:1"),
        correlation_id="test_adr0026_set_device",
        spec=topology.topology_obj.spec,
    )
    ctx.ahbm.set_device(1)
    dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
    t = ctx.zeros((1, 16), dtype="f16", dp=dp)

    assert t._handle is not None
    shards = t._handle.shards
    # Guard against a vacuous pass: all() over an empty shard list is True.
    assert len(shards) > 0, "expected at least one shard to verify SIP placement"
    assert all(s.sip == 1 for s in shards), (
        f"expected all shards on SIP 1; got {[s.sip for s in shards]}"
    )
|