Files
kernbench2/tests/test_adr0026_dppolicy_intra_device.py
T
ywkang 357cab525b ADR-0026: DPPolicy intra-device only + ShardSpec structural coords
DPPolicy no longer carries a cross-SIP axis. SIP-level placement is
solely controlled by torch.ahbm.set_device(rank) (ADR-0024); DPPolicy
itself describes only the cube × PE layout within one SIP. ShardSpec
switches to structural (sip, cube, pe) coordinates; the flat pe_index
field/property is fully removed — silent drift between global-flat and
SIP-local interpretations was a foot-gun flagged by ADR-0024 D11.

Breaking API (explicit TypeError / AttributeError):
- DPPolicy(sip=...) / DPPolicy(num_sips=...) -> TypeError
- ShardSpec.pe_index -> AttributeError
- ShardSpec(pe_index=...) -> TypeError
- resolve_dp_policy now takes target_sip= (required), no num_sips.

Downstream migration:
- PE allocator dict keyed by (sip, cube, pe) tuples, in both
  _ensure_allocators and _free_tensor. deploy_tensor uses tuple lookup.
- _create_tensor passes target_sip=current_sip; post-hoc pe_index
  shifting removed entirely.
- launch._compute_local_shape drops the dp.sip branch.
- Internal resolvers (column_wise / row_wise / replicate / tiled_*)
  return _LocalPeShard (cube-local identifier) instead of ShardSpec —
  resolve_dp_policy lifts them to full structural coords.

Tests:
- New tests/test_adr0026_dppolicy_intra_device.py (12 tests) pins the
  contract end-to-end.
- test_sip_parallel.py rewritten: SIP composition now modeled as two
  resolve_dp_policy(target_sip=...) calls (ADR-0024 launcher style).
- Call-site migration: test_tensor, test_va_integration, test_va_offset,
  test_runtime_api_tensor, test_tl_recv_async, test_ccl_* and benches
  gemm_single_pe, gpt3_qkv, va_offset_verify, ccl_allreduce (legacy
  branch) all use intra-device DPPolicy and structural ShardSpec.

Result: 523 passed, 1 strict xfail (ring_default_ws — unchanged
ADR-0024 Phase B blocker; architectural fix deferred to ADR-0027).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 13:02:19 -07:00

240 lines
8.6 KiB
Python

"""ADR-0026 Phase 1 tests: DPPolicy intra-device only + ShardSpec structural.
These tests encode the contract from ADR-0026:
- DPPolicy no longer accepts ``sip`` or ``num_sips`` kwargs (TypeError).
- ShardSpec carries structural ``(sip, cube, pe)`` coordinates; the old flat
``pe_index`` field/property is fully removed (AttributeError).
- ``resolve_dp_policy(..., target_sip=N)`` stamps every returned ShardSpec
with ``sip=N``; cube and pe fields are local.
- ``RuntimeContext._allocators`` is keyed by ``(sip, cube, pe)`` tuples.
These tests were written in Phase 1, before the production change, and were
expected to fail until the Phase 2 diff landed. Phase 2 makes all of them pass.
"""
from __future__ import annotations
import pytest
from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
from kernbench.runtime_api.tensor import deploy_tensor
# ── D1: DPPolicy no longer accepts sip / num_sips ─────────────────────
def test_dppolicy_rejects_sip_kwarg():
    """Passing the removed ``sip`` keyword to DPPolicy raises TypeError."""
    # Keyword set includes the removed ``sip`` field alongside valid ones.
    legacy_kwargs = {
        "sip": "column_wise",
        "cube": "replicate",
        "pe": "replicate",
    }
    with pytest.raises(TypeError):
        DPPolicy(**legacy_kwargs)
def test_dppolicy_rejects_num_sips_kwarg():
    """Passing the removed ``num_sips`` keyword to DPPolicy raises TypeError."""
    # Valid intra-device fields plus the removed ``num_sips`` count.
    legacy_kwargs = {
        "cube": "replicate",
        "pe": "replicate",
        "num_sips": 2,
    }
    with pytest.raises(TypeError):
        DPPolicy(**legacy_kwargs)
def test_dppolicy_accepts_only_intra_device_fields():
    """DPPolicy keeps cube / pe / num_cubes / num_pes and exposes no SIP fields."""
    accepted = {
        "cube": "column_wise",
        "pe": "column_wise",
        "num_cubes": 2,
        "num_pes": 4,
    }
    dp = DPPolicy(**accepted)

    # Each accepted field must read back exactly as it was passed in.
    for field_name, expected in accepted.items():
        assert getattr(dp, field_name) == expected

    # The SIP-level attributes must not be readable at all.
    for removed in ("sip", "num_sips"):
        assert not hasattr(dp, removed), f"DPPolicy.{removed} must be removed"
# ── D2: ShardSpec structural coords, no pe_index ──────────────────────
def test_shardspec_has_structural_coords():
    """ShardSpec is built from sip / cube / pe / offset_bytes / nbytes."""
    given = {
        "sip": 1,
        "cube": 2,
        "pe": 3,
        "offset_bytes": 128,
        "nbytes": 64,
    }
    spec = ShardSpec(**given)

    # Every constructor argument must be readable back under the same name.
    observed = {name: getattr(spec, name) for name in given}
    assert observed == given
def test_shardspec_has_no_pe_index_attr():
    """A ShardSpec instance exposes no ``pe_index`` field or property."""
    spec = ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=8)
    # hasattr() is False exactly when reading the attribute raises
    # AttributeError; any other exception still propagates and fails the test.
    assert not hasattr(spec, "pe_index")
def test_shardspec_rejects_pe_index_kwarg():
    """Constructing ShardSpec with the flat ``pe_index`` keyword raises TypeError."""
    # Old-style constructor arguments using the removed flat index.
    legacy_kwargs = {"pe_index": 0, "offset_bytes": 0, "nbytes": 8}
    with pytest.raises(TypeError):
        ShardSpec(**legacy_kwargs)
# ── D3: resolve_dp_policy(target_sip=...) structural semantics ────────
def test_resolve_dp_policy_target_sip_stamps_shards():
    """Every resolved shard carries ``sip == target_sip`` with local cube/pe."""
    cube_count, pe_count, sip_id = 2, 4, 1
    policy = DPPolicy(cube="column_wise", pe="column_wise")

    shards = resolve_dp_policy(
        policy,
        shape=(4, 32),
        itemsize=2,
        num_pe=pe_count,
        num_cubes=cube_count,
        target_sip=sip_id,
    )

    # One shard per (cube, pe) pair on the target SIP.
    assert len(shards) == cube_count * pe_count
    for shard in shards:
        assert shard.sip == sip_id
        # cube / pe must lie inside the per-SIP ranges requested above.
        assert 0 <= shard.cube < cube_count
        assert 0 <= shard.pe < pe_count
def test_resolve_dp_policy_target_sip_differ_only_in_sip():
    """Same policy + dims on two SIPs → shards identical except ``.sip``.

    The policy is resolved twice with identical arguments apart from
    ``target_sip`` (0 and 1). Shards at the same position must agree on
    cube, pe, offset_bytes and nbytes, and differ only in ``sip``.
    """
    dp = DPPolicy(cube="replicate", pe="column_wise")
    shards_0 = resolve_dp_policy(
        dp, shape=(4, 32), itemsize=2,
        num_pe=4, num_cubes=1, target_sip=0,
    )
    shards_1 = resolve_dp_policy(
        dp, shape=(4, 32), itemsize=2,
        num_pe=4, num_cubes=1, target_sip=1,
    )
    # Two empty results would have equal lengths and skip the loop below,
    # so the test would pass without checking anything.
    assert len(shards_0) > 0, "resolve_dp_policy returned no shards"
    assert len(shards_0) == len(shards_1)
    for a, b in zip(shards_0, shards_1):
        assert a.sip == 0 and b.sip == 1
        assert a.cube == b.cube
        assert a.pe == b.pe
        assert a.offset_bytes == b.offset_bytes
        assert a.nbytes == b.nbytes
def test_resolve_dp_policy_no_num_sips_param():
    """resolve_dp_policy must reject the legacy ``num_sips`` keyword.

    The post-Phase-2 signature drops ``num_sips`` (DPPolicy no longer crosses
    SIP boundaries) and adds a required ``target_sip``. A call that still
    passes ``num_sips=...`` must raise TypeError mentioning that keyword.
    """
    policy = DPPolicy(cube="replicate", pe="replicate")
    # Legacy-style arguments: ``num_sips`` is supplied, ``target_sip`` is not.
    legacy_call = {
        "shape": (4, 8),
        "itemsize": 2,
        "num_pe": 1,
        "num_cubes": 1,
        "num_sips": 2,
    }
    # match= requires the error message to mention the removed keyword.
    with pytest.raises(TypeError, match="num_sips"):
        resolve_dp_policy(policy, **legacy_call)
# ── D5: Allocator dict keyed by (sip, cube, pe) tuples ────────────────
# Byte-size helpers: 2**20 and 2**30 respectively.
_MB = 1024 * 1024
_GB = 1024 * _MB

# Address configuration shared by the tuple-keyed allocator tests below.
_CFG = AddressConfig(
    sip_count=2,
    cubes_per_sip=2,
    pes_per_cube=4,
    hbm_bytes_per_cube=_GB,
    hbm_slices_per_cube=4,
    tcm_bytes_per_pe=_MB,
    tcm_scheduler_reserved_bytes=0,
    sram_bytes_per_cube=_MB,
)
def _make_tuple_allocators(
    num_sips: int = 1, num_cubes: int = 1, num_pe: int = 4,
) -> dict[tuple[int, int, int], PEMemAllocator]:
    """Build one PEMemAllocator per ``(sip, cube, pe)`` coordinate.

    Returns a dict keyed by the coordinate tuple, covering every combination
    in ``range(num_sips) x range(num_cubes) x range(num_pe)``. All allocators
    use ``rack_id=0`` and the module-level ``_CFG``.
    """
    table: dict[tuple[int, int, int], PEMemAllocator] = {}
    for sip_id in range(num_sips):
        for cube_id in range(num_cubes):
            for pe_id in range(num_pe):
                table[(sip_id, cube_id, pe_id)] = PEMemAllocator(
                    rack_id=0,
                    sip_id=sip_id,
                    cube_id=cube_id,
                    pe_id=pe_id,
                    cfg=_CFG,
                )
    return table
def test_deploy_tensor_uses_tuple_lookup():
    """deploy_tensor accepts an allocator dict keyed by ``(sip, cube, pe)``.

    A placement resolved for SIP 0 (one cube, four PEs) is deployed against
    tuple-keyed allocators. Each resulting shard must report the same
    structural coordinates as the ShardSpec at the same position.
    """
    dp = DPPolicy(cube="replicate", pe="column_wise")
    placement = resolve_dp_policy(
        dp, shape=(4, 16), itemsize=2,
        num_pe=4, num_cubes=1, target_sip=0,
    )
    allocators = _make_tuple_allocators(num_sips=1, num_cubes=1, num_pe=4)
    handle = deploy_tensor(
        name="t", shape=(4, 16), dtype="f16",
        placement=placement, allocators=allocators,
    )
    assert len(handle.shards) == 4
    # zip() stops at the shorter input, so pin the lengths first; otherwise
    # any surplus ShardSpec would be skipped without a coordinate check.
    assert len(placement) == len(handle.shards)
    # Each TensorShard's coordinates must match its ShardSpec (sip, cube, pe).
    for spec, shard in zip(placement, handle.shards):
        assert shard.sip == spec.sip
        assert shard.cube == spec.cube
        assert shard.pe == spec.pe
def test_runtime_context_allocator_keys_are_tuples(topology):
    """After a tensor op on a context, ``ctx._allocators`` has 3-tuple keys.

    Checks that the D5 migration (allocator population + lookup) landed:
    creating a tensor must populate the allocator dict, and every key must
    be a ``(sip, cube, pe)`` tuple.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    topo = topology.topology_obj
    ctx = RuntimeContext(
        engine=GraphEngine(topo, enable_data=True),
        target_device=DeviceSelector("sip:0"),
        correlation_id="test_adr0026_tuple_keys",
        spec=topo.spec,
    )

    # Create a tensor so the context populates its allocator dict.
    policy = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
    ctx.zeros((1, 16), dtype="f16", dp=policy)

    assert ctx._allocators, "allocators dict should be populated"
    keys = list(ctx._allocators)
    malformed = [k for k in keys if not (isinstance(k, tuple) and len(k) == 3)]
    assert not malformed, (
        f"_allocators keys must be (sip, cube, pe) tuples; got {keys[:5]}"
    )
# ── D4 (via regression): no SIP-crossing tensor without set_device ────
def test_create_tensor_on_target_sip_via_set_device(topology):
    """A tensor created after ``ctx.ahbm.set_device(1)`` lands on SIP 1.

    With DPPolicy(cube=replicate, pe=replicate), every shard of the created
    tensor must carry ``sip == 1`` structurally (no post-hoc shifting needed).
    The test is skipped when the topology spec reports fewer than two SIPs.
    """
    from kernbench.runtime_api.context import RuntimeContext
    from kernbench.runtime_api.types import DeviceSelector
    from kernbench.sim_engine.engine import GraphEngine

    # Skip the test if topology has only 1 SIP (nothing to verify).
    # A missing "system" / "sips" / "count" entry defaults to 1.
    n_sips = int(
        topology.topology_obj.spec.get("system", {})
        .get("sips", {}).get("count", 1)
    )
    if n_sips < 2:
        pytest.skip("topology has <2 SIPs; set_device(1) not meaningful")

    engine = GraphEngine(topology.topology_obj, enable_data=True)
    ctx = RuntimeContext(
        engine=engine,
        target_device=DeviceSelector("sip:1"),
        correlation_id="test_adr0026_set_device",
        spec=topology.topology_obj.spec,
    )
    ctx.ahbm.set_device(1)

    dp = DPPolicy(cube="replicate", pe="replicate", num_cubes=1, num_pes=1)
    t = ctx.zeros((1, 16), dtype="f16", dp=dp)
    assert t._handle is not None
    shards = t._handle.shards
    # all() over an empty sequence is True, so make sure there is at least
    # one shard before checking where the shards landed.
    assert len(shards) > 0, "tensor handle has no shards"
    assert all(s.sip == 1 for s in shards), (
        f"expected all shards on SIP 1; got {[s.sip for s in t._handle.shards]}"
    )