ADR-0026: DPPolicy intra-device only + ShardSpec structural coords

DPPolicy no longer carries a cross-SIP axis. SIP-level placement is solely controlled by torch.ahbm.set_device(rank) (ADR-0024); DPPolicy itself describes only the cube × PE layout within one SIP.

ShardSpec switches to structural (sip, cube, pe) coordinates; the flat pe_index field/property is fully removed — silent drift between global-flat and SIP-local interpretations was a foot-gun flagged by ADR-0024 D11.

Breaking API (explicit TypeError / AttributeError):
- DPPolicy(sip=...) / DPPolicy(num_sips=...) -> TypeError
- ShardSpec.pe_index -> AttributeError
- ShardSpec(pe_index=...) -> TypeError
- resolve_dp_policy now takes target_sip= (required), no num_sips.

Downstream migration:
- PE allocator dict keyed by (sip, cube, pe) tuples, in both _ensure_allocators and _free_tensor. deploy_tensor uses tuple lookup.
- _create_tensor passes target_sip=current_sip; post-hoc pe_index shifting removed entirely.
- launch._compute_local_shape drops the dp.sip branch.
- Internal resolvers (column_wise / row_wise / replicate / tiled_*) return _LocalPeShard (cube-local identifier) instead of ShardSpec — resolve_dp_policy lifts them to full structural coords.

Tests:
- New tests/test_adr0026_dppolicy_intra_device.py (12 tests) pins the contract end-to-end.
- test_sip_parallel.py rewritten: SIP composition now modeled as two resolve_dp_policy(target_sip=...) calls (ADR-0024 launcher style).
- Call-site migration: test_tensor, test_va_integration, test_va_offset, test_runtime_api_tensor, test_tl_recv_async, test_ccl_* and benches gemm_single_pe, gpt3_qkv, va_offset_verify, ccl_allreduce (legacy branch) all use intra-device DPPolicy and structural ShardSpec.

Result: 523 passed, 1 strict xfail (ring_default_ws — unchanged ADR-0024 Phase B blocker; architectural fix deferred to ADR-0027).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 13:02:19 -07:00
parent 787409ced1
commit 357cab525b
20 changed files with 549 additions and 328 deletions
@@ -2,11 +2,13 @@ import pytest

 from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator
 from kernbench.policy.placement.dp import (
+    DPPolicy,
    ShardSpec,
    column_wise,
-    tiled_column_major,
    replicate,
+    resolve_dp_policy,
    row_wise,
+    tiled_column_major,
    tiled_row_major,
 )
 from kernbench.runtime_api.kernel import (
@@ -40,9 +42,9 @@ _CFG = AddressConfig(
 )


-def _make_allocators(num_pe: int = 8) -> dict[int, PEMemAllocator]:
+def _make_allocators(num_pe: int = 8) -> dict[tuple[int, int, int], PEMemAllocator]:
    return {
-        i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
+        (0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
        for i in range(num_pe)
    }

@@ -133,7 +135,7 @@ def test_column_wise_placement():
    assert len(shards) == 8
    expected_nbytes = 1024 * 64 * 2  # 128 KB
    for i, s in enumerate(shards):
-        assert s.pe_index == i
+        assert s.local_pe == i
        assert s.nbytes == expected_nbytes
    # offsets are contiguous
    assert shards[0].offset_bytes == 0
@@ -151,7 +153,7 @@ def test_row_wise_placement():
    assert len(shards) == 8
    expected_nbytes = 128 * 512 * 2  # 128 KB
    for i, s in enumerate(shards):
-        assert s.pe_index == i
+        assert s.local_pe == i
        assert s.nbytes == expected_nbytes
    assert shards[0].offset_bytes == 0
    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
@@ -166,7 +168,7 @@ def test_replicate_placement():
    assert len(shards) == 8
    full_nbytes = 1024 * 512 * 2  # 1 MB
    for i, s in enumerate(shards):
-        assert s.pe_index == i
+        assert s.local_pe == i
        assert s.nbytes == full_nbytes
        assert s.offset_bytes == 0  # each is a full copy

@@ -188,10 +190,10 @@ def test_tiled_column_major():
    # tile (m=0,k=0) → PE0, tile (m=0,k=1) → PE1, ..., (m=0,k=3) → PE3
    # tile (m=1,k=0) → PE4, tile (m=1,k=1) → PE5, ..., (m=1,k=3) → PE7
    # tile (m=2,k=0) → PE0, ...
-    assert shards[0].pe_index == 0
-    assert shards[1].pe_index == 1
-    assert shards[7].pe_index == 7
-    assert shards[8].pe_index == 0  # wraps around
+    assert shards[0].local_pe == 0
+    assert shards[1].local_pe == 1
+    assert shards[7].local_pe == 7
+    assert shards[8].local_pe == 0  # wraps around
    # total coverage
    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2

@@ -212,10 +214,10 @@ def test_tiled_row_major():
    # tile (m=0,k=0) → PE0, tile (m=1,k=0) → PE1, ..., (m=3,k=0) → PE3
    # tile (m=0,k=1) → PE4, tile (m=1,k=1) → PE5, ..., (m=3,k=1) → PE7
    # tile (m=0,k=2) → PE0, ...
-    assert shards[0].pe_index == 0
-    assert shards[1].pe_index == 1
-    assert shards[7].pe_index == 7
-    assert shards[8].pe_index == 0  # wraps around
+    assert shards[0].local_pe == 0
+    assert shards[1].local_pe == 1
+    assert shards[7].local_pe == 7
+    assert shards[8].local_pe == 0  # wraps around
    # total coverage
    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2

@@ -226,7 +228,11 @@ def test_tiled_row_major():
 def test_deploy_tensor_hbm():
    """Deploy with column_wise placement → TensorHandle with valid PA shards."""
    allocs = _make_allocators()
-    placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
+    placement = resolve_dp_policy(
+        DPPolicy(cube="replicate", pe="column_wise"),
+        shape=(1024, 512), itemsize=2,
+        num_pe=8, num_cubes=1, target_sip=0,
+    )
    th = deploy_tensor(
        name="W",
        shape=(1024, 512),
@@ -253,7 +259,7 @@ def test_deploy_tensor_hbm():
 def test_deploy_tensor_tcm():
    """Deploy with TCM → uses pe_tcm_addr allocation."""
    allocs = _make_allocators()
-    placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=256)]
+    placement = [ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=256)]
    th = deploy_tensor(
        name="small",
        shape=(128,),
@@ -271,7 +277,7 @@ def test_deploy_tensor_overflow():
    """Allocation exceeding PE HBM capacity raises AllocationError."""
    allocs = _make_allocators()
    # 6 GB per PE slice, try to allocate 7 GB
-    big_shard = ShardSpec(pe_index=0, offset_bytes=0, nbytes=7 * _GB)
+    big_shard = ShardSpec(sip=0, cube=0, pe=0, offset_bytes=0, nbytes=7 * _GB)
    with pytest.raises(AllocationError):
        deploy_tensor(
            name="toobig",