# kernbench2/tests/test_sip_parallel.py

"""Tests for SIP-level tensor parallelism — ADR-0026 structural model.

DPPolicy no longer carries a ``sip`` axis (ADR-0026 D1). SIP placement is
now expressed structurally: each call to ``resolve_dp_policy(target_sip=N)``
emits shards pinned to SIP N. Multi-SIP parallelism is composed by calling
the resolver once per SIP (typically driven by the ADR-0024 launcher, one
worker greenlet per rank, each worker using ``torch.ahbm.set_device(rank)``).

Covered here:
  SP1. ``target_sip`` stamps every shard.
  SP2. Two-SIP placement: union of two resolver calls covers the whole
       tensor K-axis when the combined bench treats them as column-split.
  SP3. Same for the row-wise case: the M-axis is split across two SIPs.
  SP4. Cube + PE sharding within a SIP remains correct across SIPs.
  SP5. PE_CPU num_programs contract (unchanged by ADR-0026).
"""
from __future__ import annotations

from kernbench.policy.placement.dp import DPPolicy, resolve_dp_policy


# ── SP1. target_sip stamps shards ────────────────────────────────────


def test_target_sip_stamps_all_shards():
    """SP1: a single resolver call pins every emitted shard to ``target_sip``.

    Resolves a (128, 256) tensor with 2-byte items over 8 PEs and one cube,
    targeting SIP 3, then checks the ``sip``, ``pe`` and ``cube`` fields of
    each returned shard.
    """
    pinned_sip = 3
    pe_count = 8
    policy = DPPolicy(cube="replicate", pe="column_wise")

    placed = resolve_dp_policy(
        policy,
        shape=(128, 256),
        itemsize=2,
        num_pe=pe_count,
        num_cubes=1,
        target_sip=pinned_sip,
    )

    # The SIP id on each shard must be exactly the requested target.
    for shard in placed:
        assert shard.sip == pinned_sip
    # PE indices must stay inside the requested PE count.
    for shard in placed:
        assert 0 <= shard.pe < pe_count
    # Only one cube was requested, so the test expects cube index 0 throughout.
    for shard in placed:
        assert shard.cube == 0


# ── SP2. column-wise placement composed across two SIPs ─────────────


def test_compose_two_sips_column_wise_covers_tensor():
    """SP2: two per-SIP resolver calls over K-halves account for the full tensor.

    The bench-side split is modelled by resolving a half-shape (second
    dimension halved) once per SIP. The test then checks that the shard byte
    counts from both SIPs add up to the byte size of the full tensor, and
    that each call's shards carry the SIP id they were resolved for.

    Only byte totals and SIP pinning are asserted here; no offset is passed
    to the resolver and shard offsets are not inspected.
    """
    itemsize = 2
    full_shape = (128, 256)
    # Each SIP receives half of the K (second) dimension.
    half_shape = (128, 128)
    policy = DPPolicy(cube="replicate", pe="column_wise")

    # One resolver call per SIP, keyed by the SIP id it targets.
    shards_by_sip = {
        sip: resolve_dp_policy(
            policy,
            shape=half_shape,
            itemsize=itemsize,
            num_pe=8,
            num_cubes=1,
            target_sip=sip,
        )
        for sip in (0, 1)
    }

    rows, cols = full_shape
    expected_bytes = rows * cols * itemsize
    placed_bytes = sum(
        shard.nbytes
        for shards in shards_by_sip.values()
        for shard in shards
    )
    assert placed_bytes == expected_bytes

    # Every shard stays pinned to the SIP its resolver call targeted.
    for sip, shards in shards_by_sip.items():
        assert all(shard.sip == sip for shard in shards)


# ── SP3. row-wise placement composed across two SIPs ────────────────


def test_compose_two_sips_row_wise_covers_tensor():
    """SP3: two per-SIP resolver calls over M-halves account for the full tensor.

    Mirrors the column-wise composition test, but the bench-side split halves
    the first (M) dimension instead of the second. Checks that the shard byte
    counts from both SIPs add up to the byte size of the full tensor and that
    each call's shards are pinned to the SIP it targeted.

    Only byte totals and SIP pinning are asserted; shard offsets are not
    inspected.
    """
    full_shape = (128, 256)
    itemsize = 2
    half_shape = (64, 256)  # per-SIP half of M
    # NOTE(review): "row-wise" here describes the cross-SIP split (M halved);
    # the intra-SIP PE policy is still "column_wise". Confirm this is the
    # intended policy and not a copy-paste from the column-wise test.
    dp = DPPolicy(cube="replicate", pe="column_wise")

    shards_sip0 = resolve_dp_policy(
        dp, shape=half_shape, itemsize=itemsize,
        num_pe=8, num_cubes=1, target_sip=0,
    )
    shards_sip1 = resolve_dp_policy(
        dp, shape=half_shape, itemsize=itemsize,
        num_pe=8, num_cubes=1, target_sip=1,
    )

    total_bytes = full_shape[0] * full_shape[1] * itemsize
    sip0_bytes = sum(s.nbytes for s in shards_sip0)
    sip1_bytes = sum(s.nbytes for s in shards_sip1)
    assert sip0_bytes + sip1_bytes == total_bytes
    # Same SIP-pinning checks as the column-wise composition test, so a
    # mis-stamped shard cannot hide behind a correct byte total.
    assert all(s.sip == 0 for s in shards_sip0)
    assert all(s.sip == 1 for s in shards_sip1)


# ── SP4. cube × PE sharding is independent per SIP ──────────────────


def test_cube_pe_sharding_independent_per_sip():
    """SP4: the cube/PE layout inside a SIP does not depend on which SIP it is.

    Resolves the same tensor with the same policy for SIP 0 and SIP 1
    (2 cubes, 4 PEs each) and checks that the two results have one shard per
    (cube, PE) pair and agree pairwise on ``cube``, ``pe``, ``offset_bytes``
    and ``nbytes``; only the ``sip`` field is expected to differ.
    """
    cube_count = 2
    pe_count = 4
    policy = DPPolicy(cube="column_wise", pe="column_wise")
    # Arguments shared by both resolver calls; only target_sip varies.
    common = dict(
        shape=(128, 256),
        itemsize=2,
        num_pe=pe_count,
        num_cubes=cube_count,
    )

    on_sip0 = resolve_dp_policy(policy, **common, target_sip=0)
    on_sip1 = resolve_dp_policy(policy, **common, target_sip=1)

    expected_count = cube_count * pe_count
    assert len(on_sip0) == len(on_sip1)
    assert len(on_sip1) == expected_count

    for left, right in zip(on_sip0, on_sip1):
        assert left.sip == 0
        assert right.sip == 1
        left_layout = (left.cube, left.pe, left.offset_bytes, left.nbytes)
        right_layout = (right.cube, right.pe, right.offset_bytes, right.nbytes)
        assert left_layout == right_layout


# ── SP5. PE_CPU num_programs (contract unchanged) ───────────────────


def test_pe_cpu_sets_num_programs():
    """SP5: ``TLContext`` echoes back the ids it was constructed with.

    Builds a context with ``pe_id=3`` and ``num_programs=8`` and checks that
    ``program_id(0)`` and ``num_programs(0)`` return those values. This is
    the contract PE_CPU relies on when it launches a kernel on behalf of its
    shards.
    """
    # Imported locally, as in the rest of this test, so the module-level
    # imports stay limited to the placement API.
    from kernbench.triton_emu.tl_context import TLContext

    pe_index = 3
    program_count = 8
    ctx = TLContext(pe_id=pe_index, num_programs=program_count)

    assert ctx.program_id(0) == pe_index
    assert ctx.num_programs(0) == program_count