# kernbench2/tests/test_per_pe_hbm_partition.py

"""Tests for ADR-0019 D1/D4 per-PE HBM partitioning.

Restores the architectural property that was lost in commit 5917b34
(2026-04-04 "Replace xbar/bridge/single-NOC with explicit router mesh"),
which over-consolidated 8 per-slice HBM CTRL nodes into one cube-wide
HBM CTRL connected to every router. ADR-0019 D1/D4 specifies:

- Each PE owns 8 of the cube's 64 pseudo-channels (PE_X → PCs 8X..8X+7).
- HBM CTRL is split per-PE: ``hbm_ctrl.pe{X}`` is reachable ONLY through
  PE_X's attaching router. Accessing PE_Y's slice from PE_X requires
  mesh routing to r_Y_attach before entering hbm_ctrl.pe{Y}.

These tests are written BEFORE the production change and are expected
to FAIL on current code (HBM CTRL is a single ``hbm_ctrl`` node attached
to all routers). Phase 2 must make them PASS without weakening
assertions.
"""
from __future__ import annotations

from pathlib import Path

import pytest

from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
from kernbench.runtime_api.kernel import PeDmaMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology

# topology.yaml sits one directory above the directory holding this test file.
TOPOLOGY_PATH = Path(__file__).parent.parent.joinpath("topology.yaml")


def _graph():
    """Return whatever ``load_topology`` builds from ``TOPOLOGY_PATH``.

    Each call goes through ``load_topology`` again, so callers that need an
    independent graph (e.g. one per engine run) simply call this repeatedly.
    """
    topology = load_topology(TOPOLOGY_PATH)
    return topology


def _slice_bytes(spec: dict) -> int:
    """Return the size in bytes of one HBM slice of a cube.

    Reads ``hbm_total_gb_per_cube`` and ``hbm_slices_per_cube`` from
    ``spec["cube"]["memory_map"]``. The total is scaled by 2**30 first and
    only then floor-divided by the slice count, so no precision is lost to
    an early division.
    """
    memory_map = spec["cube"]["memory_map"]
    bytes_per_gib = 1 << 30
    cube_bytes = memory_map["hbm_total_gb_per_cube"] * bytes_per_gib
    return cube_bytes // memory_map["hbm_slices_per_cube"]


def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, spec: dict) -> int:
    """Build an HBM physical address for a PE and return its encoded form.

    All arguments are keyword-only. ``sip``/``cube``/``pe_id`` are forwarded
    to ``PhysAddr.pe_hbm_addr`` as ``sip_id``/``die_id``/``pe_id``, ``offset``
    as ``pe_local_hbm_offset``, and the slice size is derived from ``spec``
    via ``_slice_bytes``. The result is the ``.encode()`` of that address.
    """
    slice_size = _slice_bytes(spec)
    address = PhysAddr.pe_hbm_addr(
        sip_id=sip,
        die_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=offset,
        slice_size_bytes=slice_size,
    )
    return address.encode()


# PE id -> short name of the router that PE attaches to (tests prepend the
# "sip0.cube0." prefix). Derived from topology.yaml + cube_mesh attach info.
PE_ATTACH_ROUTER = {
    0: "r0c0",
    1: "r0c1",
    2: "r1c4",
    3: "r1c5",
    4: "r4c0",
    5: "r4c1",
    6: "r5c4",
    7: "r5c5",
}


# ── 1. Topology: 8 per-PE HBM CTRL nodes per cube ────────────────────


def test_topology_has_8_hbm_ctrl_per_cube():
    """Check that sip0.cube0 exposes one HBM CTRL node per PE.

    For every PE index 0..7 the node ``sip0.cube0.hbm_ctrl.pe{N}`` must be
    present in ``graph.nodes`` and carry ``num_pcs == 8`` in its attrs.
    The un-suffixed legacy node ``sip0.cube0.hbm_ctrl`` must not be present.
    """
    nodes = _graph().nodes
    per_pe_ids = [f"sip0.cube0.hbm_ctrl.pe{pe}" for pe in range(8)]

    for nid in per_pe_ids:
        assert nid in nodes, (
            f"Expected per-PE HBM CTRL node {nid!r} (ADR-0019 D1)"
        )
        attrs = nodes[nid].attrs
        # A missing attribute counts as 0 pseudo-channels and fails below.
        pc_count = int(attrs.get("num_pcs", 0))
        assert pc_count == 8, (
            f"{nid} must have num_pcs=8; got {attrs.get('num_pcs')}"
        )

    # The consolidated cube-wide controller must be gone.
    legacy_id = "sip0.cube0.hbm_ctrl"
    assert legacy_id not in nodes, (
        "Legacy single sip0.cube0.hbm_ctrl must be removed in favor of "
        "per-PE hbm_ctrl.pe{X} (ADR-0019 D1)"
    )


# ── 2. Each per-PE HBM CTRL connects ONLY to its PE's attaching router ─


def test_per_pe_hbm_ctrl_connects_only_to_owning_router():
    """Check that each ``hbm_ctrl.pe{X}`` is wired solely to PE_X's router.

    Every edge touching the controller is considered, whatever the type of
    the node at the other end, so any neighbour other than the owning router
    fails the test. Parallel edges sharing the same (src, dst) pair are
    collapsed into one before the comparison.
    """
    graph = _graph()
    # Unique (src, dst) pairs in first-seen order.
    links = list(dict.fromkeys((edge.src, edge.dst) for edge in graph.edges))

    for pe in range(8):
        nid = f"sip0.cube0.hbm_ctrl.pe{pe}"
        owner = f"sip0.cube0.{PE_ATTACH_ROUTER[pe]}"

        # Sources of edges ending at the controller (router→hbm_ctrl).
        incoming = [src for src, dst in links if dst == nid]
        # Targets of edges leaving the controller (hbm_ctrl→router).
        outgoing = [dst for src, dst in links if src == nid]

        assert incoming == [owner], (
            f"{nid} must have a single incoming edge from {owner}; got {incoming}"
        )
        assert outgoing == [owner], (
            f"{nid} must have a single outgoing edge to {owner}; got {outgoing}"
        )


# ── 3. Resolver: PA pe_id → correct hbm_ctrl.pe{X} ───────────────────


@pytest.mark.parametrize("pe_id", list(range(8)))
def test_resolver_maps_pe_id_to_correct_hbm_ctrl(pe_id):
    """Check that an HBM address built for ``pe_id`` resolves to that PE's
    controller node, ``sip0.cube0.hbm_ctrl.pe{pe_id}``.

    The address is built with ``_hbm_pa`` at offset 0x1000, decoded back
    through ``PhysAddr.decode`` and handed to ``AddressResolver.resolve``.
    """
    graph = _graph()
    resolver = AddressResolver(graph)
    encoded = _hbm_pa(
        sip=0, cube=0, pe_id=pe_id, offset=0x1000, spec=graph.spec,
    )
    resolved = resolver.resolve(PhysAddr.decode(encoded))
    wanted = f"sip0.cube0.hbm_ctrl.pe{pe_id}"
    assert resolved == wanted, (
        f"PA with pe_id={pe_id} must resolve to hbm_ctrl.pe{pe_id}; got {resolved!r}"
    )


# ── 4. Path: PE_X → PE_X_slice is single mesh hop ────────────────────


def test_pe0_to_pe0_slice_is_single_mesh_hop():
    """Check that PE0 reaches its OWN HBM slice through a single router.

    ``PathRouter.find_path`` from ``sip0.cube0.pe0`` to
    ``sip0.cube0.hbm_ctrl.pe0`` must return exactly
    ``pe0.pe_dma → r0c0 → hbm_ctrl.pe0``; any extra hop fails the test.
    """
    router = PathRouter(_graph())
    dst = "sip0.cube0.hbm_ctrl.pe0"
    path = router.find_path("sip0.cube0.pe0", dst)
    expected = [
        "sip0.cube0.pe0.pe_dma",
        "sip0.cube0.r0c0",
        "sip0.cube0.hbm_ctrl.pe0",
    ]
    assert path == expected, (
        f"pe0 → pe0_slice path must be {expected}; got {path}"
    )


# ── 5. Path: PE_X → PE_Y_slice traverses mesh through r_Y_attach ─────


def test_pe0_to_pe7_slice_traverses_mesh_to_r5c5():
    """Check that PE0 reaches PE7's slice only via PE7's attaching router.

    The path from ``sip0.cube0.pe0`` to ``sip0.cube0.hbm_ctrl.pe7`` must start
    at PE0's ``pe_dma``, end with ``r5c5 → hbm_ctrl.pe7``, and contain at
    least five nodes in total.
    """
    route = PathRouter(_graph()).find_path(
        "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe7",
    )

    assert route[0] == "sip0.cube0.pe0.pe_dma"

    tail = route[-2:]
    assert tail == ["sip0.cube0.r5c5", "sip0.cube0.hbm_ctrl.pe7"], (
        f"Last 2 path nodes must be r5c5 → hbm_ctrl.pe7; got {tail}"
    )

    # Five nodes means at least one node lies between r0c0 and r5c5.
    node_count = len(route)
    assert node_count >= 5, f"Cross-PE path must traverse mesh; got {node_count} nodes"


# ── 6. End-to-end: cross-PE latency > local PE latency ───────────────


def test_pe_dma_cross_pe_slower_than_local():
    """Check that a 16 KB DMA from PE0 to PE7's slice is measurably slower
    than the same DMA to PE0's own slice.

    Each measurement runs on its own ``GraphEngine`` built from a newly
    loaded topology. The cross-PE latency must exceed the local latency by
    more than 5%.
    """
    spec = _graph().spec
    nbytes = 16384

    def _run(dst_pe: int) -> float:
        """Submit one PE0-sourced DMA to ``dst_pe``'s slice and return the
        ``total_ns`` value from its completion trace."""
        engine = GraphEngine(_graph())
        pa = _hbm_pa(sip=0, cube=0, pe_id=dst_pe, offset=0x1000, spec=spec)
        msg = PeDmaMsg(
            correlation_id="per-pe-hbm", request_id=f"to-pe{dst_pe}",
            src_sip=0, src_cube=0, src_pe=0,
            dst_pa=pa, nbytes=nbytes,
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        return float(trace["total_ns"])

    local = _run(0)
    cross = _run(7)
    # Pre-format the ratio with a guard: if the local latency is 0 the
    # assertion below must fail with its own message, not be masked by a
    # ZeroDivisionError raised while building that message.
    ratio = f"{cross / local:.3f}" if local else "n/a"
    assert cross > local * 1.05, (
        f"Cross-PE HBM access (pe0 → pe7_slice) must take measurably more "
        f"time than local (pe0 → pe0_slice). local={local:.2f}ns, "
        f"cross={cross:.2f}ns, ratio={ratio} (expected > 1.05)"
    )


# ── 7. Probe CLI monotonicity (existing case names) ──────────────────


def test_probe_cli_intra_cube_cases_are_monotonic():
    """Check that intra-cube DMA latency grows with mesh distance:
        pe-local-hbm < pe-same-half-hbm < pe-cross-half-hbm.

    The three cases are reproduced here by submitting a 32 KB ``PeDmaMsg``
    from PE0 straight to a ``GraphEngine`` (targets: PE0, PE1 and PE4
    slices); the probe CLI itself is not invoked.

    Rationale recorded with the original test: before per-PE partitioning
    all three cases returned identical latency because every route ended at
    the same hbm_ctrl; with ADR-0019 D4 restored, same-half (pe0→pe1) is one
    mesh hop further than local and cross-half (pe0→pe4) several hops
    further.
    """
    spec = _graph().spec
    nbytes = 32768

    def _run(dst_pe: int) -> float:
        """Run one PE0-sourced DMA on a newly built engine and return the
        ``total_ns`` value from its completion trace."""
        engine = GraphEngine(_graph())
        request = PeDmaMsg(
            correlation_id="probe",
            request_id=f"to-pe{dst_pe}",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            dst_pa=_hbm_pa(
                sip=0, cube=0, pe_id=dst_pe, offset=0x1000, spec=spec,
            ),
            nbytes=nbytes,
        )
        handle = engine.submit(request)
        engine.wait(handle)
        completion = engine.get_completion(handle)
        _, trace = completion
        return float(trace["total_ns"])

    # Targets in order: pe-local-hbm (PE0), pe-same-half-hbm (PE1, adjacent:
    # r0c0 → r0c1), pe-cross-half-hbm (PE4, r0c0 → r4c0).
    local, same_half, cross_half = [_run(target) for target in (0, 1, 4)]

    assert local < same_half < cross_half, (
        f"intra-cube DMA must be monotonic with mesh distance. "
        f"local={local:.2f}, same_half={same_half:.2f}, "
        f"cross_half={cross_half:.2f}"
    )