"""Tests for ADR-0019 D1/D4 per-PE HBM partitioning. Restores the architectural property that was lost in commit 5917b34 (2026-04-04 "Replace xbar/bridge/single-NOC with explicit router mesh"), which over-consolidated 8 per-slice HBM CTRL nodes into one cube-wide HBM CTRL connected to every router. ADR-0019 D1/D4 specifies: - Each PE owns 8 of the cube's 64 pseudo-channels (PE_X → PCs 8X..8X+7). - HBM CTRL is split per-PE: ``hbm_ctrl.pe{X}`` is reachable ONLY through PE_X's attaching router. Accessing PE_Y's slice from PE_X requires mesh routing to r_Y_attach before entering hbm_ctrl.pe{Y}. These tests are written BEFORE the production change and are expected to FAIL on current code (HBM CTRL is a single ``hbm_ctrl`` node attached to all routers). Phase 2 must make them PASS without weakening assertions. """ from __future__ import annotations from pathlib import Path import pytest from kernbench.policy.address.phyaddr import PhysAddr from kernbench.policy.routing.router import AddressResolver, PathRouter from kernbench.runtime_api.kernel import PeDmaMsg from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import load_topology TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" def _graph(): return load_topology(TOPOLOGY_PATH) def _slice_bytes(spec: dict) -> int: mm = spec["cube"]["memory_map"] return mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"] def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, spec: dict) -> int: return PhysAddr.pe_hbm_addr( sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=offset, slice_size_bytes=_slice_bytes(spec), ).encode() # Mapping derived from topology.yaml + cube_mesh attach info PE_ATTACH_ROUTER = { 0: "r0c0", 1: "r0c1", 2: "r1c4", 3: "r1c5", 4: "r4c0", 5: "r4c1", 6: "r5c4", 7: "r5c5", } # ── 1. 
# Topology: 8 per-PE HBM CTRL nodes per cube ────────────────────


def test_topology_has_8_hbm_ctrl_per_cube():
    """Check that cube0 exposes ``hbm_ctrl.pe0`` .. ``hbm_ctrl.pe7``.

    Every per-PE controller node must be present and report ``num_pcs == 8``.
    The legacy cube-wide ``hbm_ctrl`` node must no longer be in the graph.
    """
    nodes = _graph().nodes

    for pe_idx in range(8):
        ctrl_id = f"sip0.cube0.hbm_ctrl.pe{pe_idx}"
        assert ctrl_id in nodes, (
            f"Expected per-PE HBM CTRL node {ctrl_id!r} (ADR-0019 D1)"
        )

        attrs = nodes[ctrl_id].attrs
        # A missing attribute counts as 0 for the check but is reported
        # as-is (None) in the failure message.
        pc_count = int(attrs.get("num_pcs", 0))
        assert pc_count == 8, (
            f"{ctrl_id} must have num_pcs=8; got {attrs.get('num_pcs')}"
        )

    # The consolidated controller must be gone once the per-PE ones exist.
    legacy_id = "sip0.cube0.hbm_ctrl"
    assert legacy_id not in nodes, (
        "Legacy single sip0.cube0.hbm_ctrl must be removed in favor of "
        "per-PE hbm_ctrl.pe{X} (ADR-0019 D1)"
    )


# ── 2. Each per-PE HBM CTRL connects ONLY to its PE's attaching router ─


def test_per_pe_hbm_ctrl_connects_only_to_owning_router():
    """Check that ``hbm_ctrl.pe{X}`` is wired to its owning router only.

    For each PE, the controller's complete set of incoming edge sources and
    outgoing edge destinations must each be exactly the PE's attaching
    router from ``PE_ATTACH_ROUTER``.
    """
    graph = _graph()
    # Unique (src, dst) pairs in first-seen order; parallel edges collapse.
    links = dict.fromkeys((edge.src, edge.dst) for edge in graph.edges)

    for pe_idx in range(8):
        ctrl_id = f"sip0.cube0.hbm_ctrl.pe{pe_idx}"
        owner = f"sip0.cube0.{PE_ATTACH_ROUTER[pe_idx]}"

        # Everything that feeds the controller / everything it feeds.
        feeders = [src for src, dst in links if dst == ctrl_id]
        sinks = [dst for src, dst in links if src == ctrl_id]

        assert feeders == [owner], (
            f"{ctrl_id} must have a single incoming edge from {owner}; got {feeders}"
        )
        assert sinks == [owner], (
            f"{ctrl_id} must have a single outgoing edge to {owner}; got {sinks}"
        )


# ── 3.
# Resolver: PA pe_id → correct hbm_ctrl.pe{X} ───────────────────


@pytest.mark.parametrize("pe_id", range(8))
def test_resolver_maps_pe_id_to_correct_hbm_ctrl(pe_id):
    """AddressResolver must dispatch an HBM PA to the target PE's hbm_ctrl.

    A physical address is built for ``pe_id`` at local offset 0x1000 with
    ``_hbm_pa``, decoded with ``PhysAddr.decode`` and resolved; the result
    must be ``sip0.cube0.hbm_ctrl.pe{pe_id}``.
    """
    graph = _graph()
    resolver = AddressResolver(graph)
    pa_val = _hbm_pa(sip=0, cube=0, pe_id=pe_id, offset=0x1000, spec=graph.spec)
    dst = resolver.resolve(PhysAddr.decode(pa_val))
    assert dst == f"sip0.cube0.hbm_ctrl.pe{pe_id}", (
        f"PA with pe_id={pe_id} must resolve to hbm_ctrl.pe{pe_id}; got {dst!r}"
    )


# ── 4. Path: PE_X → PE_X_slice is single mesh hop ────────────────────


def test_pe0_to_pe0_slice_is_single_mesh_hop():
    """PE0 accessing its OWN HBM slice must take exactly one router hop.

    The full expected path is pe0.pe_dma → r0c0 → hbm_ctrl.pe0.
    """
    router = PathRouter(_graph())
    dst = "sip0.cube0.hbm_ctrl.pe0"
    path = router.find_path("sip0.cube0.pe0", dst)
    expected = [
        "sip0.cube0.pe0.pe_dma",
        "sip0.cube0.r0c0",
        "sip0.cube0.hbm_ctrl.pe0",
    ]
    assert path == expected, (
        f"pe0 → pe0_slice path must be {expected}; got {path}"
    )


# ── 5. Path: PE_X → PE_Y_slice traverses mesh through r_Y_attach ─────


def test_pe0_to_pe7_slice_traverses_mesh_to_r5c5():
    """PE0 accessing PE7's slice must mesh-route to r5c5 first.

    r5c5 is PE7's attaching router, so the last two path nodes must be
    r5c5 → hbm_ctrl.pe7, the first node must be PE0's DMA engine, and the
    path must contain at least 5 nodes.
    """
    router = PathRouter(_graph())
    dst = "sip0.cube0.hbm_ctrl.pe7"
    path = router.find_path("sip0.cube0.pe0", dst)

    # Fail with a readable message instead of a TypeError/IndexError from
    # the indexing below when the router returns None or an empty path.
    assert path, f"No path found from sip0.cube0.pe0 to {dst}; got {path!r}"

    assert path[0] == "sip0.cube0.pe0.pe_dma", (
        f"Path must start at sip0.cube0.pe0.pe_dma; got {path[0]!r}"
    )
    assert path[-2:] == ["sip0.cube0.r5c5", "sip0.cube0.hbm_ctrl.pe7"], (
        f"Last 2 path nodes must be r5c5 → hbm_ctrl.pe7; got {path[-2:]}"
    )
    # Multi-hop mesh between r0c0 and r5c5
    assert len(path) >= 5, f"Cross-PE path must traverse mesh; got {len(path)} nodes"


# ── 6.
# End-to-end: cross-PE latency > local PE latency ───────────────


def _pe0_dma_latency_ns(
    *, dst_pe: int, nbytes: int, spec: dict, correlation_id: str
) -> float:
    """Issue one DMA from sip0/cube0/pe0 to ``dst_pe``'s HBM slice.

    A new ``GraphEngine`` is built on a freshly loaded topology for every
    call, so measurements do not share engine state.

    Args:
        dst_pe: PE whose HBM slice is targeted (local offset 0x1000).
        nbytes: Transfer size passed to ``PeDmaMsg``.
        spec: Topology spec used by ``_hbm_pa`` to size the HBM slice.
        correlation_id: Correlation id recorded on the message.

    Returns:
        ``trace["total_ns"]`` of the completed request, as a float.
    """
    engine = GraphEngine(_graph())
    pa = _hbm_pa(sip=0, cube=0, pe_id=dst_pe, offset=0x1000, spec=spec)
    msg = PeDmaMsg(
        correlation_id=correlation_id,
        request_id=f"to-pe{dst_pe}",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=pa,
        nbytes=nbytes,
    )
    handle = engine.submit(msg)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    return float(trace["total_ns"])


def test_pe_dma_cross_pe_slower_than_local():
    """Cross-PE HBM access must be measurably slower than local access.

    For a non-trivial transfer (16KB), PE0 → PE7_slice must measurably
    exceed PE0 → PE0_slice (the additional mesh hops add per-router
    overhead and consume per-link wire BW).
    """
    spec = _graph().spec
    nbytes = 16384

    local = _pe0_dma_latency_ns(
        dst_pe=0, nbytes=nbytes, spec=spec, correlation_id="per-pe-hbm"
    )
    cross = _pe0_dma_latency_ns(
        dst_pe=7, nbytes=nbytes, spec=spec, correlation_id="per-pe-hbm"
    )

    # Computed up front so a zero local latency cannot turn the failure
    # report into a ZeroDivisionError that hides the real assertion.
    ratio = cross / local if local else float("nan")
    assert cross > local * 1.05, (
        f"Cross-PE HBM access (pe0 → pe7_slice) must take measurably more "
        f"time than local (pe0 → pe0_slice). local={local:.2f}ns, "
        f"cross={cross:.2f}ns, ratio={ratio:.3f} (expected > 1.05)"
    )


# ── 7. Probe CLI monotonicity (existing case names) ──────────────────


def test_probe_cli_intra_cube_cases_are_monotonic():
    """Probe CLI cases must show monotonic latency.

    Expected ordering: pe-local-hbm < pe-same-half-hbm < pe-cross-half-hbm.

    Prior to per-PE partitioning these three return identical latency
    because all roads lead to the same hbm_ctrl. With ADR-0019 D4 restored,
    same-half (pe0→pe1) is 1 mesh hop further than local, and cross-half
    (pe0→pe4) is several hops further.
    """
    spec = _graph().spec
    nbytes = 32768

    def _run(dst_pe: int) -> float:
        return _pe0_dma_latency_ns(
            dst_pe=dst_pe, nbytes=nbytes, spec=spec, correlation_id="probe"
        )

    local = _run(0)  # pe-local-hbm
    same_half = _run(1)  # pe-same-half-hbm (adjacent: r0c0 → r0c1)
    cross_half = _run(4)  # pe-cross-half-hbm (r0c0 → r4c0)

    assert local < same_half < cross_half, (
        f"intra-cube DMA must be monotonic with mesh distance. "
        f"local={local:.2f}, same_half={same_half:.2f}, "
        f"cross_half={cross_half:.2f}"
    )