# kernbench2/tests/test_engine.py

import pytest

from pathlib import Path

from kernbench.common.types import Completion, RequestHandle
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import (
    KernelLaunchMsg,
    KernelRef,
    MemoryReadMsg,
    MemoryWriteMsg,
    ScalarArg,
    TensorArg,
    TensorArgShard,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology

TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"


def _engine():
    """Build a fresh GraphEngine over the topology loaded from TOPOLOGY_PATH."""
    return GraphEngine(load_topology(TOPOLOGY_PATH))


def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded physical address of a spot in one PE's HBM slice.

    The address is built for rack 0 at a fixed PE-local offset of 0x1000; the
    slice size passed along is 48 GB divided into 8 equal slices (6 GB each).
    """
    hbm_total_bytes = 48 * (1 << 30)
    slice_count = 8
    return PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=hbm_total_bytes // slice_count,
    ).encode()


def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Return the encoded physical address of cube SRAM at offset 0x800 (rack 0)."""
    return PhysAddr.cube_sram_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        sram_offset=0x800,
    ).encode()


# ── 1. submit returns handle ────────────────────────────────────────


def test_engine_submit_returns_handle():
    """Submitting a message yields a handle that is a non-empty ``str``."""
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r0",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = _engine().submit(write_msg)
    assert isinstance(handle, str)
    assert handle != ""


# ── 2. memory write completion ──────────────────────────────────────


def test_engine_memory_write_completion():
    """A submitted MemoryWrite finishes with a completion whose ``ok`` is True."""
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r1",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    engine.wait(handle)
    completion, _trace = engine.get_completion(handle)
    assert completion.ok is True


# ── 3. memory read completion ───────────────────────────────────────


def test_engine_memory_read_completion():
    """A submitted MemoryRead finishes with a completion whose ``ok`` is True."""
    engine = _engine()
    handle = engine.submit(
        MemoryReadMsg(
            correlation_id="c0",
            request_id="r2",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=_hbm_pa(),
            nbytes=4096,
        )
    )
    engine.wait(handle)
    completion, _trace = engine.get_completion(handle)
    assert completion.ok is True


# ── 4. latency positive ────────────────────────────────────────────


def test_engine_latency_positive():
    """The trace of a completed write reports ``total_ns`` > 0 (ADR-0002 D4)."""
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r3",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    engine.wait(handle)
    trace = engine.get_completion(handle)[1]
    assert trace["total_ns"] > 0


# ── 5. trace has total_ns and nbytes ───────────────────────────────


def test_engine_trace_has_total_ns_and_nbytes():
    """The trace exposes ``total_ns`` and ``nbytes``; ``nbytes`` echoes the request."""
    requested_bytes = 4096
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r4",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=requested_bytes,
            pattern="zero",
        )
    )
    engine.wait(handle)
    trace = engine.get_completion(handle)[1]
    # Check key presence first so a missing key fails as an assertion,
    # not as a KeyError on the value comparison below.
    assert "total_ns" in trace
    assert "nbytes" in trace
    assert trace["nbytes"] == requested_bytes


# ── 6. latency includes node overhead_ns ────────────────────────────


def test_engine_latency_includes_node_overhead_ns():
    """Total latency of a write is at least the summed per-node overhead floor."""
    # Expected floor: pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = 20 ns.
    # NOTE(review): these overheads are assumed to match the topology file;
    # confirm there if this bound ever fails.
    min_overhead_ns = 5.0 + 10.0 + 5.0

    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r7",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    engine.wait(handle)
    trace = engine.get_completion(handle)[1]
    assert trace["total_ns"] >= min_overhead_ns


# ── 7. concurrent requests ─────────────────────────────────────────


def test_engine_concurrent_requests():
    """Two writes submitted back-to-back both complete ok with positive latency."""
    engine = _engine()
    # One write per PE (pe0 and pe1) of the same cube.
    messages = [
        MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(pe_id=pe),
            nbytes=4096,
            pattern="zero",
        )
        for request_id, pe in (("r9a", 0), ("r9b", 1))
    ]

    # Submit everything before waiting on anything.
    handles = [engine.submit(message) for message in messages]
    for handle in handles:
        engine.wait(handle)
    results = [engine.get_completion(handle) for handle in handles]

    for completion, _ in results:
        assert completion.ok is True
    for _, trace in results:
        assert trace["total_ns"] > 0


# ── 8. kernel launch ───────────────────────────────────────────────


def test_engine_kernel_launch_simplified():
    """KernelLaunch of a registered kernel completes ok with latency > 0.

    Registers a throw-away ``gemm`` kernel, launches it with one tensor
    argument made of two shards (pe0 and pe1 of sip0/cube0), and checks the
    completion flag and the trace's ``total_ns``.

    The kernel registry is cleared before the test and again in a ``finally``
    clause, so a failing assertion or an engine error cannot leave the
    ``gemm`` kernel registered for later tests.
    """
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    def gemm_kernel(a_ptr, tl):
        # Minimal body: load a (4, 4) f16 tile and store it back unchanged.
        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
        tl.store(a_ptr, a)

    clear_registry()
    try:
        register_kernel("gemm", gemm_kernel)

        engine = _engine()
        shard0 = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0, cube=0, pe=1,
            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0", request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        h = engine.submit(msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Always undo the registration, even when the test body fails.
        clear_registry()


# ── 9. deterministic ───────────────────────────────────────────────


def test_engine_deterministic():
    """Running one request on two fresh engines yields identical ``total_ns``."""
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r11",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )

    latencies = []
    for _ in range(2):
        # Each run gets its own engine so no state carries over.
        engine = _engine()
        handle = engine.submit(msg)
        engine.wait(handle)
        _, trace = engine.get_completion(handle)
        latencies.append(trace["total_ns"])

    first_ns, second_ns = latencies
    assert first_ns == second_ns


# ── 10. DMA capacity serializes concurrent writes ──────────────────


def test_dma_capacity_serializes_concurrent():
    """Concurrent writes into one cube are slower than a lone write.

    A single write is timed on its own engine as the baseline. A second engine
    then receives two writes to the same cube (pe0 and pe1) before either is
    waited on. The test expects the DMA engine (capacity=1) to serialize them,
    so the slower of the two must exceed the baseline (ADR-0014 D4,
    ADR-0015 D5).
    """

    def make_write(request_id, pe):
        # All writes target sip0/cube0 and differ only in request id and PE.
        return MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Baseline: one isolated write.
    baseline_engine = _engine()
    baseline_handle = baseline_engine.submit(make_write("single", 0))
    baseline_engine.wait(baseline_handle)
    single_ns = baseline_engine.get_completion(baseline_handle)[1]["total_ns"]

    # Contended case: both writes are in flight before the first wait.
    shared_engine = _engine()
    write_a = make_write("conc-a", 0)
    write_b = make_write("conc-b", 1)
    handle_a = shared_engine.submit(write_a)
    handle_b = shared_engine.submit(write_b)
    shared_engine.wait(handle_a)
    shared_engine.wait(handle_b)
    trace_a = shared_engine.get_completion(handle_a)[1]
    trace_b = shared_engine.get_completion(handle_b)[1]

    # Only the slower request has to show the contention delay.
    max_ns = max(trace_a["total_ns"], trace_b["total_ns"])
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )


# ── 11. formula latency lower bound ──────────────────────────────


def test_formula_latency_lower_bound():
    """``_formula_latency`` is positive and never exceeds simulated latency.

    The formula is evaluated on the routed path from ``sip0.cube0.pe0`` to the
    node that pe1's HBM address resolves to, then compared with the
    ``total_ns`` of a simulated MemoryRead of the same address (ADR-0015 D7).

    NOTE(review): the earlier docstring called this the "PE DMA path", yet the
    simulated request is a MemoryReadMsg - confirm both cover the same route.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    transfer_bytes = 4096

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    # Analytical estimate along the routed path.
    target_pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    target_node = resolver.resolve(PhysAddr.decode(target_pa))
    route = router.find_path("sip0.cube0.pe0", target_node)
    formula = engine._formula_latency(route, transfer_bytes)

    # Simulated latency for a read of the same address.
    handle = engine.submit(
        MemoryReadMsg(
            correlation_id="c0",
            request_id="formula-lb",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=target_pa,
            nbytes=transfer_bytes,
            target_pe=1,
        )
    )
    engine.wait(handle)
    actual = engine.get_completion(handle)[1]["total_ns"]

    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"


def test_formula_latency_exact_no_contention():
    """For a lone PE DMA request the formula matches simulation within 0.01 ns.

    Only one request is submitted, so nothing competes with it; the analytical
    estimate for the routed path from ``sip0.cube0.pe0`` to pe0's HBM node is
    expected to agree with the simulated ``total_ns`` up to a small tolerance.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.runtime_api.kernel import PeDmaMsg

    transfer_bytes = 4096
    tolerance_ns = 0.01

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    # Analytical estimate along the routed path.
    target_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    target_node = resolver.resolve(PhysAddr.decode(target_pa))
    route = router.find_path("sip0.cube0.pe0", target_node)
    formula = engine._formula_latency(route, transfer_bytes)

    # Simulated latency for the single PE DMA request.
    handle = engine.submit(
        PeDmaMsg(
            correlation_id="c0",
            request_id="formula-exact",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            dst_pa=target_pa,
            nbytes=transfer_bytes,
        )
    )
    engine.wait(handle)
    actual = engine.get_completion(handle)[1]["total_ns"]

    # Compared with a tolerance rather than exact equality.
    deviation = abs(formula - actual)
    assert deviation < tolerance_ns, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )


# ── 12. remote cube access succeeds with higher latency ────────────


def test_engine_remote_cube_latency_higher():
    """Reading HBM on cube15 takes strictly longer than reading HBM on cube0.

    Each read runs on its own engine so the two cannot contend with each
    other. Per the original test notes, cube15 is the far corner of the 4x4
    mesh and needs more UCIe + NOC hops from the IO chiplet than the directly
    connected cube0.
    """

    def run_to_completion(engine, msg):
        # Submit, block until done, and hand back (completion, trace).
        handle = engine.submit(msg)
        engine.wait(handle)
        return engine.get_completion(handle)

    def read_of_cube(request_id, cube):
        # The two reads differ only in request id and the cube in the address.
        return MemoryReadMsg(
            correlation_id="c0",
            request_id=request_id,
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
        )

    engine_local = _engine()
    engine_remote = _engine()
    msg_local = read_of_cube("r14a", 0)
    msg_remote = read_of_cube("r14b", 15)

    _, t_local = run_to_completion(engine_local, msg_local)
    comp_remote, t_remote = run_to_completion(engine_remote, msg_remote)

    assert comp_remote.ok is True
    assert t_remote is not None
    assert t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )