Files
kernbench2/tests/test_engine.py
T
2026-03-18 11:47:48 -07:00

406 lines
13 KiB
Python

import pytest
from pathlib import Path
from kernbench.common.types import Completion, RequestHandle
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import (
KernelLaunchMsg,
KernelRef,
MemoryReadMsg,
MemoryWriteMsg,
ScalarArg,
TensorArg,
TensorArgShard,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
def _engine():
graph = load_topology(TOPOLOGY_PATH)
return GraphEngine(graph)
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
"""Create an HBM physical address targeting a specific PE's HBM slice."""
# 48 GB / 8 slices = 6 GB per slice
slice_bytes = 48 * (1 << 30) // 8
pa = PhysAddr.pe_hbm_addr(
rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
)
return pa.encode()
def _sram_pa(sip: int = 0, cube: int = 0) -> int:
"""Create an SRAM physical address."""
pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=sip, cube_id=cube, sram_offset=0x800)
return pa.encode()
# ── 1. submit returns handle ────────────────────────────────────────
def test_engine_submit_returns_handle():
"""submit() must return a RequestHandle (non-empty string)."""
engine = _engine()
msg = MemoryWriteMsg(
correlation_id="c0", request_id="r0",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
handle = engine.submit(msg)
assert isinstance(handle, str)
assert len(handle) > 0
# ── 2. memory write completion ──────────────────────────────────────
def test_engine_memory_write_completion():
"""MemoryWrite must complete with ok=True."""
engine = _engine()
msg = MemoryWriteMsg(
correlation_id="c0", request_id="r1",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
h = engine.submit(msg)
engine.wait(h)
comp, trace = engine.get_completion(h)
assert comp.ok is True
# ── 3. memory read completion ───────────────────────────────────────
def test_engine_memory_read_completion():
"""MemoryRead must complete with ok=True."""
engine = _engine()
msg = MemoryReadMsg(
correlation_id="c0", request_id="r2",
src_sip=0, src_cube=0, src_pe=0,
src_pa=_hbm_pa(), nbytes=4096,
)
h = engine.submit(msg)
engine.wait(h)
comp, trace = engine.get_completion(h)
assert comp.ok is True
# ── 4. latency positive ────────────────────────────────────────────
def test_engine_latency_positive():
"""Trace total_ns must be > 0 (ADR-0002 D4)."""
engine = _engine()
msg = MemoryWriteMsg(
correlation_id="c0", request_id="r3",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
h = engine.submit(msg)
engine.wait(h)
_, trace = engine.get_completion(h)
assert trace["total_ns"] > 0
# ── 5. trace has total_ns and nbytes ───────────────────────────────
def test_engine_trace_has_total_ns_and_nbytes():
"""Trace must contain 'total_ns' and 'nbytes'."""
engine = _engine()
msg = MemoryWriteMsg(
correlation_id="c0", request_id="r4",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
h = engine.submit(msg)
engine.wait(h)
_, trace = engine.get_completion(h)
assert "total_ns" in trace
assert "nbytes" in trace
assert trace["nbytes"] == 4096
# ── 6. latency includes node overhead_ns ────────────────────────────
def test_engine_latency_includes_node_overhead_ns():
"""Path traverses components with overhead_ns > 0, so total >= some minimum."""
engine = _engine()
msg = MemoryWriteMsg(
correlation_id="c0", request_id="r7",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
h = engine.submit(msg)
engine.wait(h)
_, trace = engine.get_completion(h)
# pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
assert trace["total_ns"] >= 20.0
# ── 7. concurrent requests ─────────────────────────────────────────
def test_engine_concurrent_requests():
"""Two requests submitted before wait must both complete with traces."""
engine = _engine()
msg1 = MemoryWriteMsg(
correlation_id="c0", request_id="r9a",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
msg2 = MemoryWriteMsg(
correlation_id="c0", request_id="r9b",
dst_sip=0, dst_cube=0, dst_pe=1,
dst_pa=_hbm_pa(pe_id=1), nbytes=4096, pattern="zero",
)
h1 = engine.submit(msg1)
h2 = engine.submit(msg2)
engine.wait(h1)
engine.wait(h2)
comp1, trace1 = engine.get_completion(h1)
comp2, trace2 = engine.get_completion(h2)
assert comp1.ok is True
assert comp2.ok is True
assert trace1["total_ns"] > 0
assert trace2["total_ns"] > 0
# ── 8. kernel launch ───────────────────────────────────────────────
def test_engine_kernel_launch_simplified():
"""KernelLaunch returns latency > 0."""
from kernbench.triton_emu.registry import clear_registry, register_kernel
clear_registry()
hbm_pa = _hbm_pa(pe_id=0)
def gemm_kernel(a_ptr, tl):
a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
tl.store(a_ptr, a)
register_kernel("gemm", gemm_kernel)
engine = _engine()
shard0 = TensorArgShard(
sip=0, cube=0, pe=0,
pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
)
shard1 = TensorArgShard(
sip=0, cube=0, pe=1,
pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
)
msg = KernelLaunchMsg(
correlation_id="c0", request_id="r10",
kernel_ref=KernelRef(name="gemm", kind="builtin"),
args=(TensorArg(shards=(shard0, shard1)),),
)
h = engine.submit(msg)
engine.wait(h)
comp, trace = engine.get_completion(h)
assert comp.ok is True
assert trace["total_ns"] > 0
clear_registry()
# ── 9. deterministic ───────────────────────────────────────────────
def test_engine_deterministic():
"""Same request on two engines must produce identical latency."""
msg = MemoryWriteMsg(
correlation_id="c0", request_id="r11",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
)
e1 = _engine()
h1 = e1.submit(msg)
e1.wait(h1)
_, t1 = e1.get_completion(h1)
e2 = _engine()
h2 = e2.submit(msg)
e2.wait(h2)
_, t2 = e2.get_completion(h2)
assert t1["total_ns"] == t2["total_ns"]
# ── 10. remote cube access succeeds with higher latency ────────────
def test_dma_capacity_serializes_concurrent():
"""Two concurrent DMA writes to the same cube must contend at DMA capacity=1.
When two MemoryWrite requests target the same cube's M_CPU simultaneously,
the DMA engine (capacity=1) serializes them. The slower request must take
longer than a single isolated request (ADR-0014 D4, ADR-0015 D5).
"""
# Single isolated write baseline
engine_single = _engine()
msg_single = MemoryWriteMsg(
correlation_id="c0", request_id="single",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
pattern="zero", target_pe=0,
)
h1 = engine_single.submit(msg_single)
engine_single.wait(h1)
_, t1 = engine_single.get_completion(h1)
single_ns = t1["total_ns"]
# Two concurrent writes to same cube (different PEs) → DMA contention
engine_conc = _engine()
msg_a = MemoryWriteMsg(
correlation_id="c0", request_id="conc-a",
dst_sip=0, dst_cube=0, dst_pe=0,
dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
pattern="zero", target_pe=0,
)
msg_b = MemoryWriteMsg(
correlation_id="c0", request_id="conc-b",
dst_sip=0, dst_cube=0, dst_pe=1,
dst_pa=_hbm_pa(sip=0, cube=0, pe_id=1), nbytes=4096,
pattern="zero", target_pe=1,
)
ha = engine_conc.submit(msg_a)
hb = engine_conc.submit(msg_b)
engine_conc.wait(ha)
engine_conc.wait(hb)
_, ta = engine_conc.get_completion(ha)
_, tb = engine_conc.get_completion(hb)
# At least one must be delayed by DMA contention
max_ns = max(ta["total_ns"], tb["total_ns"])
assert max_ns > single_ns, (
f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
f"due to DMA capacity=1 contention"
)
# ── 11. formula latency lower bound ──────────────────────────────
def test_formula_latency_lower_bound():
"""_formula_latency must be <= actual latency (ADR-0015 D7).
Uses PE DMA path which is fully known at engine level.
"""
from kernbench.policy.address.phyaddr import PhysAddr as PA
from kernbench.policy.routing.router import AddressResolver, PathRouter
from kernbench.topology.builder import load_topology as lt
graph = lt(TOPOLOGY_PATH)
engine = GraphEngine(graph)
resolver = AddressResolver(graph)
router = PathRouter(graph)
pa = _hbm_pa(sip=0, cube=0, pe_id=1)
pa_obj = PA.decode(pa)
dst_node = resolver.resolve(pa_obj)
pe_ref = "sip0.cube0.pe0"
path = router.find_path(pe_ref, dst_node)
formula = engine._formula_latency(path, 4096)
# Run actual simulation
msg = MemoryReadMsg(
correlation_id="c0", request_id="formula-lb",
src_sip=0, src_cube=0, src_pe=0,
src_pa=pa, nbytes=4096, target_pe=1,
)
h = engine.submit(msg)
engine.wait(h)
_, trace = engine.get_completion(h)
actual = trace["total_ns"]
assert formula <= actual, (
f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
)
assert formula > 0, "formula must be > 0"
def test_formula_latency_exact_no_contention():
"""With no contention, formula should approximate actual for PE DMA.
PE DMA is single-request with no fan-out or aggregation,
so formula ≈ actual (within small tolerance for SimPy scheduling).
"""
from kernbench.runtime_api.kernel import PeDmaMsg
from kernbench.policy.address.phyaddr import PhysAddr as PA
from kernbench.policy.routing.router import AddressResolver, PathRouter
from kernbench.topology.builder import load_topology as lt
graph = lt(TOPOLOGY_PATH)
engine = GraphEngine(graph)
resolver = AddressResolver(graph)
router = PathRouter(graph)
pa = _hbm_pa(sip=0, cube=0, pe_id=0)
pa_obj = PA.decode(pa)
dst_node = resolver.resolve(pa_obj)
pe_ref = "sip0.cube0.pe0"
path = router.find_path(pe_ref, dst_node)
formula = engine._formula_latency(path, 4096)
msg = PeDmaMsg(
correlation_id="c0", request_id="formula-exact",
src_sip=0, src_cube=0, src_pe=0,
dst_pa=pa, nbytes=4096,
)
h = engine.submit(msg)
engine.wait(h)
_, trace = engine.get_completion(h)
actual = trace["total_ns"]
# No contention: formula should equal actual
assert abs(formula - actual) < 0.01, (
f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
)
# ── 10. remote cube access succeeds with higher latency ────────────
def test_engine_remote_cube_latency_higher():
"""Accessing a distant cube's HBM must have strictly higher latency than local.
Uses separate engines to avoid contention effects.
cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops
from IO chiplet compared to cube0 (directly connected).
"""
engine_local = _engine()
engine_remote = _engine()
msg_local = MemoryReadMsg(
correlation_id="c0", request_id="r14a",
src_sip=0, src_cube=0, src_pe=0,
src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
)
msg_remote = MemoryReadMsg(
correlation_id="c0", request_id="r14b",
src_sip=0, src_cube=0, src_pe=0,
src_pa=_hbm_pa(sip=0, cube=15, pe_id=0), nbytes=4096,
)
h_local = engine_local.submit(msg_local)
engine_local.wait(h_local)
_, t_local = engine_local.get_completion(h_local)
h_remote = engine_remote.submit(msg_remote)
engine_remote.wait(h_remote)
comp_remote, t_remote = engine_remote.get_completion(h_remote)
assert comp_remote.ok is True
assert t_remote is not None and t_local is not None
assert t_remote["total_ns"] > t_local["total_ns"], (
f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
)