import pytest from pathlib import Path from kernbench.common.types import Completion, RequestHandle from kernbench.policy.address.phyaddr import PhysAddr from kernbench.runtime_api.kernel import ( KernelLaunchMsg, KernelRef, MemoryReadMsg, MemoryWriteMsg, ScalarArg, TensorArg, TensorArgShard, ) from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import load_topology TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" def _engine(): graph = load_topology(TOPOLOGY_PATH) return GraphEngine(graph) def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: """Create an HBM physical address targeting a specific PE's HBM slice.""" # 48 GB / 8 slices = 6 GB per slice slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() def _sram_pa(sip: int = 0, cube: int = 0) -> int: """Create an SRAM physical address.""" pa = PhysAddr.cube_sram_addr(sip_id=sip, die_id=cube, sram_offset=0x800) return pa.encode() # ── 1. submit returns handle ──────────────────────────────────────── def test_engine_submit_returns_handle(): """submit() must return a RequestHandle (non-empty string).""" engine = _engine() msg = MemoryWriteMsg( correlation_id="c0", request_id="r0", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) handle = engine.submit(msg) assert isinstance(handle, str) assert len(handle) > 0 # ── 2. memory write completion ────────────────────────────────────── def test_engine_memory_write_completion(): """MemoryWrite must complete with ok=True.""" engine = _engine() msg = MemoryWriteMsg( correlation_id="c0", request_id="r1", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True # ── 3. memory read completion ─────────────────────────────────────── def test_engine_memory_read_completion(): """MemoryRead must complete with ok=True.""" engine = _engine() msg = MemoryReadMsg( correlation_id="c0", request_id="r2", src_sip=0, src_cube=0, src_pe=0, src_pa=_hbm_pa(), nbytes=4096, ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True # ── 4. latency positive ──────────────────────────────────────────── def test_engine_latency_positive(): """Trace total_ns must be > 0 (ADR-0002 D4).""" engine = _engine() msg = MemoryWriteMsg( correlation_id="c0", request_id="r3", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) h = engine.submit(msg) engine.wait(h) _, trace = engine.get_completion(h) assert trace["total_ns"] > 0 # ── 5. trace has total_ns and nbytes ─────────────────────────────── def test_engine_trace_has_total_ns_and_nbytes(): """Trace must contain 'total_ns' and 'nbytes'.""" engine = _engine() msg = MemoryWriteMsg( correlation_id="c0", request_id="r4", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) h = engine.submit(msg) engine.wait(h) _, trace = engine.get_completion(h) assert "total_ns" in trace assert "nbytes" in trace assert trace["nbytes"] == 4096 # ── 6. latency includes node overhead_ns ──────────────────────────── def test_engine_latency_includes_node_overhead_ns(): """Path traverses components with overhead_ns > 0, so total >= some minimum.""" engine = _engine() msg = MemoryWriteMsg( correlation_id="c0", request_id="r7", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) h = engine.submit(msg) engine.wait(h) _, trace = engine.get_completion(h) # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns assert trace["total_ns"] >= 20.0 # ── 7. concurrent requests ───────────────────────────────────────── def test_engine_concurrent_requests(): """Two requests submitted before wait must both complete with traces.""" engine = _engine() msg1 = MemoryWriteMsg( correlation_id="c0", request_id="r9a", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) msg2 = MemoryWriteMsg( correlation_id="c0", request_id="r9b", dst_sip=0, dst_cube=0, dst_pe=1, dst_pa=_hbm_pa(pe_id=1), nbytes=4096, pattern="zero", ) h1 = engine.submit(msg1) h2 = engine.submit(msg2) engine.wait(h1) engine.wait(h2) comp1, trace1 = engine.get_completion(h1) comp2, trace2 = engine.get_completion(h2) assert comp1.ok is True assert comp2.ok is True assert trace1["total_ns"] > 0 assert trace2["total_ns"] > 0 # ── 8. kernel launch ─────────────────────────────────────────────── def test_engine_kernel_launch_simplified(): """KernelLaunch returns latency > 0.""" from kernbench.triton_emu.registry import clear_registry, register_kernel clear_registry() hbm_pa = _hbm_pa(pe_id=0) def gemm_kernel(a_ptr, tl): a = tl.load(a_ptr, shape=(4, 4), dtype="f16") tl.store(a_ptr, a) register_kernel("gemm", gemm_kernel) engine = _engine() shard0 = TensorArgShard( sip=0, cube=0, pe=0, pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0, ) shard1 = TensorArgShard( sip=0, cube=0, pe=1, pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096, ) msg = KernelLaunchMsg( correlation_id="c0", request_id="r10", kernel_ref=KernelRef(name="gemm", kind="builtin"), args=(TensorArg(shards=(shard0, shard1)),), ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True assert trace["total_ns"] > 0 clear_registry() # ── 9. deterministic ─────────────────────────────────────────────── def test_engine_deterministic(): """Same request on two engines must produce identical latency.""" msg = MemoryWriteMsg( correlation_id="c0", request_id="r11", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(), nbytes=4096, pattern="zero", ) e1 = _engine() h1 = e1.submit(msg) e1.wait(h1) _, t1 = e1.get_completion(h1) e2 = _engine() h2 = e2.submit(msg) e2.wait(h2) _, t2 = e2.get_completion(h2) assert t1["total_ns"] == t2["total_ns"] # ── 10. remote cube access succeeds with higher latency ──────────── def test_dma_capacity_serializes_concurrent(): """Two concurrent DMA writes to the same cube must contend at DMA capacity=1. When two MemoryWrite requests target the same cube's M_CPU simultaneously, the DMA engine (capacity=1) serializes them. The slower request must take longer than a single isolated request (ADR-0014 D4, ADR-0015 D5). """ # Single isolated write baseline engine_single = _engine() msg_single = MemoryWriteMsg( correlation_id="c0", request_id="single", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) h1 = engine_single.submit(msg_single) engine_single.wait(h1) _, t1 = engine_single.get_completion(h1) single_ns = t1["total_ns"] # Two concurrent writes to same cube (different PEs) → DMA contention engine_conc = _engine() msg_a = MemoryWriteMsg( correlation_id="c0", request_id="conc-a", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) msg_b = MemoryWriteMsg( correlation_id="c0", request_id="conc-b", dst_sip=0, dst_cube=0, dst_pe=1, dst_pa=_hbm_pa(sip=0, cube=0, pe_id=1), nbytes=4096, pattern="zero", target_pe=1, ) ha = engine_conc.submit(msg_a) hb = engine_conc.submit(msg_b) engine_conc.wait(ha) engine_conc.wait(hb) _, ta = engine_conc.get_completion(ha) _, tb = engine_conc.get_completion(hb) # At least one must be delayed by DMA contention max_ns = max(ta["total_ns"], tb["total_ns"]) assert max_ns > single_ns, ( f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) " f"due to DMA capacity=1 contention" ) # ── 11. formula latency lower bound ────────────────────────────── def test_formula_latency_lower_bound(): """_formula_latency must be <= actual latency (ADR-0015 D7). Uses PE DMA path which is fully known at engine level. """ from kernbench.policy.address.phyaddr import PhysAddr as PA from kernbench.policy.routing.router import AddressResolver, PathRouter from kernbench.topology.builder import load_topology as lt graph = lt(TOPOLOGY_PATH) engine = GraphEngine(graph) resolver = AddressResolver(graph) router = PathRouter(graph) pa = _hbm_pa(sip=0, cube=0, pe_id=1) pa_obj = PA.decode(pa) dst_node = resolver.resolve(pa_obj) pe_ref = "sip0.cube0.pe0" path = router.find_path(pe_ref, dst_node) formula = engine._formula_latency(path, 4096) # Run actual simulation msg = MemoryReadMsg( correlation_id="c0", request_id="formula-lb", src_sip=0, src_cube=0, src_pe=0, src_pa=pa, nbytes=4096, target_pe=1, ) h = engine.submit(msg) engine.wait(h) _, trace = engine.get_completion(h) actual = trace["total_ns"] assert formula <= actual, ( f"formula ({formula:.2f}) must <= actual ({actual:.2f})" ) assert formula > 0, "formula must be > 0" def test_formula_latency_lower_bound_no_contention(): """With no contention, formula is a lower bound for PE DMA. PE DMA routes through NOC, which applies internal mesh traversal latency (XY routing based on physical positions) not captured by the formula (NOC edges have distance_mm=0 since NOC is distributed). Formula <= actual is the invariant. """ from kernbench.runtime_api.kernel import PeDmaMsg from kernbench.policy.address.phyaddr import PhysAddr as PA from kernbench.policy.routing.router import AddressResolver, PathRouter from kernbench.topology.builder import load_topology as lt graph = lt(TOPOLOGY_PATH) engine = GraphEngine(graph) resolver = AddressResolver(graph) router = PathRouter(graph) pa = _hbm_pa(sip=0, cube=0, pe_id=0) pa_obj = PA.decode(pa) dst_node = resolver.resolve(pa_obj) pe_ref = "sip0.cube0.pe0" path = router.find_path(pe_ref, dst_node) formula = engine._formula_latency(path, 4096) msg = PeDmaMsg( correlation_id="c0", request_id="formula-exact", src_sip=0, src_cube=0, src_pe=0, dst_pa=pa, nbytes=4096, ) h = engine.submit(msg) engine.wait(h) _, trace = engine.get_completion(h) actual = trace["total_ns"] # Formula is a lower bound; NOC internal traversal adds latency assert formula <= actual + 0.01, ( f"formula ({formula:.4f}) must be <= actual ({actual:.4f})" ) assert actual > 0 # ── 10. remote cube access succeeds with higher latency ──────────── def test_engine_remote_cube_latency_higher(): """Accessing a distant cube's HBM must have strictly higher latency than local. Uses separate engines to avoid contention effects. cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops from IO chiplet compared to cube0 (directly connected). """ engine_local = _engine() engine_remote = _engine() msg_local = MemoryReadMsg( correlation_id="c0", request_id="r14a", src_sip=0, src_cube=0, src_pe=0, src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, ) msg_remote = MemoryReadMsg( correlation_id="c0", request_id="r14b", src_sip=0, src_cube=0, src_pe=0, src_pa=_hbm_pa(sip=0, cube=15, pe_id=0), nbytes=4096, ) h_local = engine_local.submit(msg_local) engine_local.wait(h_local) _, t_local = engine_local.get_completion(h_local) h_remote = engine_remote.submit(msg_remote) engine_remote.wait(h_remote) comp_remote, t_remote = engine_remote.get_completion(h_remote) assert comp_remote.ok is True assert t_remote is not None and t_local is not None assert t_remote["total_ns"] > t_local["total_ns"], ( f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}" )