"""Tests for GraphEngine request submission, completion, and latency traces.

Each test builds a fresh engine from ``topology.yaml`` and drives it through
the ``submit`` / ``wait`` / ``get_completion`` calls.
"""

from pathlib import Path

import pytest

from kernbench.common.types import Completion, RequestHandle
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import (
    KernelLaunchMsg,
    KernelRef,
    MemoryReadMsg,
    MemoryWriteMsg,
    ScalarArg,
    TensorArg,
    TensorArgShard,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology

TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"


def _engine() -> GraphEngine:
    """Build a fresh GraphEngine from the topology file at TOPOLOGY_PATH."""
    graph = load_topology(TOPOLOGY_PATH)
    return GraphEngine(graph)


def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Create an HBM physical address targeting a specific PE's HBM slice."""
    # 48 GB / 8 slices = 6 GB per slice
    slice_bytes = 48 * (1 << 30) // 8
    pa = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=slice_bytes,
    )
    return pa.encode()


def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Create an SRAM physical address."""
    pa = PhysAddr.cube_sram_addr(
        rack_id=0, sip_id=sip, cube_id=cube, sram_offset=0x800
    )
    return pa.encode()


def _run(engine, msg):
    """Submit *msg*, wait for it, and return ``engine.get_completion(handle)``.

    The tests unpack the result as ``(completion, trace)``. Only suitable for
    single-request scenarios: tests that need several requests in flight at
    once call ``submit`` / ``wait`` explicitly instead.
    """
    handle = engine.submit(msg)
    engine.wait(handle)
    return engine.get_completion(handle)


# ── 1. submit returns handle ────────────────────────────────────────


def test_engine_submit_returns_handle():
    """submit() must return a RequestHandle (non-empty string)."""
    engine = _engine()
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r0",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(msg)
    assert isinstance(handle, str)
    assert len(handle) > 0


# ── 2. memory write completion ──────────────────────────────────────


def test_engine_memory_write_completion():
    """MemoryWrite must complete with ok=True."""
    engine = _engine()
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r1",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    comp, _ = _run(engine, msg)
    assert comp.ok is True


# ── 3. memory read completion ───────────────────────────────────────


def test_engine_memory_read_completion():
    """MemoryRead must complete with ok=True."""
    engine = _engine()
    msg = MemoryReadMsg(
        correlation_id="c0",
        request_id="r2",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(),
        nbytes=4096,
    )
    comp, _ = _run(engine, msg)
    assert comp.ok is True


# ── 4. latency positive ────────────────────────────────────────────


def test_engine_latency_positive():
    """Trace total_ns must be > 0 (ADR-0002 D4)."""
    engine = _engine()
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r3",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    _, trace = _run(engine, msg)
    assert trace["total_ns"] > 0


# ── 5. trace has total_ns and nbytes ───────────────────────────────


def test_engine_trace_has_total_ns_and_nbytes():
    """Trace must contain 'total_ns' and 'nbytes'."""
    engine = _engine()
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r4",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    _, trace = _run(engine, msg)
    assert "total_ns" in trace
    assert "nbytes" in trace
    assert trace["nbytes"] == 4096


# ── 6. latency includes node overhead_ns ────────────────────────────


def test_engine_latency_includes_node_overhead_ns():
    """Path traverses components with overhead_ns > 0, so total >= some minimum."""
    engine = _engine()
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r7",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    _, trace = _run(engine, msg)
    # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
    assert trace["total_ns"] >= 20.0


# ── 7. concurrent requests ─────────────────────────────────────────


def test_engine_concurrent_requests():
    """Two requests submitted before wait must both complete with traces."""
    engine = _engine()
    msg1 = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r9a",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    msg2 = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r9b",
        dst_sip=0,
        dst_cube=0,
        dst_pe=1,
        dst_pa=_hbm_pa(pe_id=1),
        nbytes=4096,
        pattern="zero",
    )
    # Both submissions happen before either wait so the requests overlap.
    h1 = engine.submit(msg1)
    h2 = engine.submit(msg2)
    engine.wait(h1)
    engine.wait(h2)
    comp1, trace1 = engine.get_completion(h1)
    comp2, trace2 = engine.get_completion(h2)
    assert comp1.ok is True
    assert comp2.ok is True
    assert trace1["total_ns"] > 0
    assert trace2["total_ns"] > 0


# ── 8. kernel launch ───────────────────────────────────────────────


def test_engine_kernel_launch_simplified():
    """KernelLaunch returns latency > 0."""
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    def gemm_kernel(a_ptr, tl):
        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
        tl.store(a_ptr, a)

    clear_registry()
    try:
        register_kernel("gemm", gemm_kernel)
        engine = _engine()
        shard0 = TensorArgShard(
            sip=0,
            cube=0,
            pe=0,
            pa=_hbm_pa(pe_id=0),
            nbytes=4096,
            offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0,
            cube=0,
            pe=1,
            pa=_hbm_pa(pe_id=1),
            nbytes=4096,
            offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0",
            request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        comp, trace = _run(engine, msg)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Always reset the registry so a failure here cannot leak the
        # "gemm" kernel into other tests.
        clear_registry()


# ── 9. deterministic ───────────────────────────────────────────────


def test_engine_deterministic():
    """Same request on two engines must produce identical latency."""
    msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r11",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    # The very same message object is replayed on two independent engines.
    _, t1 = _run(_engine(), msg)
    _, t2 = _run(_engine(), msg)
    assert t1["total_ns"] == t2["total_ns"]


# ── 10. DMA capacity serializes concurrent writes ──────────────────


def test_dma_capacity_serializes_concurrent():
    """Two concurrent DMA writes to the same cube must contend at DMA capacity=1.

    When two MemoryWrite requests target the same cube's M_CPU simultaneously,
    the DMA engine (capacity=1) serializes them. The slower request must take
    longer than a single isolated request (ADR-0014 D4, ADR-0015 D5).
    """
    # Single isolated write baseline
    msg_single = MemoryWriteMsg(
        correlation_id="c0",
        request_id="single",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )
    _, t1 = _run(_engine(), msg_single)
    single_ns = t1["total_ns"]

    # Two concurrent writes to same cube (different PEs) → DMA contention
    engine_conc = _engine()
    msg_a = MemoryWriteMsg(
        correlation_id="c0",
        request_id="conc-a",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
        pattern="zero",
        target_pe=0,
    )
    msg_b = MemoryWriteMsg(
        correlation_id="c0",
        request_id="conc-b",
        dst_sip=0,
        dst_cube=0,
        dst_pe=1,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=1),
        nbytes=4096,
        pattern="zero",
        target_pe=1,
    )
    ha = engine_conc.submit(msg_a)
    hb = engine_conc.submit(msg_b)
    engine_conc.wait(ha)
    engine_conc.wait(hb)
    _, ta = engine_conc.get_completion(ha)
    _, tb = engine_conc.get_completion(hb)

    # At least one must be delayed by DMA contention
    max_ns = max(ta["total_ns"], tb["total_ns"])
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )


# ── 11. formula latency lower bound ────────────────────────────────


def test_formula_latency_lower_bound():
    """_formula_latency must be <= actual latency (ADR-0015 D7).

    Uses PE DMA path which is fully known at engine level.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    # The engine, resolver, and router must all share one graph instance.
    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    dst_node = resolver.resolve(PhysAddr.decode(pa))
    pe_ref = "sip0.cube0.pe0"
    path = router.find_path(pe_ref, dst_node)
    formula = engine._formula_latency(path, 4096)

    # Run actual simulation
    # NOTE(review): the request is a MemoryReadMsg while the formula path
    # starts at pe0 — confirm both describe the same route.
    msg = MemoryReadMsg(
        correlation_id="c0",
        request_id="formula-lb",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=pa,
        nbytes=4096,
        target_pe=1,
    )
    _, trace = _run(engine, msg)
    actual = trace["total_ns"]

    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"


# ── 12. formula latency exact without contention ───────────────────


def test_formula_latency_exact_no_contention():
    """With no contention, formula should approximate actual for PE DMA.

    PE DMA is single-request with no fan-out or aggregation, so
    formula ≈ actual (within small tolerance for SimPy scheduling).
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.runtime_api.kernel import PeDmaMsg

    # The engine, resolver, and router must all share one graph instance.
    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    dst_node = resolver.resolve(PhysAddr.decode(pa))
    pe_ref = "sip0.cube0.pe0"
    path = router.find_path(pe_ref, dst_node)
    formula = engine._formula_latency(path, 4096)

    msg = PeDmaMsg(
        correlation_id="c0",
        request_id="formula-exact",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=pa,
        nbytes=4096,
    )
    _, trace = _run(engine, msg)
    actual = trace["total_ns"]

    # No contention: formula should equal actual
    assert abs(formula - actual) < 0.01, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )


# ── 13. remote cube access succeeds with higher latency ────────────


def test_engine_remote_cube_latency_higher():
    """Accessing a distant cube's HBM must have strictly higher latency than local.

    Uses separate engines to avoid contention effects.
    cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops
    from IO chiplet compared to cube0 (directly connected).
    """
    msg_local = MemoryReadMsg(
        correlation_id="c0",
        request_id="r14a",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(sip=0, cube=0, pe_id=0),
        nbytes=4096,
    )
    msg_remote = MemoryReadMsg(
        correlation_id="c0",
        request_id="r14b",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(sip=0, cube=15, pe_id=0),
        nbytes=4096,
    )

    _, t_local = _run(_engine(), msg_local)
    comp_remote, t_remote = _run(_engine(), msg_remote)

    assert comp_remote.ok is True
    assert t_remote is not None and t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )