commit - release 1
This commit is contained in:
@@ -0,0 +1,405 @@
|
||||
import pytest
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from kernbench.common.types import Completion, RequestHandle
|
||||
from kernbench.policy.address.phyaddr import PhysAddr
|
||||
from kernbench.runtime_api.kernel import (
|
||||
KernelLaunchMsg,
|
||||
KernelRef,
|
||||
MemoryReadMsg,
|
||||
MemoryWriteMsg,
|
||||
ScalarArg,
|
||||
TensorArg,
|
||||
TensorArgShard,
|
||||
)
|
||||
from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import load_topology
|
||||
|
||||
# Topology description used by every test in this module: "topology.yaml"
# in the parent of the directory that contains this file.
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
|
||||
|
||||
def _engine():
    """Build a fresh GraphEngine from the topology loaded at TOPOLOGY_PATH."""
    return GraphEngine(load_topology(TOPOLOGY_PATH))
|
||||
|
||||
|
||||
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return an encoded HBM physical address inside one PE's HBM slice.

    The address is built for rack 0 at a fixed 0x1000 offset into the slice
    selected by ``sip`` / ``cube`` / ``pe_id``.
    """
    # 48 GB shared by 8 slices -> 6 GB per slice.
    total_hbm_bytes = 48 * (1 << 30)
    per_slice_bytes = total_hbm_bytes // 8
    return PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    ).encode()
|
||||
|
||||
|
||||
def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Return an encoded cube-SRAM physical address (rack 0, offset 0x800)."""
    return PhysAddr.cube_sram_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        sram_offset=0x800,
    ).encode()
|
||||
|
||||
|
||||
# ── 1. submit returns handle ────────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_submit_returns_handle():
    """submit() must hand back a RequestHandle, i.e. a non-empty string."""
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r0",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    assert isinstance(handle, str)
    assert handle != ""
|
||||
|
||||
|
||||
# ── 2. memory write completion ──────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_memory_write_completion():
    """A MemoryWrite request must finish and report ok=True."""
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r1",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    engine.wait(handle)
    completion, _trace = engine.get_completion(handle)
    assert completion.ok is True
|
||||
|
||||
|
||||
# ── 3. memory read completion ───────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_memory_read_completion():
    """A MemoryRead request must finish and report ok=True."""
    engine = _engine()
    handle = engine.submit(
        MemoryReadMsg(
            correlation_id="c0",
            request_id="r2",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=_hbm_pa(),
            nbytes=4096,
        )
    )
    engine.wait(handle)
    completion, _trace = engine.get_completion(handle)
    assert completion.ok is True
|
||||
|
||||
|
||||
# ── 4. latency positive ────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_latency_positive():
    """The trace's total_ns must be strictly positive (ADR-0002 D4)."""
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r3",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)
    assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
# ── 5. trace has total_ns and nbytes ───────────────────────────────
|
||||
|
||||
|
||||
def test_engine_trace_has_total_ns_and_nbytes():
    """The trace must expose 'total_ns' and 'nbytes', with nbytes echoed back."""
    payload_bytes = 4096
    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r4",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=payload_bytes,
            pattern="zero",
        )
    )
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)
    for required_key in ("total_ns", "nbytes"):
        assert required_key in trace
    assert trace["nbytes"] == payload_bytes
|
||||
|
||||
|
||||
# ── 6. latency includes node overhead_ns ────────────────────────────
|
||||
|
||||
|
||||
def test_engine_latency_includes_node_overhead_ns():
    """Total latency must cover the overhead_ns of the components on the path."""
    # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
    min_overhead_ns = 5.0 + 10.0 + 5.0

    engine = _engine()
    handle = engine.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r7",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)
    assert trace["total_ns"] >= min_overhead_ns
|
||||
|
||||
|
||||
# ── 7. concurrent requests ─────────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_concurrent_requests():
    """Two writes submitted back-to-back must both complete and carry traces."""
    engine = _engine()
    first = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r9a",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    second = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r9b",
        dst_sip=0,
        dst_cube=0,
        dst_pe=1,
        dst_pa=_hbm_pa(pe_id=1),
        nbytes=4096,
        pattern="zero",
    )

    # Submit everything before waiting so both requests are in flight together.
    handles = [engine.submit(request) for request in (first, second)]
    for handle in handles:
        engine.wait(handle)
    results = [engine.get_completion(handle) for handle in handles]

    for completion, _trace in results:
        assert completion.ok is True
    for _completion, trace in results:
        assert trace["total_ns"] > 0
|
||||
|
||||
|
||||
# ── 8. kernel launch ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_kernel_launch_simplified():
    """KernelLaunch must complete with ok=True and a latency > 0.

    A trivial load/store kernel is registered under the name "gemm" and
    launched over one tensor argument split into two shards (PE0 and PE1).
    The kernel registry is cleared before the test and, via ``finally``,
    after it, so a failing assertion cannot leak the kernel into other tests.
    """
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    def gemm_kernel(a_ptr, tl):
        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
        tl.store(a_ptr, a)

    clear_registry()
    try:
        register_kernel("gemm", gemm_kernel)

        engine = _engine()
        shard0 = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0, cube=0, pe=1,
            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0", request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        h = engine.submit(msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Always restore a clean registry, even when the test fails.
        clear_registry()
|
||||
|
||||
|
||||
# ── 9. deterministic ───────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_engine_deterministic():
    """The same request on two fresh engines must yield identical latency."""
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r11",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )

    def run_on_fresh_engine():
        """Submit the shared request to a new engine and return its trace."""
        engine = _engine()
        handle = engine.submit(request)
        engine.wait(handle)
        _completion, trace = engine.get_completion(handle)
        return trace

    first_trace = run_on_fresh_engine()
    second_trace = run_on_fresh_engine()

    assert first_trace["total_ns"] == second_trace["total_ns"]
|
||||
|
||||
|
||||
# ── 10. DMA capacity serializes concurrent writes ──────────────────
|
||||
|
||||
|
||||
def test_dma_capacity_serializes_concurrent():
    """Concurrent DMA writes into one cube must contend at DMA capacity=1.

    Two MemoryWrite requests aimed at the same cube's M_CPU at the same time
    are serialized by the DMA engine (capacity=1), so the slower of the two
    must take longer than one isolated request (ADR-0014 D4, ADR-0015 D5).
    """

    def write_to_pe(request_id, pe):
        """Build a 4096-byte zero-pattern write to the given PE of sip0/cube0."""
        return MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Baseline: one write alone on its own engine.
    baseline_engine = _engine()
    baseline_handle = baseline_engine.submit(write_to_pe("single", 0))
    baseline_engine.wait(baseline_handle)
    _, baseline_trace = baseline_engine.get_completion(baseline_handle)
    single_ns = baseline_trace["total_ns"]

    # Contended case: two writes to the same cube, different PEs, in flight
    # together on a second engine.
    shared_engine = _engine()
    request_a = write_to_pe("conc-a", 0)
    request_b = write_to_pe("conc-b", 1)
    handle_a = shared_engine.submit(request_a)
    handle_b = shared_engine.submit(request_b)
    shared_engine.wait(handle_a)
    shared_engine.wait(handle_b)
    _, trace_a = shared_engine.get_completion(handle_a)
    _, trace_b = shared_engine.get_completion(handle_b)

    # The slower of the pair has to show the contention delay.
    max_ns = max(trace_a["total_ns"], trace_b["total_ns"])
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )
|
||||
|
||||
|
||||
# ── 11. formula latency lower bound ──────────────────────────────
|
||||
|
||||
|
||||
def test_formula_latency_lower_bound():
    """_formula_latency must be <= actual latency (ADR-0015 D7).

    Uses PE DMA path which is fully known at engine level: the formula is
    evaluated on the routed path from ``sip0.cube0.pe0`` to the node that the
    resolver maps PE1's HBM address to, then compared with the simulated
    latency of a 4096-byte read of that same address.
    """
    # PhysAddr and load_topology are already imported at module level; only
    # the routing helpers need a local import.
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    dst_node = resolver.resolve(PhysAddr.decode(pa))
    pe_ref = "sip0.cube0.pe0"
    path = router.find_path(pe_ref, dst_node)
    formula = engine._formula_latency(path, 4096)

    # Run actual simulation.
    # NOTE(review): the docstring speaks of a PE DMA path, but the request
    # submitted here is a MemoryReadMsg with target_pe=1 — confirm it takes
    # the same path the formula was evaluated on.
    msg = MemoryReadMsg(
        correlation_id="c0", request_id="formula-lb",
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=pa, nbytes=4096, target_pe=1,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)
    actual = trace["total_ns"]

    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"
|
||||
|
||||
|
||||
def test_formula_latency_exact_no_contention():
    """With no contention, formula should approximate actual for PE DMA.

    PE DMA is single-request with no fan-out or aggregation,
    so formula ≈ actual (within small tolerance for SimPy scheduling).
    The formula is evaluated on the routed path from ``sip0.cube0.pe0`` to the
    node resolved for PE0's HBM address; the simulation submits one PeDmaMsg
    to that address on an otherwise idle engine.
    """
    # PhysAddr and load_topology are already imported at module level; only
    # PeDmaMsg and the routing helpers need a local import.
    from kernbench.runtime_api.kernel import PeDmaMsg
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    dst_node = resolver.resolve(PhysAddr.decode(pa))
    pe_ref = "sip0.cube0.pe0"
    path = router.find_path(pe_ref, dst_node)
    formula = engine._formula_latency(path, 4096)

    msg = PeDmaMsg(
        correlation_id="c0", request_id="formula-exact",
        src_sip=0, src_cube=0, src_pe=0,
        dst_pa=pa, nbytes=4096,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)
    actual = trace["total_ns"]

    # No contention: formula should equal actual (0.01 ns absolute tolerance).
    assert abs(formula - actual) < 0.01, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )
|
||||
|
||||
|
||||
# ── 12. remote cube access succeeds with higher latency ────────────
|
||||
|
||||
|
||||
def test_engine_remote_cube_latency_higher():
    """Reading a distant cube's HBM must be strictly slower than a local read.

    Each read runs on its own engine so the two cannot contend.
    cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops
    from IO chiplet compared to cube0 (directly connected).
    """

    def read_from_cube(request_id, cube):
        """Build a 4096-byte read of PE0's HBM slice in the given cube of sip0."""
        return MemoryReadMsg(
            correlation_id="c0",
            request_id=request_id,
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
        )

    local_engine = _engine()
    remote_engine = _engine()
    local_request = read_from_cube("r14a", 0)
    remote_request = read_from_cube("r14b", 15)

    local_handle = local_engine.submit(local_request)
    local_engine.wait(local_handle)
    _, t_local = local_engine.get_completion(local_handle)

    remote_handle = remote_engine.submit(remote_request)
    remote_engine.wait(remote_handle)
    remote_completion, t_remote = remote_engine.get_completion(remote_handle)

    assert remote_completion.ok is True
    assert t_remote is not None and t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )
|
||||
|
||||
|
||||
Reference in New Issue
Block a user