"""Tests for IOChiplet NOC + D2H (combined #3+#4). Validates: - IOChiplet topology: io_noc, io_ucie PHY, conn nodes - H2D MemoryWrite: data flows pcie_ep → io_noc → cube → hbm (m_cpu bypass) - D2H MemoryRead: data flows hbm → cube → io_noc → pcie_ep (host drain) - KernelLaunch: still routes through m_cpu → PE - Latency invariants preserved """ from pathlib import Path from kernbench.policy.address.phyaddr import PhysAddr from kernbench.policy.routing.router import AddressResolver, PathRouter from kernbench.runtime_api.kernel import ( KernelLaunchMsg, KernelRef, MemoryReadMsg, MemoryWriteMsg, TensorArg, TensorArgShard, ) from kernbench.sim_engine.engine import GraphEngine from kernbench.topology.builder import load_topology TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" def _graph(): return load_topology(TOPOLOGY_PATH) def _engine(): return GraphEngine(_graph()) def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() # ══════════════════════════════════════════════════════════════════ # 1. IOChiplet Topology Structure # ══════════════════════════════════════════════════════════════════ def test_io_chiplet_has_noc_node(): """Each IOChiplet instance must have an io_noc node.""" graph = _graph() assert "sip0.io0.noc" in graph.nodes, "io_noc node missing" node = graph.nodes["sip0.io0.noc"] assert node.kind == "io_noc" def test_io_chiplet_has_ucie_phy_nodes(): """Each IOChiplet PHY must exist as a separate node.""" graph = _graph() for phy in ["P0", "P1", "P2", "P3"]: node_id = f"sip0.io0.ucie-{phy}" assert node_id in graph.nodes, f"io_ucie PHY node {node_id} missing" def test_io_chiplet_has_conn_nodes(): """Each IOChiplet PHY must have conn nodes (NOC ↔ conn ↔ io_ucie pattern).""" graph = _graph() for phy in ["P0", "P1", "P2", "P3"]: for ci in range(4): # n_connections=4 conn_id = f"sip0.io0.ucie-{phy}.conn{ci}" assert conn_id in graph.nodes, f"conn node {conn_id} missing" def test_io_noc_connects_to_pcie_ep(): """pcie_ep must connect to io_noc (bidirectional).""" graph = _graph() edge_set = {(e.src, e.dst) for e in graph.edges} assert ("sip0.io0.pcie_ep", "sip0.io0.noc") in edge_set assert ("sip0.io0.noc", "sip0.io0.pcie_ep") in edge_set def test_io_noc_connects_to_io_cpu(): """io_cpu must connect to io_noc (bidirectional).""" graph = _graph() edge_set = {(e.src, e.dst) for e in graph.edges} assert ("sip0.io0.io_cpu", "sip0.io0.noc") in edge_set assert ("sip0.io0.noc", "sip0.io0.io_cpu") in edge_set def test_io_noc_connects_to_conn_nodes(): """io_noc must connect to conn nodes (per PHY, bidirectional).""" graph = _graph() edge_set = {(e.src, e.dst) for e in graph.edges} conn_id = "sip0.io0.ucie-P0.conn0" assert ("sip0.io0.noc", conn_id) in edge_set assert (conn_id, "sip0.io0.noc") in edge_set def test_no_direct_io_cpu_to_cube_edges(): """io_cpu must NOT have direct edges to cube UCIe ports (replaced by io_noc path).""" graph = _graph() for e in graph.edges: if e.src == "sip0.io0.io_cpu" and "cube" in e.dst: raise AssertionError( f"Direct io_cpu→cube edge found: {e.src} → {e.dst}. " f"All cube traffic should route through io_noc." ) # ══════════════════════════════════════════════════════════════════ # 2. H2D MemoryWrite (data path: pcie_ep → io_noc → cube → hbm) # ══════════════════════════════════════════════════════════════════ def test_h2d_write_completes(): """H2D MemoryWrite must complete with ok=True and positive latency.""" engine = _engine() msg = MemoryWriteMsg( correlation_id="noc", request_id="h2d-w", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True assert trace["total_ns"] > 0 def test_h2d_write_cross_cube_completes(): """H2D MemoryWrite to remote cube must complete.""" engine = _engine() msg = MemoryWriteMsg( correlation_id="noc", request_id="h2d-remote", dst_sip=0, dst_cube=4, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=4, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True assert trace["total_ns"] > 0 def test_h2d_write_deterministic(): """Same H2D write on two engines must produce identical latency.""" msg = MemoryWriteMsg( correlation_id="noc", request_id="h2d-det", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) e1, e2 = _engine(), _engine() h1 = e1.submit(msg) e1.wait(h1) _, t1 = e1.get_completion(h1) h2 = e2.submit(msg) e2.wait(h2) _, t2 = e2.get_completion(h2) assert t1["total_ns"] == t2["total_ns"] # ══════════════════════════════════════════════════════════════════ # 3. D2H MemoryRead (data path: hbm → cube → io_noc → pcie_ep) # ══════════════════════════════════════════════════════════════════ def test_d2h_read_completes(): """D2H MemoryRead must complete with ok=True and positive latency.""" engine = _engine() msg = MemoryReadMsg( correlation_id="noc", request_id="d2h-r", src_sip=0, src_cube=0, src_pe=0, src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True assert trace["total_ns"] > 0 def test_d2h_read_includes_host_drain(): """D2H MemoryRead latency must include host-side drain. Read data (nbytes>0) flows hbm → ... → pcie_ep with BW occupancy. D2H should take longer than H2D for the same address because: - H2D: data pcie_ep→hbm (forward only) - D2H: command pcie_ep→hbm (forward) + data hbm→pcie_ep (reverse, with nbytes) """ engine_w = _engine() msg_w = MemoryWriteMsg( correlation_id="noc", request_id="drain-w", dst_sip=0, dst_cube=0, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) hw = engine_w.submit(msg_w) engine_w.wait(hw) _, tw = engine_w.get_completion(hw) engine_r = _engine() msg_r = MemoryReadMsg( correlation_id="noc", request_id="drain-r", src_sip=0, src_cube=0, src_pe=0, src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, ) hr = engine_r.submit(msg_r) engine_r.wait(hr) _, tr = engine_r.get_completion(hr) # D2H read should include reverse data path + host drain # so it should be >= H2D write latency assert tr["total_ns"] >= tw["total_ns"] * 0.8, ( f"D2H read ({tr['total_ns']:.2f}ns) should be comparable to or " f"greater than H2D write ({tw['total_ns']:.2f}ns) due to host drain" ) # ══════════════════════════════════════════════════════════════════ # 4. KernelLaunch (still routes through m_cpu) # ══════════════════════════════════════════════════════════════════ def test_kernel_launch_still_works(): """KernelLaunch must still complete via m_cpu → PE path.""" from kernbench.triton_emu.registry import clear_registry, register_kernel clear_registry() def gemm_kernel(a_ptr, tl): a = tl.load(a_ptr, shape=(4, 4), dtype="f16") tl.store(a_ptr, a) register_kernel("gemm", gemm_kernel) engine = _engine() shard0 = TensorArgShard( sip=0, cube=0, pe=0, pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0, ) msg = KernelLaunchMsg( correlation_id="noc", request_id="kern", kernel_ref=KernelRef(name="gemm", kind="builtin"), args=(TensorArg(shards=(shard0,)),), ) h = engine.submit(msg) engine.wait(h) comp, trace = engine.get_completion(h) assert comp.ok is True assert trace["total_ns"] > 0 clear_registry() # ══════════════════════════════════════════════════════════════════ # 5. Latency Invariants # ══════════════════════════════════════════════════════════════════ def test_h2d_latency_monotonicity(): """H2D write: closer cube = lower latency (1hop < 2hop < 3hop).""" cubes = [0, 4, 8] latencies = [] for cube in cubes: engine = _engine() msg = MemoryWriteMsg( correlation_id="noc", request_id=f"mono-c{cube}", dst_sip=0, dst_cube=cube, dst_pe=0, dst_pa=_hbm_pa(sip=0, cube=cube, pe_id=0), nbytes=4096, pattern="zero", target_pe=0, ) h = engine.submit(msg) engine.wait(h) _, t = engine.get_completion(h) latencies.append(t["total_ns"]) for i in range(len(latencies) - 1): assert latencies[i] < latencies[i + 1], ( f"Monotonicity: cube{cubes[i]}({latencies[i]:.2f}) " f"must < cube{cubes[i+1]}({latencies[i+1]:.2f})" ) def test_h2d_path_includes_io_noc(): """H2D path from pcie_ep to hbm must traverse io_noc.""" graph = _graph() resolver = AddressResolver(graph) router = PathRouter(graph) pcie_ep = resolver.find_pcie_ep(0) pa = _hbm_pa(sip=0, cube=0, pe_id=0) hbm_target = resolver.resolve(PhysAddr.decode(pa)) path = router.find_memory_path(pcie_ep, hbm_target) assert "sip0.io0.noc" in path, ( f"H2D path must include io_noc. Path: {path}" ) def test_h2d_path_excludes_m_cpu(): """H2D MemoryWrite path must NOT include m_cpu (direct to hbm).""" graph = _graph() resolver = AddressResolver(graph) router = PathRouter(graph) pcie_ep = resolver.find_pcie_ep(0) pa = _hbm_pa(sip=0, cube=0, pe_id=0) hbm_target = resolver.resolve(PhysAddr.decode(pa)) path = router.find_memory_path(pcie_ep, hbm_target) m_cpu_nodes = [n for n in path if "m_cpu" in n] assert len(m_cpu_nodes) == 0, ( f"H2D MemoryWrite path must not include m_cpu. " f"Path: {path}, m_cpu nodes: {m_cpu_nodes}" )