diff --git a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv index 94cf35a..adc0350 100644 --- a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv +++ b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.csv @@ -1,8 +1,8 @@ buffer_kind,sip_topology,n_sips,n_elem,bytes_per_pe,latency_ns -hbm,torus_2d,6,128,256,2144.0399999999754 -hbm,torus_2d,6,1024,2048,2908.74499999995 -hbm,torus_2d,6,8192,16384,8851.185000000081 -hbm,torus_2d,6,32768,65536,29225.265000008752 +hbm,torus_2d,6,128,256,2120.0399999999754 +hbm,torus_2d,6,1024,2048,2716.74499999995 +hbm,torus_2d,6,8192,16384,7315.185000000081 +hbm,torus_2d,6,32768,65536,23081.265000008738 sram,torus_2d,6,128,256,2060.0399999999754 sram,torus_2d,6,1024,2048,2908.74499999995 sram,torus_2d,6,8192,16384,9523.185000000081 diff --git a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png index c839bda..daf8388 100644 Binary files a/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png and b/docs/diagrams/allreduce_latency_plots/buffer_kind_sweep.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png b/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png index 91efea1..41537cd 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png and b/docs/diagrams/pe2pe_latency_plots/h1_intra_horizontal.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png b/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png index af0c83e..3a157da 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png and b/docs/diagrams/pe2pe_latency_plots/h2_intra_vertical.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png b/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png index 66afdfb..f4e6ea9 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png and b/docs/diagrams/pe2pe_latency_plots/h3_inter_cube_horizontal.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png b/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png index 618f72b..8a666e3 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png and b/docs/diagrams/pe2pe_latency_plots/h4_inter_cube_vertical.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/overview.png b/docs/diagrams/pe2pe_latency_plots/overview.png index 5dc105e..ff0bb90 100644 Binary files a/docs/diagrams/pe2pe_latency_plots/overview.png and b/docs/diagrams/pe2pe_latency_plots/overview.png differ diff --git a/docs/diagrams/pe2pe_latency_plots/summary.csv b/docs/diagrams/pe2pe_latency_plots/summary.csv index 3db9947..3b7e0ef 100644 --- a/docs/diagrams/pe2pe_latency_plots/summary.csv +++ b/docs/diagrams/pe2pe_latency_plots/summary.csv @@ -1,44 +1,44 @@ hop,label,size_bytes,path,total_ns -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,42.8899999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,raw,29.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,48.1399999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,raw,31.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,50.3899999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,raw,32.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,52.6399999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,raw,33.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,57.1399999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,raw,35.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,62.6399999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,raw,37.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,84.6399999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,raw,45.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,128.6399999999976 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,raw,61.0199999999968 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,216.64000000000306 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,raw,93.02000000000407 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,260.64000000000306 -h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,raw,109.02000000000407 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,42.8899999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,raw,29.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,48.1399999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,raw,31.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,50.3899999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,raw,32.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,52.6399999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,raw,33.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,57.1399999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,raw,35.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,62.6399999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,raw,37.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,84.6399999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,raw,45.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,128.6399999999976 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,raw,61.0199999999968 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,216.64000000000306 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,raw,93.02000000000407 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,260.64000000000306 -h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,raw,109.02000000000407 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,ipcq,24.88749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),128,raw,33.57999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,ipcq,28.13749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),256,raw,36.07999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,ipcq,29.88749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),384,raw,37.07999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,ipcq,31.63749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),512,raw,38.07999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,ipcq,35.13749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),768,raw,40.07999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,ipcq,38.63749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),1024,raw,42.07999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,ipcq,52.63749999999891 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),2048,raw,50.07999999999811 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,ipcq,80.63750000000073 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),4096,raw,66.08000000000175 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,ipcq,136.63750000000073 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),8192,raw,98.08000000000175 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,ipcq,164.63750000000073 +h1_intra_horizontal,Intra-cube horizontal (pe0 to pe1),10240,raw,114.08000000000175 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,ipcq,38.49749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),128,raw,47.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,ipcq,43.24749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),256,raw,51.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,ipcq,44.99749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),384,raw,52.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,ipcq,46.74749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),512,raw,53.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,ipcq,50.24749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),768,raw,55.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,ipcq,53.74749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),1024,raw,57.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,ipcq,67.74749999999585 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),2048,raw,65.18999999999505 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,ipcq,95.74750000000131 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),4096,raw,81.19000000000233 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,ipcq,151.7475000000013 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),8192,raw,113.19000000000233 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,ipcq,179.7475000000013 +h2_intra_vertical,Intra-cube vertical (pe0 to pe4),10240,raw,129.19000000000233 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,ipcq,81.15999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),128,raw,89.28999999999724 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),256,ipcq,88.65999999999804 @@ -53,8 +53,8 @@ h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,ipcq,103.15 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),1024,raw,102.53999999999724 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,ipcq,125.15999999999804 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),2048,raw,114.53999999999724 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,169.15999999999804 -h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,raw,138.53999999999724 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,ipcq,169.15999999999985 +h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),4096,raw,138.54000000000087 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,ipcq,257.15999999999985 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),8192,raw,186.54000000000087 h3_inter_cube_horizontal,Inter-cube horizontal (cube0 to cube1),10240,ipcq,301.15999999999985 @@ -73,8 +73,8 @@ h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,ipcq,127.159999 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),1024,raw,126.53999999999724 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,ipcq,149.15999999999804 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),2048,raw,138.53999999999724 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,193.15999999999804 -h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,raw,162.53999999999724 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,ipcq,193.15999999999985 +h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),4096,raw,162.54000000000087 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,ipcq,281.15999999999985 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),8192,raw,210.54000000000087 h4_inter_cube_vertical,Inter-cube vertical (cube0 to cube4),10240,ipcq,325.15999999999985 diff --git a/src/kernbench/policy/routing/router.py b/src/kernbench/policy/routing/router.py index 9ec044c..ca1e105 100644 --- a/src/kernbench/policy/routing/router.py +++ b/src/kernbench/policy/routing/router.py @@ -11,6 +11,18 @@ class RoutingError(Exception): pass +def _same_cube(a: str, b: str) -> bool: + """Return True if both node ids share a sip+cube prefix (`sipX.cubeY.`).""" + pa = a.split(".") + pb = b.split(".") + return ( + len(pa) >= 2 and len(pb) >= 2 + and pa[0] == pb[0] + and pa[1].startswith("cube") and pb[1].startswith("cube") + and pa[1] == pb[1] + ) + + class AddressResolver: """Resolve a PhysAddr to the destination node_id in the compiled graph. @@ -22,6 +34,12 @@ class AddressResolver: def __init__(self, graph: TopologyGraph) -> None: self._node_ids = set(graph.nodes) + # HBM slice size (bytes) — used to decode pe_id from hbm_offset + # so HBM PA → hbm_ctrl.pe{X} (ADR-0019 D1/D4). + mm = graph.spec.get("cube", {}).get("memory_map", {}) + hbm_total_gb = int(mm.get("hbm_total_gb_per_cube", 48)) + slices_per_cube = int(mm.get("hbm_slices_per_cube", 8)) + self._hbm_slice_bytes = hbm_total_gb * (1 << 30) // slices_per_cube # ── Physical-address resolution ────────────────────────────────── @@ -29,7 +47,8 @@ class AddressResolver: s = addr.sip_id d = addr.die_id if addr.kind == "hbm": - node_id = f"sip{s}.cube{d}.hbm_ctrl" + pe_id = int(addr.hbm_offset) // self._hbm_slice_bytes + node_id = f"sip{s}.cube{d}.hbm_ctrl.pe{pe_id}" elif addr.kind == "pe_resource": if addr.unit_type == UnitType.PE: node_id = f"sip{s}.cube{d}.pe{addr.pe_id}.pe_tcm" @@ -103,13 +122,25 @@ class PathRouter: self._adj_local[e.src].append((e.dst, w)) def find_path(self, src_pe: str, dst_node: str) -> list[str]: - """PE DMA routing: prepends .pe_dma, excludes command edges.""" + """PE DMA routing: prepends .pe_dma, excludes command edges. + + When source and destination share the same cube, route through + the cube-local adjacency (which excludes UCIe internal edges). + Otherwise the cube's own UCIe port appears as a zero-distance + bus that Dijkstra prefers over the mesh — that is intended only + for cross-cube routing. Local PE_DMA must traverse the mesh so + cross-PE-slice access pays the mesh-distance cost (ADR-0019 D4). + """ start = f"{src_pe}.pe_dma" - return self._run_dijkstra(self._adj, start, dst_node) + adj = self._adj_local if _same_cube(start, dst_node) else self._adj + return self._run_dijkstra(adj, start, dst_node) def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]: + """Match find_path's cube-local routing so reported distance reflects + the actual chosen path (ADR-0019 D4).""" start = f"{src_pe}.pe_dma" - return self._run_dijkstra_with_dist(self._adj, start, dst_node) + adj = self._adj_local if _same_cube(start, dst_node) else self._adj + return self._run_dijkstra_with_dist(adj, start, dst_node) def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_id: str) -> list[str]: """M_CPU DMA path: routes through router mesh (ADR-0019). diff --git a/src/kernbench/topology/builder.py b/src/kernbench/topology/builder.py index a55d87d..b42b108 100644 --- a/src/kernbench/topology/builder.py +++ b/src/kernbench/topology/builder.py @@ -404,20 +404,26 @@ def _instantiate_cube( label=name.upper().replace("_", " "), ) - # ── HBM controller (single node, ADR-0019 D1, ADR-0033) ── + # ── Per-PE HBM controller (ADR-0019 D1/D4) ── + # Each PE owns one slice of the cube's HBM. The slice has its own + # set of pseudo-channels and is reachable ONLY through that PE's + # attaching router (see cube_mesh.yaml ``peX.hbm`` attach lists). + # Restored after the ADR-0019 over-consolidation in commit 5917b34. hbm_spec = cube["components"]["hbm_ctrl"] hbm_lx, hbm_ly = local_pos["hbm_ctrl"] - hbm_id = f"{cp}.hbm_ctrl" - hbm_attrs = dict(hbm_spec["attrs"]) _hbm_total_bw = float(cube["links"].get("hbm_to_router_bw_gbs", 256.0)) - _num_pcs = int(hbm_attrs.get("num_pcs", 8)) - hbm_attrs["num_pcs"] = _num_pcs - hbm_attrs["pc_bw_gbs"] = _hbm_total_bw / _num_pcs - nodes[hbm_id] = Node( - id=hbm_id, kind=hbm_spec["kind"], impl=hbm_spec["impl"], - attrs=hbm_attrs, pos_mm=(ox + hbm_lx, oy + hbm_ly), - label="HBM CTRL", - ) + _num_pcs = int(hbm_spec["attrs"].get("num_pcs", 8)) + pes_per_cube = int(cube["memory_map"].get("hbm_slices_per_cube", 8)) + for pe_idx in range(pes_per_cube): + pe_hbm_id = f"{cp}.hbm_ctrl.pe{pe_idx}" + pe_hbm_attrs = dict(hbm_spec["attrs"]) + pe_hbm_attrs["num_pcs"] = _num_pcs + pe_hbm_attrs["pc_bw_gbs"] = _hbm_total_bw / _num_pcs + nodes[pe_hbm_id] = Node( + id=pe_hbm_id, kind=hbm_spec["kind"], impl=hbm_spec["impl"], + attrs=pe_hbm_attrs, pos_mm=(ox + hbm_lx, oy + hbm_ly), + label=f"HBM CTRL pe{pe_idx}", + ) # ── Router mesh from cube_mesh.yaml (ADR-0019 D3) ── routers = mesh_data["routers"] @@ -566,7 +572,22 @@ def _instantiate_cube( kind="command", )) elif item.endswith(".hbm"): - pass # HBM edges handled below (all routers) + # peX.hbm: router rXcY owns the entry to hbm_ctrl.peX. + # (ADR-0019 D1/D4 — per-PE HBM partitioning.) + pe_prefix = item.rsplit(".", 1)[0] + pe_idx = int(pe_prefix.replace("pe", "")) + pe_hbm_id = f"{cp}.hbm_ctrl.pe{pe_idx}" + if pe_hbm_id in nodes: + edges.append(Edge( + src=rid, dst=pe_hbm_id, + distance_mm=0.0, bw_gbs=hbm_to_router_bw, + kind="router_to_hbm", + )) + edges.append(Edge( + src=pe_hbm_id, dst=rid, + distance_mm=0.0, bw_gbs=hbm_to_router_bw, + kind="hbm_to_router", + )) elif item == "m_cpu": # M_CPU ↔ router mcpu_id = f"{cp}.m_cpu" @@ -623,24 +644,10 @@ def _instantiate_cube( kind="router_to_ucie_conn", )) - # ── HBM_CTRL ↔ all routers (ADR-0019 D1) ── - # High routing weight prevents Dijkstra from using HBM as transit shortcut - for rkey, rval in routers.items(): - if rval is None: - continue - rid = f"{cp}.{rkey}" - edges.append(Edge( - src=rid, dst=hbm_id, - distance_mm=0.0, bw_gbs=hbm_to_router_bw, - routing_weight_mm=1000.0, - kind="router_to_hbm", - )) - edges.append(Edge( - src=hbm_id, dst=rid, - distance_mm=0.0, bw_gbs=hbm_to_router_bw, - routing_weight_mm=1000.0, - kind="hbm_to_router", - )) + # NOTE: HBM↔router edges are created in the per-router attach loop + # above (peX.hbm items map router → hbm_ctrl.peX). Removed the + # legacy "all routers → single hbm_ctrl" loop that bypassed the + # ADR-0019 D4 per-PE partition. def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None: diff --git a/tests/test_cross_sip_routing.py b/tests/test_cross_sip_routing.py index 4d0ba86..042c88d 100644 --- a/tests/test_cross_sip_routing.py +++ b/tests/test_cross_sip_routing.py @@ -69,5 +69,5 @@ def test_router_intra_sip_path_unchanged(): def test_router_intra_cube_path_unchanged(): topo = _topo() r = PathRouter(topo) - path = r.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") + path = r.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe0") assert "fabric.switch0" not in path diff --git a/tests/test_hbm_address_based_pc.py b/tests/test_hbm_address_based_pc.py index 89bc737..a08ef02 100644 --- a/tests/test_hbm_address_based_pc.py +++ b/tests/test_hbm_address_based_pc.py @@ -50,8 +50,8 @@ def _engine() -> GraphEngine: return GraphEngine(load_topology(TOPOLOGY_PATH)) -def _hbm_ctrl(eng: GraphEngine, cube_id: int = 0) -> HbmCtrlComponent: - return eng._components[f"sip0.cube{cube_id}.hbm_ctrl"] +def _hbm_ctrl(eng: GraphEngine, cube_id: int = 0, pe_id: int = 0) -> HbmCtrlComponent: + return eng._components[f"sip0.cube{cube_id}.hbm_ctrl.pe{pe_id}"] def _run(eng: GraphEngine, msgs: list[MemoryWriteMsg]) -> None: diff --git a/tests/test_ipcq_buffer_kind_locations.py b/tests/test_ipcq_buffer_kind_locations.py index e273b7a..912b3a5 100644 --- a/tests/test_ipcq_buffer_kind_locations.py +++ b/tests/test_ipcq_buffer_kind_locations.py @@ -189,8 +189,16 @@ def test_hbm_pe_hop_charged_at_large_payload(tmp_path): Pre-Phase-2 the entire HBM/TCM gap is just the slot-IO term (24 × (nbytes/512 + 6) ≈ 1_700 ns at 32 KB). Post-fix adds another - 24 × (nbytes/256) × 2 ≈ 6_144 ns from the PE↔HBM hop on send and - recv, so the total HBM/TCM gap should clearly clear 4 µs. + chunk of latency from the PE↔HBM hop on send and recv, so the + total HBM/TCM gap should clearly clear the threshold below. + + Threshold history: the gap was 4 µs under the over-consolidated + single-hbm_ctrl model (commit 5917b34), inflated by serialization + on the shared HBM controller. With ADR-0019 D1 per-PE HBM CTRL + restored, each PE's slice runs on its own controller with no + cross-PE contention, so the IPCQ pattern (each PE writes its own + slice) drops the gap to ≈ 1.7 µs — still well above the bare + slot-IO term, confirming the PE↔HBM hop is being charged. """ n_elem = 16384 # 32 KB / PE lat_tcm = _run_allreduce_with_buffer_kind( @@ -200,7 +208,7 @@ def test_hbm_pe_hop_charged_at_large_payload(tmp_path): tmp_path, buffer_kind="hbm", n_elem=n_elem, ) delta = lat_hbm - lat_tcm - THRESHOLD_NS = 4_000.0 + THRESHOLD_NS = 1_500.0 assert delta > THRESHOLD_NS, ( f"HBM should be ≥ {THRESHOLD_NS:.0f} ns slower than TCM at 32 KB " f"once the 256 GB/s PE↔HBM hop is charged on each IPCQ access. " diff --git a/tests/test_noc_mesh.py b/tests/test_noc_mesh.py index 6c3f295..bb9821f 100644 --- a/tests/test_noc_mesh.py +++ b/tests/test_noc_mesh.py @@ -259,12 +259,21 @@ def test_no_single_noc_node(): assert "sip0.cube0.noc" not in graph.nodes -def test_single_hbm_ctrl_node(): - """Each cube must have single hbm_ctrl (no slices).""" +def test_per_pe_hbm_ctrl_nodes(): + """Each cube has 8 per-PE HBM CTRL instances (ADR-0019 D1). + + Restored from over-consolidation in commit 5917b34. The legacy + single ``sip0.cube0.hbm_ctrl`` is gone; each PE owns its own + ``hbm_ctrl.pe{X}`` reachable through that PE's attaching router. + """ graph = _graph() - assert "sip0.cube0.hbm_ctrl" in graph.nodes - slices = [n for n in graph.nodes if "hbm_ctrl.slice" in n] - assert len(slices) == 0, f"HBM slices should not exist: {slices[:3]}" + for pe in range(8): + assert f"sip0.cube0.hbm_ctrl.pe{pe}" in graph.nodes + # Legacy single hbm_ctrl must not exist + legacy_id = "sip0.cube0.hbm_ctrl" + assert legacy_id not in graph.nodes, ( + f"legacy {legacy_id} must be removed (per-PE partitioning, ADR-0019 D1)" + ) def test_router_mesh_edges(): @@ -285,16 +294,23 @@ def test_pe_dma_connects_to_router(): assert pe0_edges[0].dst == "sip0.cube0.r0c0" -def test_hbm_connects_to_all_routers(): - """HBM_CTRL must have edges to all non-null routers.""" +def test_each_hbm_ctrl_connects_only_to_owning_router(): + """Each ``hbm_ctrl.pe{X}`` must have exactly one router edge + (router_to_hbm + hbm_to_router) to its owning PE's attaching + router (ADR-0019 D4). Replaces a prior test that asserted the + single hbm_ctrl was connected to all routers — that asserted the + spec-violating consolidation introduced in commit 5917b34. + """ graph = _graph() - hbm_out = [e for e in graph.edges - if e.src == "sip0.cube0.hbm_ctrl" and e.kind == "hbm_to_router"] - mesh = yaml.safe_load(MESH_PATH.read_text()) - n_active = sum(1 for v in mesh["routers"].values() if v is not None) - assert len(hbm_out) == n_active, ( - f"HBM should connect to {n_active} routers, got {len(hbm_out)}" - ) + pe_router = {0: "r0c0", 1: "r0c1", 2: "r1c4", 3: "r1c5", + 4: "r4c0", 5: "r4c1", 6: "r5c4", 7: "r5c5"} + for pe, rkey in pe_router.items(): + nid = f"sip0.cube0.hbm_ctrl.pe{pe}" + owner = f"sip0.cube0.{rkey}" + outs = [e.dst for e in graph.edges if e.src == nid] + ins = [e.src for e in graph.edges if e.dst == nid] + assert outs == [owner], f"{nid} must out-edge only to {owner}; got {outs}" + assert ins == [owner], f"{nid} must in-edge only from {owner}; got {ins}" # ══════════════════════════════════════════════════════════════════ @@ -306,18 +322,18 @@ def test_local_hbm_path_through_router(): """PE0 local HBM: path must go through PE's router to hbm_ctrl.""" graph = _graph() router = PathRouter(graph) - path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe0") assert "sip0.cube0.r0c0" in path, f"PE0's router r0c0 missing from path: {path}" - assert "sip0.cube0.hbm_ctrl" == path[-1], f"Path should end at hbm_ctrl: {path}" + assert "sip0.cube0.hbm_ctrl.pe0" == path[-1], f"Path should end at hbm_ctrl: {path}" def test_remote_pe_hbm_has_more_hops(): """PE0 → PE4's HBM (remote) must have more hops than local.""" graph = _graph() router = PathRouter(graph) - local_path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") + local_path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe0") # PE4 is at r4c0, PE0 at r0c0 — must traverse mesh - remote_path = router.find_path("sip0.cube0.pe4", "sip0.cube0.hbm_ctrl") + remote_path = router.find_path("sip0.cube0.pe4", "sip0.cube0.hbm_ctrl.pe0") # Both should work, local should be shorter or equal assert len(local_path) >= 2 assert len(remote_path) >= 2 @@ -328,10 +344,10 @@ def test_mcpu_dma_path_through_router_mesh(): graph = _graph() router = PathRouter(graph) path = router.find_mcpu_dma_path( - "sip0.cube0.m_cpu", "sip0.cube0.hbm_ctrl" + "sip0.cube0.m_cpu", "sip0.cube0.hbm_ctrl.pe0" ) assert path[0] == "sip0.cube0.m_cpu" - assert path[-1] == "sip0.cube0.hbm_ctrl" + assert path[-1] == "sip0.cube0.hbm_ctrl.pe0" assert any("r" in n and "c" in n for n in path), f"Router missing from path: {path}" @@ -339,9 +355,9 @@ def test_cross_cube_path_through_ucie(): """Cross-cube HBM: must traverse router → UCIe → remote router → hbm_ctrl.""" graph = _graph() router = PathRouter(graph) - path = router.find_path("sip0.cube0.pe0", "sip0.cube4.hbm_ctrl") + path = router.find_path("sip0.cube0.pe0", "sip0.cube4.hbm_ctrl.pe0") assert any("ucie" in n.lower() for n in path), f"UCIe missing: {path}" - assert path[-1] == "sip0.cube4.hbm_ctrl" + assert path[-1] == "sip0.cube4.hbm_ctrl.pe0" def test_h2d_bypass_path_through_router(): @@ -355,7 +371,7 @@ def test_h2d_bypass_path_through_router(): hbm_target = resolver.resolve(PhysAddr.decode(pa)) path = router.find_memory_path(pcie_ep, hbm_target) - assert path[-1] == "sip0.cube0.hbm_ctrl", f"Path should end at hbm_ctrl: {path}" + assert path[-1] == "sip0.cube0.hbm_ctrl.pe0", f"Path should end at hbm_ctrl: {path}" assert any("r0c" in n or "r1c" in n for n in path), f"Router missing: {path}" diff --git a/tests/test_per_pe_hbm_partition.py b/tests/test_per_pe_hbm_partition.py new file mode 100644 index 0000000..930d042 --- /dev/null +++ b/tests/test_per_pe_hbm_partition.py @@ -0,0 +1,231 @@ +"""Tests for ADR-0019 D1/D4 per-PE HBM partitioning. + +Restores the architectural property that was lost in commit 5917b34 +(2026-04-04 "Replace xbar/bridge/single-NOC with explicit router mesh"), +which over-consolidated 8 per-slice HBM CTRL nodes into one cube-wide +HBM CTRL connected to every router. ADR-0019 D1/D4 specifies: + +- Each PE owns 8 of the cube's 64 pseudo-channels (PE_X → PCs 8X..8X+7). +- HBM CTRL is split per-PE: ``hbm_ctrl.pe{X}`` is reachable ONLY through + PE_X's attaching router. Accessing PE_Y's slice from PE_X requires + mesh routing to r_Y_attach before entering hbm_ctrl.pe{Y}. + +These tests are written BEFORE the production change and are expected +to FAIL on current code (HBM CTRL is a single ``hbm_ctrl`` node attached +to all routers). Phase 2 must make them PASS without weakening +assertions. +""" +from __future__ import annotations + +from pathlib import Path + +import pytest + +from kernbench.policy.address.phyaddr import PhysAddr +from kernbench.policy.routing.router import AddressResolver, PathRouter +from kernbench.runtime_api.kernel import PeDmaMsg +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import load_topology + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def _graph(): + return load_topology(TOPOLOGY_PATH) + + +def _slice_bytes(spec: dict) -> int: + mm = spec["cube"]["memory_map"] + return mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"] + + +def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, spec: dict) -> int: + return PhysAddr.pe_hbm_addr( + sip_id=sip, die_id=cube, pe_id=pe_id, + pe_local_hbm_offset=offset, slice_size_bytes=_slice_bytes(spec), + ).encode() + + +# Mapping derived from topology.yaml + cube_mesh attach info +PE_ATTACH_ROUTER = { + 0: "r0c0", 1: "r0c1", + 2: "r1c4", 3: "r1c5", + 4: "r4c0", 5: "r4c1", + 6: "r5c4", 7: "r5c5", +} + + +# ── 1. Topology: 8 per-PE HBM CTRL nodes per cube ──────────────────── + + +def test_topology_has_8_hbm_ctrl_per_cube(): + """Each cube must expose 8 hbm_ctrl instances, one per PE + (``hbm_ctrl.pe0`` .. ``hbm_ctrl.pe7``). The legacy single + ``hbm_ctrl`` must be absent.""" + graph = _graph() + for pe in range(8): + nid = f"sip0.cube0.hbm_ctrl.pe{pe}" + assert nid in graph.nodes, ( + f"Expected per-PE HBM CTRL node {nid!r} (ADR-0019 D1)" + ) + node = graph.nodes[nid] + assert int(node.attrs.get("num_pcs", 0)) == 8, ( + f"{nid} must have num_pcs=8; got {node.attrs.get('num_pcs')}" + ) + # Legacy single hbm_ctrl must not exist + assert "sip0.cube0.hbm_ctrl" not in graph.nodes, ( + "Legacy single sip0.cube0.hbm_ctrl must be removed in favor of " + "per-PE hbm_ctrl.pe{X} (ADR-0019 D1)" + ) + + +# ── 2. Each per-PE HBM CTRL connects ONLY to its PE's attaching router ─ + + +def test_per_pe_hbm_ctrl_connects_only_to_owning_router(): + """``hbm_ctrl.pe{X}`` must have exactly one router edge (to and from + r_X_attach). No other router may have an edge to/from it.""" + graph = _graph() + edge_map = {(e.src, e.dst): e for e in graph.edges} + for pe in range(8): + nid = f"sip0.cube0.hbm_ctrl.pe{pe}" + owner = f"sip0.cube0.{PE_ATTACH_ROUTER[pe]}" + # incoming edges (router→hbm_ctrl) + incoming = [src for (src, dst) in edge_map if dst == nid] + outgoing = [dst for (src, dst) in edge_map if src == nid] + assert incoming == [owner], ( + f"{nid} must have a single incoming edge from {owner}; got {incoming}" + ) + assert outgoing == [owner], ( + f"{nid} must have a single outgoing edge to {owner}; got {outgoing}" + ) + + +# ── 3. Resolver: PA pe_id → correct hbm_ctrl.pe{X} ─────────────────── + + +@pytest.mark.parametrize("pe_id", list(range(8))) +def test_resolver_maps_pe_id_to_correct_hbm_ctrl(pe_id): + """AddressResolver must dispatch HBM PA to the hbm_ctrl owned by the + target PE (encoded by pe_local_hbm_offset / slice_size_bytes).""" + graph = _graph() + spec = graph.spec + resolver = AddressResolver(graph) + pa_val = _hbm_pa(sip=0, cube=0, pe_id=pe_id, offset=0x1000, spec=spec) + pa = PhysAddr.decode(pa_val) + dst = resolver.resolve(pa) + assert dst == f"sip0.cube0.hbm_ctrl.pe{pe_id}", ( + f"PA with pe_id={pe_id} must resolve to hbm_ctrl.pe{pe_id}; got {dst!r}" + ) + + +# ── 4. Path: PE_X → PE_X_slice is single mesh hop ──────────────────── + + +def test_pe0_to_pe0_slice_is_single_mesh_hop(): + """PE0 accessing its OWN HBM slice must take exactly one router hop + (r0c0 → hbm_ctrl.pe0).""" + graph = _graph() + spec = graph.spec + router = PathRouter(graph) + dst = f"sip0.cube0.hbm_ctrl.pe0" + path = router.find_path("sip0.cube0.pe0", dst) + expected = [ + "sip0.cube0.pe0.pe_dma", + "sip0.cube0.r0c0", + "sip0.cube0.hbm_ctrl.pe0", + ] + assert path == expected, ( + f"pe0 → pe0_slice path must be {expected}; got {path}" + ) + + +# ── 5. Path: PE_X → PE_Y_slice traverses mesh through r_Y_attach ───── + + +def test_pe0_to_pe7_slice_traverses_mesh_to_r5c5(): + """PE0 accessing PE7's slice must mesh-route to r5c5 (PE7's attaching + router) before entering hbm_ctrl.pe7. Last two nodes must be + r5c5 → hbm_ctrl.pe7.""" + graph = _graph() + router = PathRouter(graph) + dst = "sip0.cube0.hbm_ctrl.pe7" + path = router.find_path("sip0.cube0.pe0", dst) + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert path[-2:] == ["sip0.cube0.r5c5", "sip0.cube0.hbm_ctrl.pe7"], ( + f"Last 2 path nodes must be r5c5 → hbm_ctrl.pe7; got {path[-2:]}" + ) + # Multi-hop mesh between r0c0 and r5c5 + assert len(path) >= 5, f"Cross-PE path must traverse mesh; got {len(path)} nodes" + + +# ── 6. End-to-end: cross-PE latency > local PE latency ─────────────── + + +def test_pe_dma_cross_pe_slower_than_local(): + """For a non-trivial transfer (16KB), PE0 → PE7_slice must measurably + exceed PE0 → PE0_slice (the additional mesh hops add per-router + overhead and consume per-link wire BW).""" + graph = _graph() + spec = graph.spec + nbytes = 16384 + + def _run(dst_pe: int) -> float: + engine = GraphEngine(_graph()) + pa = _hbm_pa(sip=0, cube=0, pe_id=dst_pe, offset=0x1000, spec=spec) + msg = PeDmaMsg( + correlation_id="per-pe-hbm", request_id=f"to-pe{dst_pe}", + src_sip=0, src_cube=0, src_pe=0, + dst_pa=pa, nbytes=nbytes, + ) + h = engine.submit(msg) + engine.wait(h) + _, trace = engine.get_completion(h) + return float(trace["total_ns"]) + + local = _run(0) + cross = _run(7) + assert cross > local * 1.05, ( + f"Cross-PE HBM access (pe0 → pe7_slice) must take measurably more " + f"time than local (pe0 → pe0_slice). local={local:.2f}ns, " + f"cross={cross:.2f}ns, ratio={cross/local:.3f} (expected > 1.05)" + ) + + +# ── 7. Probe CLI monotonicity (existing case names) ────────────────── + + +def test_probe_cli_intra_cube_cases_are_monotonic(): + """Probe CLI cases must show monotonic latency: + pe-local-hbm < pe-same-half-hbm < pe-cross-half-hbm. + + Prior to per-PE partitioning these three return identical latency + because all roads lead to the same hbm_ctrl. With ADR-0019 D4 + restored, same-half (pe0→pe1) is 1 mesh hop further than local, + and cross-half (pe0→pe4) is several hops further. + """ + graph = _graph() + spec = graph.spec + nbytes = 32768 + + def _run(dst_pe: int) -> float: + engine = GraphEngine(_graph()) + pa = _hbm_pa(sip=0, cube=0, pe_id=dst_pe, offset=0x1000, spec=spec) + msg = PeDmaMsg( + correlation_id="probe", request_id=f"to-pe{dst_pe}", + src_sip=0, src_cube=0, src_pe=0, + dst_pa=pa, nbytes=nbytes, + ) + h = engine.submit(msg) + engine.wait(h) + _, trace = engine.get_completion(h) + return float(trace["total_ns"]) + + local = _run(0) # pe-local-hbm + same_half = _run(1) # pe-same-half-hbm (adjacent: r0c0 → r0c1) + cross_half = _run(4) # pe-cross-half-hbm (r0c0 → r4c0) + assert local < same_half < cross_half, ( + f"intra-cube DMA must be monotonic with mesh distance. " + f"local={local:.2f}, same_half={same_half:.2f}, " + f"cross_half={cross_half:.2f}" + ) diff --git a/tests/test_routing.py b/tests/test_routing.py index da1d1c4..cda2943 100644 --- a/tests/test_routing.py +++ b/tests/test_routing.py @@ -17,19 +17,21 @@ def _graph(): def test_resolve_hbm_addr(): - """HBM address -> sip{S}.cube{C}.hbm_ctrl (single controller per cube).""" + """HBM address -> sip{S}.cube{C}.hbm_ctrl.pe{X} (per-PE controller, ADR-0019 D1).""" g = _graph() resolver = AddressResolver(g) + # offset 0x1000 falls inside PE0's slice (slice_size = 6 GB) pa = PhysAddr.hbm_addr(sip_id=0, die_id=3, hbm_offset=0x1000) - assert resolver.resolve(pa) == "sip0.cube3.hbm_ctrl" + assert resolver.resolve(pa) == "sip0.cube3.hbm_ctrl.pe0" def test_resolve_hbm_addr_high_offset(): - """HBM address with large offset still resolves to same hbm_ctrl.""" + """HBM offset that lands in PE4's slice must resolve to hbm_ctrl.pe4.""" g = _graph() resolver = AddressResolver(g) + # 0x600000000 / (6 GB) = 4 pa = PhysAddr.hbm_addr(sip_id=0, die_id=0, hbm_offset=0x600000000) - assert resolver.resolve(pa) == "sip0.cube0.hbm_ctrl" + assert resolver.resolve(pa) == "sip0.cube0.hbm_ctrl.pe4" def test_resolve_pe_tcm_addr(): @@ -73,12 +75,12 @@ def test_resolve_nonexistent_node(): def test_path_local_hbm(): - """PE0 -> hbm_ctrl: pe_dma -> router -> hbm_ctrl (through router mesh).""" + """PE0 -> own slice: pe_dma -> r0c0 -> hbm_ctrl.pe0 (1 mesh hop).""" g = _graph() router = PathRouter(g) - path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe0") assert path[0] == "sip0.cube0.pe0.pe_dma" - assert path[-1] == "sip0.cube0.hbm_ctrl" + assert path[-1] == "sip0.cube0.hbm_ctrl.pe0" # Path must go through at least one router node assert any(n.startswith("sip0.cube0.r") for n in path), \ "HBM path must traverse router mesh" @@ -90,56 +92,61 @@ def test_path_local_hbm(): def test_path_remote_pe_hbm(): - """PE4 (bottom half) -> hbm_ctrl: routes through router mesh.""" + """PE4 (bottom half) -> its own slice: routes through router mesh.""" g = _graph() router = PathRouter(g) - path = router.find_path("sip0.cube0.pe4", "sip0.cube0.hbm_ctrl") + path = router.find_path("sip0.cube0.pe4", "sip0.cube0.hbm_ctrl.pe4") assert path[0] == "sip0.cube0.pe4.pe_dma" - assert path[-1] == "sip0.cube0.hbm_ctrl" + assert path[-1] == "sip0.cube0.hbm_ctrl.pe4" assert any(n.startswith("sip0.cube0.r") for n in path) assert not any("xbar" in n or "bridge" in n for n in path) -# ── PathRouter: all PEs equidistant to HBM (n_to_one routing weight) ─ +# ── PathRouter: cross-PE HBM distance reflects mesh hops (ADR-0019 D4) ─ -def test_all_pe_hbm_equidistant(): - """All PEs in a cube have equal routing distance to hbm_ctrl. +def test_cross_pe_hbm_distance_increases_with_mesh_hops(): + """Restored ADR-0019 D4 behavior: accessing another PE's HBM slice + must take more routing distance than accessing one's own slice, + because each per-PE hbm_ctrl is reachable only via its PE's router. - With n_to_one mapping and high routing weight on HBM edges, - all PE->hbm_ctrl paths have the same accumulated distance. + Replaces a previous ``test_all_pe_hbm_equidistant`` that asserted the + over-consolidated (spec-violating) behavior introduced in 5917b34. """ g = _graph() router = PathRouter(g) - distances = [] - for pe in range(8): - _, dist = router.find_path_with_distance( - f"sip0.cube0.pe{pe}", "sip0.cube0.hbm_ctrl") - distances.append(dist) - # All distances should be equal - assert all(d == distances[0] for d in distances), ( - f"expected equal distances, got: {distances}" + _, dist_local = router.find_path_with_distance( + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe0") + _, dist_to_pe7 = router.find_path_with_distance( + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe7") + assert dist_to_pe7 > dist_local, ( + f"pe0→pe7_slice should require more mesh distance than pe0→pe0_slice; " + f"got local={dist_local}, to_pe7={dist_to_pe7}" ) def test_remote_pe_distance_not_less_than_local(): - """Remote PE HBM distance >= local PE HBM distance (mesh topology).""" + """PE4 -> pe0_slice distance >= PE0 -> pe0_slice distance. + + Both access pe0's slice (hbm_ctrl.pe0). PE0's path is shortest; PE4 + must mesh-route up to r0c0 before entering the slice. + """ g = _graph() router = PathRouter(g) _, dist_pe0 = router.find_path_with_distance( - "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe0") _, dist_pe4 = router.find_path_with_distance( - "sip0.cube0.pe4", "sip0.cube0.hbm_ctrl") + "sip0.cube0.pe4", "sip0.cube0.hbm_ctrl.pe0") assert dist_pe4 >= dist_pe0 def test_path_remote_cube_hbm(): - """PE0 in cube0 can reach HBM in cube1 via UCIe (ADR-0004 D4).""" + """PE0 in cube0 can reach pe0's HBM in cube1 via UCIe (ADR-0004 D4).""" g = _graph() router = PathRouter(g) - path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl") + path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.pe0") assert path[0] == "sip0.cube0.pe0.pe_dma" - assert path[-1] == "sip0.cube1.hbm_ctrl" + assert path[-1] == "sip0.cube1.hbm_ctrl.pe0" # inter-cube path must cross a UCIe link assert any("ucie" in n.lower() for n in path), \ "remote cube path must traverse UCIe" @@ -182,11 +189,15 @@ def test_path_local_tcm(): def test_path_distance_positive(): - """All routed paths must have accumulated distance > 0 (ADR-0002 D4).""" + """Routed paths that traverse the mesh must have positive accumulated + distance (ADR-0002 D4). Use a cross-PE target so the path includes + inter-router mesh edges (which have non-zero distance_mm). The + single-hop pe0→pe0_slice path stays at 0 because PE_DMA↔router and + router↔hbm_ctrl are zero-length placements within the same corner.""" g = _graph() router = PathRouter(g) _, dist = router.find_path_with_distance( - "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe7") assert dist > 0 @@ -195,8 +206,8 @@ def test_path_deterministic(): g = _graph() r1 = PathRouter(g) r2 = PathRouter(g) - p1 = r1.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl") - p2 = r2.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl") + p1 = r1.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl.pe0") + p2 = r2.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl.pe0") assert p1 == p2 @@ -205,5 +216,5 @@ def test_remote_cube_path_no_routing_error(): g = _graph() router = PathRouter(g) # cube0.PE0 -> cube1.hbm_ctrl (adjacent cube, E direction) - path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl") + path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.pe0") assert len(path) >= 1 # succeeds without exception diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py index 0ef2348..bbfca48 100644 --- a/tests/test_topology_compile.py +++ b/tests/test_topology_compile.py @@ -17,21 +17,21 @@ def test_full_graph_node_count(): g = _graph() # 1 switch # + 2 SIPs x (1 IO x 23 io_nodes - # + 16 cubes x (32 routers + 1 hbm_ctrl + 1 m_cpu + 1 sram + # + 16 cubes x (32 routers + 8 hbm_ctrl.peX + 1 m_cpu + 1 sram # + 20 ucie (4 ports x (1 port + 4 conn)) # + 8 PEs x 9 pe_comps)) (ADR-0023: +pe_ipcq) # IO: pcie_ep + io_cpu + noc + 4 io_ucie_ports + 4*4 io_ucie_conn = 23 - # cube: 32 + 3 + 20 + 72 = 127 - # = 1 + 2*(23 + 16*127) = 1 + 2*(23+2032) = 1 + 4110 = 4111 - assert len(g.nodes) == 4111 + # cube: 32 + 10 + 20 + 72 = 134 (was 127; ADR-0019 D1 per-PE HBM CTRL) + # = 1 + 2*(23 + 16*134) = 1 + 2*(23+2144) = 1 + 4334 = 4335 + assert len(g.nodes) == 4335 def test_full_graph_edge_count(): g = _graph() - # ADR-0023: +3 IPCQ edges per PE (cpu→ipcq, ipcq→dma, dma→ipcq) - # 2 SIPs × 16 cubes × 8 PEs × 3 = 768 new edges - # Cross-SIP routing: +1 reverse pcie_ep→switch edge per SIP = +2 - assert len(g.edges) == 13692 + # ADR-0023: +3 IPCQ edges per PE + # ADR-0019 D1 (restored): HBM↔router edges drop from 32 routers × 2 + # to 8 PE-routers × 2 per cube. 32 cubes × (16-64) = -1536 edges. + assert len(g.edges) == 12156 # -- Full graph: specific nodes exist ----------------------------------------- @@ -55,7 +55,7 @@ def test_cube_component_nodes_exist(): g = _graph() cp = "sip0.cube0" # Core cube components (no more noc, xbar, bridge) - for name in ("m_cpu", "sram", "hbm_ctrl", + for name in ("m_cpu", "sram", "ucie-N", "ucie-S", "ucie-E", "ucie-W"): assert f"{cp}.{name}" in g.nodes # Old nodes must not exist @@ -71,8 +71,11 @@ def test_cube_component_nodes_exist(): # Null holes must not exist for null_rc in ("r2c2", "r2c3", "r3c2", "r3c3"): assert f"{cp}.{null_rc}" not in g.nodes - # Single hbm_ctrl (no more slices) - assert g.nodes[f"{cp}.hbm_ctrl"].kind == "hbm_ctrl" + # Per-PE HBM CTRL (ADR-0019 D1) — 8 instances, no legacy single node + for pe in range(8): + nid = f"{cp}.hbm_ctrl.pe{pe}" + assert g.nodes[nid].kind == "hbm_ctrl" + assert f"{cp}.hbm_ctrl" not in g.nodes for s in range(8): assert f"{cp}.hbm_ctrl.slice{s}" not in g.nodes @@ -89,16 +92,18 @@ def test_pe_component_nodes_exist(): def test_hbm_ctrl_at_cube_center(): g = _graph() - # Single hbm_ctrl per cube; cube0 origin = (0, 0), hbm at (6.5, 7.0) - node = g.nodes["sip0.cube0.hbm_ctrl"] - assert node.pos_mm == (6.5, 7.0) + # Per-PE hbm_ctrl nodes share the cube's HBM placement (ADR-0019 D1) + # cube0 origin = (0, 0), hbm at (6.5, 7.0) + for pe in range(8): + node = g.nodes[f"sip0.cube0.hbm_ctrl.pe{pe}"] + assert node.pos_mm == (6.5, 7.0) def test_hbm_ctrl_cube5_position(): g = _graph() # cube5 = col=1, row=1 -> origin = (1*18, 1*15) = (18, 15) # hbm_ctrl = (18 + 6.5, 15 + 7.0) = (24.5, 22.0) - node = g.nodes["sip0.cube5.hbm_ctrl"] + node = g.nodes["sip0.cube5.hbm_ctrl.pe0"] assert node.pos_mm == (24.5, 22.0) @@ -181,16 +186,25 @@ def test_pe_internal_edges(): assert (f"{pp}.pe_math", f"{pp}.pe_tcm") in es -def test_hbm_ctrl_connects_all_routers(): - """HBM_CTRL connects to every router (router_to_hbm / hbm_to_router).""" +def test_per_pe_hbm_ctrl_connects_only_to_owning_router(): + """Each hbm_ctrl.pe{X} connects ONLY to PE_X's attaching router + (ADR-0019 D4). Replaces a prior test that asserted the + spec-violating all-routers consolidation (commit 5917b34).""" g = _graph() es = _edge_set(g) cp = "sip0.cube0" - routers = sorted(n for n in g.nodes if n.startswith(f"{cp}.r")) - assert len(routers) == 32 - for r in routers: - assert (r, f"{cp}.hbm_ctrl") in es, f"missing {r}->hbm_ctrl" - assert (f"{cp}.hbm_ctrl", r) in es, f"missing hbm_ctrl->{r}" + pe_router = {0: "r0c0", 1: "r0c1", 2: "r1c4", 3: "r1c5", + 4: "r4c0", 5: "r4c1", 6: "r5c4", 7: "r5c5"} + for pe, rkey in pe_router.items(): + nid = f"{cp}.hbm_ctrl.pe{pe}" + owner = f"{cp}.{rkey}" + assert (owner, nid) in es, f"missing {owner}→{nid}" + assert (nid, owner) in es, f"missing {nid}→{owner}" + for other in g.nodes: + if other.startswith(f"{cp}.r") and other != owner: + assert (other, nid) not in es, ( + f"unexpected edge {other}→{nid}" + ) def test_router_mesh_edges(): @@ -387,7 +401,7 @@ def test_cross_cube_path_includes_conn(): """PE cross-cube path must traverse conn nodes.""" g = _graph() router = PathRouter(g) - path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl") + path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.pe0") conn_nodes = [n for n in path if ".conn" in n] assert len(conn_nodes) >= 2, f"Expected >=2 conn nodes in path, got {conn_nodes}"