From 1c33afec552dd3c546a3400199a78d18a6ed819a Mon Sep 17 00:00:00 2001 From: Mukesh Garg Date: Mon, 27 Apr 2026 16:43:01 -0700 Subject: [PATCH] ADR-0032 + intra_* opposite directions in IPCQ install Add intra_N/S/E/W to install.py _OPPOSITE_DIR table so the intra-cube PE-to-PE namespace is symmetrical with intercube N/S/E/W. ADR-0032 documents the intercube allreduce algorithm (supersedes ADR-0029). Refresh ADR-0024/0025/0029 cross-refs and update test_intercube_sfr_config.py to cover the new intra_* mappings. Drop the obsolete test_ccl_round_robin_recv.py (replaced by intercube tests). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/adr/ADR-0024-sip-tp-launcher.md | 9 +- .../adr/ADR-0025-ipcq-direction-addressing.md | 9 +- docs/adr/ADR-0029-hierarchical-allreduce.md | 4 +- docs/adr/ADR-0032-intercube-allreduce.md | 256 ++++++++++++++++++ src/kernbench/ccl/install.py | 2 + tests/test_ccl_round_robin_recv.py | 48 ---- tests/test_intercube_sfr_config.py | 146 ++++++---- 7 files changed, 363 insertions(+), 111 deletions(-) create mode 100644 docs/adr/ADR-0032-intercube-allreduce.md delete mode 100644 tests/test_ccl_round_robin_recv.py diff --git a/docs/adr/ADR-0024-sip-tp-launcher.md b/docs/adr/ADR-0024-sip-tp-launcher.md index 9e6ecbd..9103e5f 100644 --- a/docs/adr/ADR-0024-sip-tp-launcher.md +++ b/docs/adr/ADR-0024-sip-tp-launcher.md @@ -2,7 +2,14 @@ ## Status -Proposed (Revision 8 — Hierarchical content split out to ADR-0029) +Accepted. rank = SIP process-group model stands. The allreduce algorithm +path (mapper / validator / per-PE install machinery originally targeted at +ADR-0029) has been replaced by ADR-0032: `AhbmCCLBackend` now calls +`configure_sfr_intercube_multisip` at `init_process_group` time and the +intercube kernel receives `(sip_rank, sip_topo_kind, sip_topo_w, +sip_topo_h)` appended after the module's `kernel_args()`. The +`leader_only` / `all_pes` mapper concepts in this document are no longer +used by the default allreduce path. ## Context diff --git a/docs/adr/ADR-0025-ipcq-direction-addressing.md b/docs/adr/ADR-0025-ipcq-direction-addressing.md index 5ef6e62..c2eeb39 100644 --- a/docs/adr/ADR-0025-ipcq-direction-addressing.md +++ b/docs/adr/ADR-0025-ipcq-direction-addressing.md @@ -89,7 +89,14 @@ direction_idx × bytes_per_direction). 따라서: `src/kernbench/ccl/install.py`: ```python -_OPPOSITE_DIR = {"E": "W", "W": "E", "N": "S", "S": "N"} +# Extended in ADR-0032 with global_* pairs for inter-SIP directions, +# which were introduced by configure_sfr_intercube_multisip to keep +# intercube (N/S/E/W) and inter-SIP (global_N/S/E/W) namespaces disjoint. +_OPPOSITE_DIR = { + "E": "W", "W": "E", "N": "S", "S": "N", + "global_E": "global_W", "global_W": "global_E", + "global_N": "global_S", "global_S": "global_N", +} def reverse_direction(my_rank: int, peer_rank: int, my_dir: str) -> str | None: """Find peer's direction that reciprocates my_dir→peer_rank. diff --git a/docs/adr/ADR-0029-hierarchical-allreduce.md b/docs/adr/ADR-0029-hierarchical-allreduce.md index 604abeb..7c4794d 100644 --- a/docs/adr/ADR-0029-hierarchical-allreduce.md +++ b/docs/adr/ADR-0029-hierarchical-allreduce.md @@ -2,7 +2,9 @@ ## Status -Proposed +Superseded by ADR-0032 (Intercube all-reduce). The 3-level kernel and +`hierarchical_allreduce.py` module have been removed. The cube-mesh +intercube + inter-SIP path is now the single all-reduce algorithm. ## Context diff --git a/docs/adr/ADR-0032-intercube-allreduce.md b/docs/adr/ADR-0032-intercube-allreduce.md new file mode 100644 index 0000000..c3df130 --- /dev/null +++ b/docs/adr/ADR-0032-intercube-allreduce.md @@ -0,0 +1,256 @@ +# ADR-0032: Intercube All-Reduce — pe0 cube-mesh reduce + multi-SIP exchange + +## Status + +Accepted (supersedes ADR-0029). + +## Context + +### Goal + +Define a single all-reduce algorithm that exploits the topology hierarchy: +cube mesh within each SIP (intercube) + inter-SIP exchange. One kernel, +one SFR configuration path, driven by `topology.yaml` and `ccl.yaml`. + +### Why replace ADR-0029 (hierarchical 3-level) + +ADR-0029 proposed a 3-level (intra-cube → inter-cube → inter-SIP) algorithm +where every PE in the system participates. In practice this adds the +intra-cube PE-to-PE stage complexity (bidirectional reduce + chain broadcast) +without matching the common workload pattern where the tensor is sharded +**per cube** (not per PE within a cube). + +Moreover, the hierarchical design required: +- per-PE neighbor graph installation (`_build_pe_installs` multi-level) +- multi-level topology schema (`hierarchical_3level`) +- `all_pes` mapper + `multi_pe_sip_local` validator infrastructure + +The intercube algorithm below removes all of that: **pe0-only same-lane +intercube reduce on the 4×4 cube mesh**, then inter-SIP exchange on the +root cube, then broadcast back. Simpler kernel, simpler wiring, same +bandwidth characteristics for the common per-cube DP workload. + +### Current state + +- `src/kernbench/ccl/algorithms/intercube_allreduce.py` — kernel +- `src/kernbench/ccl/sfr_config.py` — `configure_sfr_intercube_multisip` +- `src/kernbench/runtime_api/distributed.py` — `AhbmCCLBackend` wires this + automatically at `init_process_group` time. +- Old `ring_allreduce`, `mesh_allreduce`, `tree_allreduce`, + `hierarchical_allreduce` modules and their tests are **removed**. + +--- + +## Decision + +### D1. Algorithm structure — 5 phases + +For each SIP (launched concurrently by `mp.spawn`): + +``` +Phase 1 — Row reduce W → E (cube mesh, pe0 only): + col=0 sends E → col=1 accumulates, sends E → ... → col=3 holds row sum. + +Phase 2 — Col reduce N → S on rightmost column (pe0, col = mesh_w-1): + row=0 sends S → row=1 accumulates, sends S → ... → root cube (15) + holds the full SIP sum. + +Phase 3 — Inter-SIP exchange on root cube (pe0 of root cube only): + Ring / torus-2d row+col ring / mesh-2d chain reduce+broadcast — + selected by sip_topo_kind (from topology.yaml sips.topology). + +Phase 4 — Col broadcast S → N on rightmost column. + +Phase 5 — Row broadcast E → W across the cube mesh. +``` + +After all phases every cube's pe0 holds the global sum. + +The kernel is a single function parameterised by `sip_topo_kind ∈ {0, 1, 2}` +(ring_1d, torus_2d, mesh_2d_no_wrap). Phases 1-2 and 4-5 are identical +across topologies; only phase 3 branches. Helper functions +`_inter_sip_ring`, `_inter_sip_torus_2d`, `_inter_sip_mesh_2d` encode the +three exchange patterns. + +### D2. Tensor layout (rank = SIP, per-worker) + +Per ADR-0024 rank = SIP at the process-group level. Each worker allocates +its own cube-mesh-spanning tensor: + +```python +dp = DPPolicy(cube="row_wise", pe="replicate", num_cubes=16, num_pes=1) +tensor = torch.zeros((n_cubes, n_elem), dtype="f16", dp=dp) +``` + +Shard layout: 16 shards per SIP, one per cube on pe0. The kernel addresses +each cube's shard as `pe_addr = t_ptr + cube_id * n_elem * 2`. + +### D3. SFR / IPCQ wiring — `configure_sfr_intercube_multisip` + +Replaces the rank-to-2-PE install from ADR-0024. Wires PE_IPCQ neighbor +tables for **every cube's pe0 across every SIP** — regardless of which +cube is the root or which SIP topology is selected. This lets the kernel +elect the root cube at runtime and supports topology switches without +re-wiring. + +| Level | Direction labels | Scope | +|---|---|---| +| Intercube within SIP | N / S / E / W | pe0 of every cube → pe0 of mesh neighbors (no wrap) | +| Inter-SIP (all cubes) | global_E / global_W / global_N / global_S | pe0 of cube c on sip A → pe0 of cube c on peer SIP per `sips.topology` | + +Inter-SIP directions use the `global_*` prefix to keep the namespace +disjoint from intercube directions. ADR-0025's `_OPPOSITE_DIR` is extended +with `global_E ↔ global_W` and `global_N ↔ global_S` so the reverse- +direction resolver handles 2-SIP bidirectional rings correctly. + +Internally the function calls `install_ipcq` with: +- `world_size = n_sips × n_cubes` +- `rank_to_pe = [(sip, cube, 0) for sip in range(n_sips) for cube in range(n_cubes)]` +- A closure-captured `neighbors()` function that builds the map above. + +This `world_size` is internal to IPCQ wiring and does not leak to the +process-group rank. + +### D4. SIP topology — from `topology.yaml` + +```yaml +system: + sips: + count: 2 + topology: ring_1d # or torus_2d, mesh_2d_no_wrap +``` + +- `ring_1d`: n_sips-1 rounds of `send global_E / recv global_W`. +- `torus_2d`: sqrt(n_sips)×sqrt(n_sips) wrapping mesh. Row ring on + `global_E/W` then col ring on `global_S/N`. +- `mesh_2d_no_wrap`: square mesh without wrap-around. Chain reduce + + broadcast per dimension. + +2D variants require `n_sips` to be a perfect square. + +### D5. Process-group integration — `AhbmCCLBackend` + +At `init_process_group` time the backend: + +1. Loads `ccl.yaml` + `topology.yaml`. +2. Derives `sip_topo_kind, sip_topo_w, sip_topo_h` from + `system.sips.topology` using the algorithm module's `TOPO_NAME_TO_KIND`. +3. Calls `configure_sfr_intercube_multisip(engine, spec, cfg)` — one-time + SFR wiring, mirrors NCCL communicator creation. + +At each `dist.all_reduce(tensor)` call: + +1. Resolves `kernel_fn` from `cfg["module"]`. +2. Builds args: `(n_elem, cube_w, cube_h, n_sips)` from + `kernel_args(world_size, n_elem)`. +3. Appends `(sip_rank, sip_topo_kind, sip_topo_w, sip_topo_h)` where + `sip_rank` is the current greenlet's bound rank. +4. Launches with `_defer_wait=True`; the main scheduler drains pending + handles after all workers submit (per ADR-0024 D7 / ADR-0027 D0.4). + +### D6. Config schema + +`ccl.yaml`: + +```yaml +defaults: + algorithm: intercube_allreduce + buffer_kind: tcm + ... + +algorithms: + intercube_allreduce: + module: kernbench.ccl.algorithms.intercube_allreduce + topology: none + buffer_kind: tcm + n_elem: 8 + root_cube: 15 +``` + +`topology.yaml`: + +```yaml +system: + sips: + count: 2 + topology: ring_1d +sip: + cube_mesh: { w: 4, h: 4 } +``` + +### D7. Algorithm module contract + +Modules loaded via `cfg["module"]` must export: + +| Name | Purpose | +|---|---| +| `kernel` | callable, signature `(t_ptr, n_elem, cube_w, cube_h, n_sips, sip_rank, sip_topo_kind, sip_topo_w, sip_topo_h, tl)` | +| `kernel_args(world_size, n_elem) -> tuple` | returns the first 4 scalar args (per-tensor) | +| `TOPO_NAME_TO_KIND: dict[str, int]` | maps `system.sips.topology` name to kernel branch code | +| `SIP_TOPO_RING`, `SIP_TOPO_TORUS`, `SIP_TOPO_MESH` | integer constants (0, 1, 2) | + +--- + +## Dependencies + +- **ADR-0023**: IPCQ protocol (neighbor table, send/recv, credit return). +- **ADR-0024**: rank = SIP launcher, `mp.spawn`, greenlet-local rank. +- **ADR-0025**: Address-based IPCQ direction matching; extended + `_OPPOSITE_DIR` with `global_*` pairs. +- **ADR-0027**: Worker-wait / collective-pending drain in main scheduler. + +## Non-goals + +- **Per-PE allreduce** (intra-cube PE-to-PE reduce). Out of scope — the + workload for this algorithm is per-cube DP. +- **Asymmetric SIP topologies** (non-square mesh/torus). `torus_2d` and + `mesh_2d_no_wrap` require `n_sips = k²`. +- **Pipelined chunks**: single-tile per cube, no pipelining yet. +- **Root cube runtime election**: the kernel currently uses + `root_cube = (mesh_h - 1) * mesh_w + (mesh_w - 1)` hardcoded to the SE + corner. SFR wiring covers all cubes, so runtime election is a pure kernel + change when needed. + +--- + +## Consequences + +### Positive + +- **Single kernel, single install path** for all-reduce — replaces four + removed modules (`ring`, `mesh`, `tree`, `hierarchical`). +- **Topology-agnostic kernel**: ring / torus / mesh selected via one + integer param, no kernel duplication. +- **Automatic via `dist.all_reduce`**: no bench-level or user-level + algorithm selection needed; config-driven end-to-end. +- **Full SFR wiring**: every cube on every SIP has inter-SIP links + available — supports future dynamic root-cube election. + +### Negative + +- **Not suitable for per-PE sharded tensors**: TP-layer-style tensors that + shard within one cube across 8 PEs are not addressable by this kernel. + Such workloads would need a separate intra-cube all-reduce path (not + yet implemented). +- **`configure_sfr_intercube_multisip` always wires all pe0s**: even if a + given run only needs a subset (e.g. 1 SIP, ring only). Install cost is + small but not zero. + +--- + +## Affected files + +| File | Change | +|---|---| +| `src/kernbench/ccl/algorithms/intercube_allreduce.py` (new) | Kernel + `_inter_sip_*` helpers + `TOPO_NAME_TO_KIND` | +| `src/kernbench/ccl/sfr_config.py` (new) | `configure_sfr_intercube_multisip` | +| `src/kernbench/ccl/topologies.py` | Added `torus_2d`, `mesh_2d_no_wrap` | +| `src/kernbench/ccl/install.py` | Extended `_OPPOSITE_DIR` with `global_*` pairs | +| `src/kernbench/runtime_api/distributed.py` | `AhbmCCLBackend` uses `configure_sfr_intercube_multisip` + appends sip_rank/topo args | +| `ccl.yaml` | Single `intercube_allreduce` entry | +| `topology.yaml` | Added `system.sips.topology` | +| `benches/ccl_allreduce.py` | Row-wise cube-mesh tensor layout | +| `tests/test_allreduce_multidevice.py` (new) | Config-driven ring/torus/mesh | +| `tests/test_distributed_intercube_allreduce.py` (new) | Full `dist.all_reduce` path | +| `tests/test_intercube_sfr_config.py` (new) | SFR wiring verification | +| Removed | `ring_allreduce.py`, `mesh_allreduce.py`, `tree_allreduce.py`, `hierarchical_allreduce.py`, `hello_send.py`, `testing.py` and their tests | diff --git a/src/kernbench/ccl/install.py b/src/kernbench/ccl/install.py index 563a2f4..6d86595 100644 --- a/src/kernbench/ccl/install.py +++ b/src/kernbench/ccl/install.py @@ -221,6 +221,8 @@ def install_ipcq( _OPPOSITE_DIR = { "E": "W", "W": "E", "N": "S", "S": "N", + "intra_E": "intra_W", "intra_W": "intra_E", + "intra_N": "intra_S", "intra_S": "intra_N", "global_E": "global_W", "global_W": "global_E", "global_N": "global_S", "global_S": "global_N", } diff --git a/tests/test_ccl_round_robin_recv.py b/tests/test_ccl_round_robin_recv.py deleted file mode 100644 index 8b98dfa..0000000 --- a/tests/test_ccl_round_robin_recv.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Test that tl.recv() (no direction) works under the mock runtime -and the SimPy PE_IPCQ component (ADR-0023 D4 weak fairness).""" -from __future__ import annotations - -import numpy as np - -from kernbench.ccl.testing import run_kernel_in_mock - - -def kernel_round_robin(t_ptr, n_elem, tl): - """Each PE sends one tile E then receives N-1 tiles via round-robin. - Uses TensorHandle math (PE_MATH) so Phase 2 produces correct HBM - contents under SimPy + op_log replay.""" - rank = tl.program_id(axis=0) - world_size = tl.num_programs(axis=0) - nbytes = n_elem * 2 - - pe_addr = t_ptr + rank * nbytes - acc = tl.load(pe_addr, shape=(n_elem,), dtype="f16") - current = acc - - for _step in range(world_size - 1): - tl.send(dir="E", src=current) - # No direction → round-robin - recv = tl.recv(shape=(n_elem,), dtype="f16") - acc = acc + recv - current = recv # forward W's tile to E next round - - tl.store(pe_addr, acc) - - -def test_round_robin_recv_mock_runtime(): - n_elem = 8 - inputs = [ - np.full((n_elem,), float(r + 1), dtype=np.float16) - for r in range(4) - ] - expected = sum(inputs) # [10,...] - - outputs = run_kernel_in_mock( - kernel_fn=kernel_round_robin, - world_size=4, - topology="ring_1d", - inputs=inputs, - kernel_args=(n_elem,), - ) - for r in range(4): - assert np.allclose(outputs[r], expected) diff --git a/tests/test_intercube_sfr_config.py b/tests/test_intercube_sfr_config.py index d99c3fd..bdf0cea 100644 --- a/tests/test_intercube_sfr_config.py +++ b/tests/test_intercube_sfr_config.py @@ -1,8 +1,9 @@ """Tests for configure_sfr_intercube_multisip neighbor table wiring. -Verifies that IPCQ neighbor tables are correctly installed for -intercube (pe0, 4×4 mesh N/S/E/W) + inter-SIP (pe0, all cubes, -global_E/global_W) communication. +Verifies full IPCQ hardware wiring (independent of DPPolicy): + - intra-cube (2×4 PE grid) → intra_N/S/E/W + - intercube same-lane → N/S/E/W + - inter-SIP same-(cube, pe) → global_N/S/E/W """ from __future__ import annotations @@ -16,6 +17,7 @@ from kernbench.topology.builder import resolve_topology TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" N_CUBES = 16 +PES_PER_CUBE = 8 def _engine_and_spec(): @@ -36,78 +38,102 @@ class TestConfigureSfrNeighborTables: plan = configure_sfr_intercube_multisip(engine, spec, cfg) n_sips = int(spec["system"]["sips"]["count"]) - assert plan["world_size"] == n_sips * N_CUBES - assert len(plan["rank_to_pe"]) == n_sips * N_CUBES - for pe_idx, (sip, cube, pe) in enumerate(plan["rank_to_pe"]): - assert pe == 0, f"pe_idx {pe_idx}: pe must be 0, got {pe}" + expected = n_sips * N_CUBES * PES_PER_CUBE + assert plan["world_size"] == expected + assert len(plan["rank_to_pe"]) == expected - def test_corner_cube0_has_E_and_S_only(self): - """Cube 0 (row=0, col=0) is NW corner: only E and S neighbors.""" + # ── Intra-cube (intra_N/S/E/W) ──────────────────────────────── + + def test_pe0_intra_cube_has_intra_E_and_intra_S(self): + """pe0 is NW of the 2×4 PE grid: intra_E=pe1, intra_S=pe4.""" engine, spec = _engine_and_spec() cfg = _merged_cfg() configure_sfr_intercube_multisip(engine, spec, cfg) - ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"] - qp = ipcq.queue_pairs - assert "E" in qp, "cube 0 must have E neighbor" - assert "S" in qp, "cube 0 must have S neighbor" - assert "W" not in qp, "cube 0 (col=0) must NOT have W neighbor" - assert "N" not in qp, "cube 0 (row=0) must NOT have N neighbor" + qp = engine._components["sip0.cube0.pe0.pe_ipcq"].queue_pairs + assert "intra_E" in qp + assert qp["intra_E"]["peer"].pe == 1 + assert "intra_S" in qp + assert qp["intra_S"]["peer"].pe == 4 + assert "intra_W" not in qp + assert "intra_N" not in qp + + def test_pe5_intra_cube_has_all_four(self): + """pe5 (row=1, col=1 in 2×4 grid) has all 4 intra directions. + + Intra neighbors: intra_N=pe1, intra_E=pe6, intra_W=pe4, + intra_S not present (row=1 is bottom row). + """ + engine, spec = _engine_and_spec() + cfg = _merged_cfg() + configure_sfr_intercube_multisip(engine, spec, cfg) + + qp = engine._components["sip0.cube0.pe5.pe_ipcq"].queue_pairs + assert qp["intra_N"]["peer"].pe == 1 + assert qp["intra_E"]["peer"].pe == 6 + assert qp["intra_W"]["peer"].pe == 4 + assert "intra_S" not in qp # bottom row + + # ── Intercube same-lane (N/S/E/W) ───────────────────────────── + + def test_corner_cube0_pe0_has_intercube_E_and_S(self): + """Cube 0 (NW mesh corner): intercube E→cube1, S→cube4.""" + engine, spec = _engine_and_spec() + cfg = _merged_cfg() + configure_sfr_intercube_multisip(engine, spec, cfg) + + qp = engine._components["sip0.cube0.pe0.pe_ipcq"].queue_pairs assert qp["E"]["peer"].cube == 1 + assert qp["E"]["peer"].pe == 0 # same-lane assert qp["S"]["peer"].cube == 4 + assert qp["S"]["peer"].pe == 0 + assert "W" not in qp, "cube 0 has no west neighbor" + assert "N" not in qp, "cube 0 has no north neighbor" - def test_interior_cube5_has_all_four(self): - """Cube 5 (row=1, col=1) is interior: N/S/E/W all present.""" + def test_interior_cube5_pe3_has_all_four_intercube_same_lane(self): + """Cube 5 interior, pe3: intercube N/S/E/W all present, same-lane.""" engine, spec = _engine_and_spec() cfg = _merged_cfg() configure_sfr_intercube_multisip(engine, spec, cfg) - ipcq = engine._components["sip0.cube5.pe0.pe_ipcq"] - qp = ipcq.queue_pairs - assert qp["N"]["peer"].cube == 1 - assert qp["S"]["peer"].cube == 9 - assert qp["E"]["peer"].cube == 6 - assert qp["W"]["peer"].cube == 4 + qp = engine._components["sip0.cube5.pe3.pe_ipcq"].queue_pairs + for d, expected_cube in [("N", 1), ("S", 9), ("E", 6), ("W", 4)]: + assert qp[d]["peer"].cube == expected_cube + assert qp[d]["peer"].pe == 3 # same-lane - def test_root_cube15_has_inter_sip(self): - """Cube 15 (root, SE corner) has N, W + global_E/global_W.""" + def test_all_pes_have_intercube_wiring(self): + """Every PE on every interior cube has intercube same-lane wiring.""" engine, spec = _engine_and_spec() cfg = _merged_cfg() configure_sfr_intercube_multisip(engine, spec, cfg) - ipcq0 = engine._components["sip0.cube15.pe0.pe_ipcq"] - qp0 = ipcq0.queue_pairs - assert "N" in qp0 - assert "W" in qp0 - assert "E" not in qp0, "cube 15 (col=3) must NOT have E" - assert "S" not in qp0, "cube 15 (row=3) must NOT have S" - assert "global_E" in qp0, "root cube must have global_E" - assert "global_W" in qp0, "root cube must have global_W" - assert qp0["global_E"]["peer"].sip == 1 - assert qp0["global_E"]["peer"].cube == 15 - - ipcq1 = engine._components["sip1.cube15.pe0.pe_ipcq"] - qp1 = ipcq1.queue_pairs - assert qp1["global_E"]["peer"].sip == 0 - assert qp1["global_E"]["peer"].cube == 15 - - def test_all_cubes_have_inter_sip(self): - """ALL cubes (not just root) are wired for inter-SIP.""" - engine, spec = _engine_and_spec() - cfg = _merged_cfg() - configure_sfr_intercube_multisip(engine, spec, cfg) - - root_cube = int(cfg.get("root_cube", N_CUBES - 1)) - for cube_id in range(N_CUBES): - ipcq = engine._components[f"sip0.cube{cube_id}.pe0.pe_ipcq"] - qp = ipcq.queue_pairs - assert "global_E" in qp, ( - f"sip0.cube{cube_id}.pe0 missing global_E" - ) - assert "global_W" in qp, ( - f"sip0.cube{cube_id}.pe0 missing global_W" - ) - if cube_id == root_cube: - assert qp["global_E"]["peer"].sip != 0, ( - f"root cube {root_cube} global_E must point to another SIP" + # Interior cube 5: every PE should have N/S/E/W same-lane. + for pe in range(PES_PER_CUBE): + qp = engine._components[f"sip0.cube5.pe{pe}.pe_ipcq"].queue_pairs + for d in ("N", "S", "E", "W"): + assert d in qp, f"sip0.cube5.pe{pe} missing intercube {d}" + assert qp[d]["peer"].pe == pe, ( + f"sip0.cube5.pe{pe} {d} not same-lane" ) + + # ── Inter-SIP (global_*) ────────────────────────────────────── + + def test_every_pe_on_every_cube_has_inter_sip(self): + """All PEs on all cubes wired for inter-SIP via global_*.""" + engine, spec = _engine_and_spec() + cfg = _merged_cfg() + configure_sfr_intercube_multisip(engine, spec, cfg) + + for cube_id in range(N_CUBES): + for pe in range(PES_PER_CUBE): + qp = engine._components[ + f"sip0.cube{cube_id}.pe{pe}.pe_ipcq" + ].queue_pairs + assert "global_E" in qp, ( + f"sip0.cube{cube_id}.pe{pe} missing global_E" + ) + assert "global_W" in qp + # Peer must be same (cube, pe) on another SIP. + assert qp["global_E"]["peer"].sip == 1 + assert qp["global_E"]["peer"].cube == cube_id + assert qp["global_E"]["peer"].pe == pe