Files
kernbench2/tests/test_ccl_install.py
T
ywkang 32536daf2e Fix ADR-0025: IPCQ direction addressing via address-based matching
2-rank bidirectional ring deadlock: when E and W neighbors point to the
same peer, sender-coord matching in _handle_meta_arrival / _credit_worker
picked the first direction in dict order, landing data in the wrong rx
slot relative to what the kernel recv(W) was waiting on.

Fix (ADR-0025 D1/D2/D3):
- install.reverse_direction: prefer OPPOSITE direction (E↔W, N↔S) when
  peer has it pointing back to us; fallback to any matching for
  topologies without opposite convention (tree_binary parent/child).
- _handle_meta_arrival: match by token.dst_addr range against each qp's
  my_rx_base_pa + n_slots × slot_size window (unambiguous).
- _credit_worker: match by credit.dst_rx_base_pa == qp.peer.rx_base_pa.
- IpcqCreditMetadata: new dst_rx_base_pa field carrying receiver-side
  rx base; _delayed_credit_send fills it from the consuming qp.

Tests (Phase 1 → Phase 2):
- test_reverse_direction_opposite_preference_2rank_ring
- test_reverse_direction_opposite_preference_4rank_ring_sanity
- test_meta_arrival_matches_by_dst_addr_same_peer
- test_credit_matches_by_dst_rx_base_pa_same_peer
- Existing credit-return test updated with dst_rx_base_pa.

508 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 00:38:41 -07:00

161 lines
5.8 KiB
Python

"""Tests for CCL backend install (ADR-0023 D10/D11)."""
from __future__ import annotations
from kernbench.ccl.install import (
install_ipcq,
linear_rank_to_pe,
load_ccl_config,
resolve_algorithm_config,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
def _engine():
    """Build a data-enabled GraphEngine from ``topology.yaml``.

    Returns:
        An ``(engine, topology)`` tuple: the ``GraphEngine`` constructed
        with ``enable_data=True`` and the topology object it was built on.
    """
    resolved = resolve_topology("topology.yaml")
    topology = resolved.topology_obj
    engine = GraphEngine(topology, enable_data=True)
    return engine, topology
def test_load_ccl_config():
    """The loaded CCL config exposes both expected top-level sections."""
    config = load_ccl_config()
    for section in ("defaults", "algorithms"):
        assert section in config
def test_resolve_algorithm_config_default():
    """Resolving without an algorithm name yields the configured default."""
    config = load_ccl_config()
    resolved = resolve_algorithm_config(config)
    chosen = resolved["algorithm"]
    assert chosen == config["defaults"]["algorithm"]
    # ccl.yaml no longer carries defaults.world_size; the backend is
    # expected to derive it from topology.yaml at install time (checked in
    # the tests that call install_ipcq). The key is therefore optional
    # here: it may be missing, but if it is present it must be >= 1.
    if "world_size" in resolved:
        assert resolved["world_size"] >= 1
def test_resolve_algorithm_config_override():
    """A named algorithm applies its own keys and keeps the defaults."""
    config = load_ccl_config()
    resolved = resolve_algorithm_config(config, name="ring_allreduce_hbm")
    # Values that come from the ring_allreduce_hbm entry itself.
    assert resolved["algorithm"] == "ring_allreduce_hbm"
    assert resolved["buffer_kind"] == "hbm"
    # n_slots is not overridden, so it must match the defaults section.
    slots = resolved["n_slots"]
    assert slots == config["defaults"]["n_slots"]
def test_linear_rank_to_pe():
    """Linear ranks 0 and 7 map to PE 0 and PE 7 of cube 0 on SIP 0."""
    # Only the topology spec is needed; the engine half of the pair is
    # deliberately discarded instead of being bound to an unused local.
    _, topo = _engine()
    spec = topo.spec
    # Cube 0 of SIP 0
    assert linear_rank_to_pe(0, spec) == (0, 0, 0)
    assert linear_rank_to_pe(7, spec) == (0, 0, 7)
    # The highest rank probed above is 7, so the topology must provide at
    # least 8 PEs in total. PEs per SIP = cubes (w * h) * PEs per corner
    # * number of corners.
    cube_mesh = spec["sip"]["cube_mesh"]
    pe_layout = spec["cube"]["pe_layout"]
    pes_per_sip = (
        cube_mesh["w"] * cube_mesh["h"]
        * pe_layout["pe_per_corner"]
        * len(pe_layout["corners"])
    )
    total = spec["system"]["sips"]["count"] * pes_per_sip
    assert total >= 8
def test_install_ipcq_neighbors_correct():
    """An 8-rank install gives every rank E/W neighbors; rank 0 sees 1 and 7."""
    engine, topo = _engine()
    cfg = load_ccl_config()
    merged = resolve_algorithm_config(cfg, name="ring_allreduce_tcm")
    # Force a single-cube 8-rank install for the assertions below.
    merged["world_size"] = 8
    plan = install_ipcq(engine, topo.spec, merged)
    assert plan["world_size"] == 8
    assert plan["buffer_kind"] == "tcm"
    # Each rank should have E and W entries. The rank key is not inspected,
    # so iterate the neighbor maps directly instead of unpacking items().
    for nbrs in plan["neighbor_table"].values():
        assert "E" in nbrs
        assert "W" in nbrs
    # Inspect the PE_IPCQ component installed for rank 0.
    ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
    qp_e = ipcq.queue_pairs["E"]
    qp_w = ipcq.queue_pairs["W"]
    assert qp_e["peer"].pe == 1  # rank 0's E neighbor is rank 1
    assert qp_w["peer"].pe == 7  # rank 0's W neighbor is rank 7
    # rx_base addresses should be unique per direction.
    assert qp_e["my_rx_base_pa"] != qp_w["my_rx_base_pa"]
def test_install_ipcq_credit_stores_wired():
    """Rank 0's E-direction credit store is rank 1's credit inbox object."""
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    algo_cfg["world_size"] = 8
    install_ipcq(engine, topo.spec, algo_cfg)
    # Rank 0 (pe0) sends E to rank 1 (pe1).
    sender = engine._components["sip0.cube0.pe0.pe_ipcq"]
    receiver = engine._components["sip0.cube0.pe1.pe_ipcq"]
    east_qp = sender.queue_pairs["E"]
    # Identity check: the sender's peer_credit_store must be the very same
    # object as the receiver's credit_inbox, not merely an equal one.
    assert east_qp["peer_credit_store"] is receiver.credit_inbox
# ── ADR-0025 D1: reverse_direction opposite-preference ───────────────
def test_reverse_direction_opposite_preference_2rank_ring():
    """ADR-0025 D1: a 2-rank ring must resolve to the opposite direction.

    With only two ranks, E and W on each rank name the same peer, so
    reverse_direction has two candidate directions on that peer. It has to
    choose the OPPOSITE label (W for E, E for W): rank 0 sending via E must
    target rank 1's W-rx buffer, because rank 1's kernel recv(W) reads from
    its W-rx, not from its E-rx.
    """
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    algo_cfg["world_size"] = 2
    install_ipcq(engine, topo.spec, algo_cfg)

    rank0_ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
    rank1_ipcq = engine._components["sip0.cube0.pe1.pe_ipcq"]

    # Receive-window bases on rank 1, one per direction.
    rank1_e_rx = rank1_ipcq.queue_pairs["E"]["my_rx_base_pa"]
    rank1_w_rx = rank1_ipcq.queue_pairs["W"]["my_rx_base_pa"]

    qp0_e = rank0_ipcq.queue_pairs["E"]
    qp0_w = rank0_ipcq.queue_pairs["W"]

    # E on rank 0 must land in rank 1's W-rx (opposite), never its E-rx.
    assert qp0_e["peer"].rx_base_pa == rank1_w_rx, (
        f"expected rank 0's E peer.rx_base_pa == rank 1's W-rx ({rank1_w_rx:#x}), "
        f"got {qp0_e['peer'].rx_base_pa:#x} (matches E-rx: {rank1_e_rx:#x}) — "
        f"reverse_direction picked same-label instead of opposite"
    )
    # Symmetric case: W on rank 0 must land in rank 1's E-rx.
    assert qp0_w["peer"].rx_base_pa == rank1_e_rx
def test_reverse_direction_opposite_preference_4rank_ring_sanity():
    """ADR-0025 D1 sanity check on a ring with ws >= 3.

    E and W point at distinct peers here, so opposite-preference gives the
    same answer as the old dict-order first-match. This test is expected
    to pass both before and after the fix.
    """
    engine, topo = _engine()
    algo_cfg = resolve_algorithm_config(
        load_ccl_config(), name="ring_allreduce_tcm"
    )
    algo_cfg["world_size"] = 4
    install_ipcq(engine, topo.spec, algo_cfg)

    rank0_ipcq = engine._components["sip0.cube0.pe0.pe_ipcq"]
    rank1_ipcq = engine._components["sip0.cube0.pe1.pe_ipcq"]
    rank3_ipcq = engine._components["sip0.cube0.pe3.pe_ipcq"]

    # Rank 0's E peer is rank 1, so the target is rank 1's W-rx.
    east_qp = rank0_ipcq.queue_pairs["E"]
    rank1_w_rx = rank1_ipcq.queue_pairs["W"]["my_rx_base_pa"]
    assert east_qp["peer"].rx_base_pa == rank1_w_rx

    # Rank 0's W peer is rank 3 (last in the ring): target is its E-rx.
    west_qp = rank0_ipcq.queue_pairs["W"]
    rank3_e_rx = rank3_ipcq.queue_pairs["E"]["my_rx_base_pa"]
    assert west_qp["peer"].rx_base_pa == rank3_e_rx