diff --git a/tests/test_ccl_allreduce_matrix.py b/tests/test_ccl_allreduce_matrix.py index fb732e2..c6864fb 100644 --- a/tests/test_ccl_allreduce_matrix.py +++ b/tests/test_ccl_allreduce_matrix.py @@ -70,16 +70,27 @@ CASES = [ # Default fallback — no world_size override → ADR-0024 D1 derives # from topology (SIP count = 2). Exercises the new SIP-level TP # launcher + cross-SIP ring. - # XFAIL: Phase A fix (scheduler-level wait) resolves the greenlet- - # re-entry hang, but Phase 2 DataExecutor still reports only 1 math - # op for a 2-rank ring (expected 2) — cross-SIP op_log replay - # integration pending ADR-0024 Phase B. + # XFAIL — architectural blocker (ADR-0024 Phase B, future redesign): + # Bench workers call torch.zeros / copy_ which internally drive + # env.run in the WORKER-greenlet context. Any KernelLaunchMsg already + # pending in the SimPy queue gets stepped inside that worker context, + # which in turn spawns kernel_runner + kernel greenlet with parent = + # worker (not main). When the worker later yields / finishes, the + # kernel greenlet is orphaned; its next switch_to_simpy raises + # GreenletExit mid-add, producing rank 0 mean=1 (expected 3). + # Fix requires redesigning worker semantics so env.run only ever + # drives from main (options: lazy-deploy tensor API, coroutine + # worker, or setup/verify split). Not a single-PR change — parked + # until ADR-0027 (Megatron TP) starts, at which point a proper + # architectural solution lands together with TP use cases. pytest.param( "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce", "ring_1d", "tcm", None, 8, 2, id="ring_default_ws", marks=pytest.mark.xfail( - reason="ADR-0024 Phase B: cross-SIP op_log replay integration", + reason="ADR-0024 Phase B: worker-greenlet env.run captures " + "kernel greenlet as child → orphaned on worker yield. " + "Needs architectural redesign (see test comment).", strict=True, ), ),