From 787409ced12a5e47cd3ca7c46d6ba8e69e47b095 Mon Sep 17 00:00:00 2001 From: Yangwook Kang Date: Tue, 14 Apr 2026 12:46:33 -0700 Subject: [PATCH] ADR-0024 Phase B: update xfail reason with architectural blocker details MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase B Option A (freeze + defer to ADR-0027): the root cause of ring_default_ws strict-xfail is that bench workers call torch.zeros / copy_ which drive env.run in the WORKER-greenlet context. Any pending KernelLaunchMsg gets stepped inside that worker, spawning kernel_runner with parent = worker (not main). When the worker yields/finishes, the kernel greenlet is orphaned and its next switch_to_simpy raises GreenletExit mid-add — producing rank 0 mean=1 (expected 3). This is a larger architectural redesign (lazy-deploy tensor API, coroutine worker, or setup/verify split) and is parked until ADR-0027 (Megatron TP) starts, where the proper solution ships with TP use cases. No production changes; xfail reason + inline comment only. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_ccl_allreduce_matrix.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/test_ccl_allreduce_matrix.py b/tests/test_ccl_allreduce_matrix.py index fb732e2..c6864fb 100644 --- a/tests/test_ccl_allreduce_matrix.py +++ b/tests/test_ccl_allreduce_matrix.py @@ -70,16 +70,27 @@ CASES = [ # Default fallback — no world_size override → ADR-0024 D1 derives # from topology (SIP count = 2). Exercises the new SIP-level TP # launcher + cross-SIP ring. - # XFAIL: Phase A fix (scheduler-level wait) resolves the greenlet- - # re-entry hang, but Phase 2 DataExecutor still reports only 1 math - # op for a 2-rank ring (expected 2) — cross-SIP op_log replay - # integration pending ADR-0024 Phase B. + # XFAIL — architectural blocker (ADR-0024 Phase B, future redesign): + # Bench workers call torch.zeros / copy_ which internally drive + # env.run in the WORKER-greenlet context. Any KernelLaunchMsg already + # pending in the SimPy queue gets stepped inside that worker context, + # which in turn spawns kernel_runner + kernel greenlet with parent = + # worker (not main). When the worker later yields / finishes, the + # kernel greenlet is orphaned; its next switch_to_simpy raises + # GreenletExit mid-add, producing rank 0 mean=1 (expected 3). + # Fix requires redesigning worker semantics so env.run only ever + # drives from main (options: lazy-deploy tensor API, coroutine + # worker, or setup/verify split). Not a single-PR change — parked + # until ADR-0027 (Megatron TP) starts, at which point a proper + # architectural solution lands together with TP use cases. pytest.param( "ring_allreduce_tcm", "kernbench.ccl.algorithms.ring_allreduce", "ring_1d", "tcm", None, 8, 2, id="ring_default_ws", marks=pytest.mark.xfail( - reason="ADR-0024 Phase B: cross-SIP op_log replay integration", + reason="ADR-0024 Phase B: worker-greenlet env.run captures " + "kernel greenlet as child → orphaned on worker yield. " + "Needs architectural redesign (see test comment).", strict=True, ), ),