Add Tensor indexing + hierarchical 3-level all-reduce kernel
Tensor.__setitem__ / __getitem__:
- Shard-aligned slice assignment and read on deployed tensors.
- Scalar broadcast and numpy array assignment supported.
- Cross-shard slices raise NotImplementedError (use copy_ for that).
- 3 new tests: single-PE, multi-PE, cross-shard error case.

Hierarchical all-reduce kernel (src/kernbench/ccl/algorithms/):
- 3-level reduce: intra-cube (E/W) → inter-cube (N/S) → inter-SIP (parent).
- Bidirectional ring reduce at each level: ceil((N-1)/2) rounds. Left half sends via dir_dec, right half via dir_inc (wrap). Representative receives from both sides.
- Chain broadcast for reverse path: cube 0 PE 0 → all PE 0s → all PEs.
- Registered in ccl.yaml as "hierarchical_allreduce" with topology: none (neighbors() override builds the full 3-level neighbor map).
- kernel_args derives pes_per_cube/cubes_per_sip/num_sips from world_size.
- Mock-verified at 8/16/32/64/128 ranks.

Mock runtime fixes:
- Direction pairing: explicit N↔S, E↔W, parent↔parent instead of "first matching reverse". Fixes 2-element rings where N and S both point to the same peer.
- Deadlock detection: send-counter based (not just queue-depth-total) to catch chain reductions where send+recv pairs net to zero.
- Multi-cube program_id: pes_per_cube parameter enables program_id(axis=0) = PE within cube, program_id(axis=1) = cube id. Legacy single-cube tests unaffected (default = world_size).

504 tests pass in 12s.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -46,9 +46,13 @@ class _MockRankState:
|
||||
world_size: int,
|
||||
neighbors: dict[str, int],
|
||||
input_arr: np.ndarray,
|
||||
pes_per_cube: int = 0,
|
||||
) -> None:
|
||||
self.rank = rank
|
||||
self.world_size = world_size
|
||||
# PEs per cube for program_id(axis=0/1). If 0 or world_size,
|
||||
# all ranks are in one cube (legacy single-cube behavior).
|
||||
self.pes_per_cube = pes_per_cube if pes_per_cube > 0 else world_size
|
||||
self.neighbors = neighbors # direction → peer rank
|
||||
# HBM "memory": addr → ndarray. Per-rank, no cross-rank sharing.
|
||||
self._hbm: dict[int, np.ndarray] = {}
|
||||
@@ -99,10 +103,19 @@ class _MockTL:
|
||||
|
||||
# axis-aware
|
||||
def program_id(self, axis: int = 0) -> int:
|
||||
return self._state.rank if axis == 0 else 0
|
||||
# Multi-cube: axis=0 = PE within cube, axis=1 = global cube id.
|
||||
# Falls back to flat (all ranks in one cube) if pes_per_cube
|
||||
# is not set (legacy single-cube tests).
|
||||
ppc = self._state.pes_per_cube
|
||||
if axis == 1:
|
||||
return self._state.rank // ppc
|
||||
return self._state.rank % ppc
|
||||
|
||||
def num_programs(self, axis: int = 0) -> int:
|
||||
return self._state.world_size if axis == 0 else 1
|
||||
ppc = self._state.pes_per_cube
|
||||
if axis == 1:
|
||||
return self._state.world_size // ppc
|
||||
return ppc
|
||||
|
||||
# ── arithmetic ops (called by TensorHandle.__add__ etc.) ──
|
||||
|
||||
@@ -272,18 +285,27 @@ class _MockTL:
|
||||
if data is None:
|
||||
raise RuntimeError("mock tl.send: src is None")
|
||||
peer_rank = self._state.neighbors[dir]
|
||||
# Find the reverse direction in peer's neighbors that points back to me
|
||||
# Find the reverse direction at the peer, mirroring real IPCQ
|
||||
# install pairing: N↔S, E↔W, parent↔parent, child_left↔child_left, etc.
|
||||
_REVERSE = {"N": "S", "S": "N", "E": "W", "W": "E",
|
||||
"parent": "parent", "child_left": "child_left",
|
||||
"child_right": "child_right"}
|
||||
peer_state = self._scheduler.states[peer_rank]
|
||||
reverse_dir = None
|
||||
for d, target in peer_state.neighbors.items():
|
||||
if target == self._state.rank:
|
||||
reverse_dir = d
|
||||
break
|
||||
reverse_dir = _REVERSE.get(dir)
|
||||
# Fall back to "first direction pointing at me" if the explicit
|
||||
# reverse doesn't exist at the peer (e.g. custom directions).
|
||||
if reverse_dir is None or reverse_dir not in peer_state.neighbors:
|
||||
reverse_dir = None
|
||||
for d, target in peer_state.neighbors.items():
|
||||
if target == self._state.rank:
|
||||
reverse_dir = d
|
||||
break
|
||||
if reverse_dir is None:
|
||||
raise RuntimeError(
|
||||
f"mock tl.send: peer rank {peer_rank} has no reverse direction"
|
||||
)
|
||||
peer_state.recv_q[reverse_dir].append(data.copy())
|
||||
self._scheduler._send_counter += 1
|
||||
# After delivering, hand control back to scheduler so the receiver
|
||||
# can wake up.
|
||||
self._scheduler.yield_()
|
||||
@@ -388,33 +410,34 @@ class _MockScheduler:
|
||||
state.g = _spawn(state.rank)
|
||||
|
||||
# Drive each rank round-robin until all dead. Detect global deadlock.
|
||||
max_rounds = 10_000
|
||||
round_no = 0
|
||||
# A global send counter tracks whether any greenlet delivered data
|
||||
# in the current round. This is more reliable than queue-depth
|
||||
# tracking because a recv+send pair in the same round nets to zero
|
||||
# depth change yet still represents real progress.
|
||||
self._send_counter = 0
|
||||
max_idle_rounds = 10_000
|
||||
idle_rounds = 0
|
||||
while True:
|
||||
alive = [s for s in self.states if s.g is not None and not s.g.dead]
|
||||
if not alive:
|
||||
break
|
||||
progressed = False
|
||||
counter_before = self._send_counter
|
||||
for s in self.states:
|
||||
if s.g is None or s.g.dead:
|
||||
continue
|
||||
# Multi-rank greenlets share TLContext active state via the
|
||||
# module-level thread-local; restore this rank's tl before
|
||||
# resuming so TensorHandle operator overloads dispatch to
|
||||
# the right _MockTL.
|
||||
TLContext._set_active(tls[s.rank]) # type: ignore[attr-defined]
|
||||
s.g.switch()
|
||||
if s.g.dead:
|
||||
progressed = True
|
||||
TLContext._set_active(None) # type: ignore[attr-defined]
|
||||
# Loose progress check: if no greenlet died and queues didn't grow,
|
||||
# advance round counter; abort after too many idle rounds.
|
||||
round_no += 1
|
||||
if round_no > max_rounds and not progressed:
|
||||
raise RuntimeError(
|
||||
"mock CCL runtime: deadlock detected (no progress for "
|
||||
f"{max_rounds} rounds)"
|
||||
)
|
||||
any_died = any(s.g is not None and s.g.dead for s in self.states)
|
||||
if self._send_counter > counter_before or any_died:
|
||||
idle_rounds = 0
|
||||
else:
|
||||
idle_rounds += 1
|
||||
if idle_rounds >= max_idle_rounds:
|
||||
raise RuntimeError(
|
||||
"mock CCL runtime: deadlock detected (no progress for "
|
||||
f"{max_idle_rounds} rounds)"
|
||||
)
|
||||
|
||||
return [
|
||||
s.output if s.output is not None else s._hbm.get(s._slice_addr)
|
||||
@@ -432,6 +455,7 @@ def run_kernel_in_mock(
|
||||
inputs: list[np.ndarray],
|
||||
kernel_args: tuple = (),
|
||||
algo_module: Any | None = None,
|
||||
pes_per_cube: int = 0,
|
||||
) -> list[np.ndarray]:
|
||||
"""Run a CCL kernel under the mock runtime with no SimPy/fabric.
|
||||
|
||||
@@ -443,6 +467,8 @@ def run_kernel_in_mock(
|
||||
local tile at HBM address 0.
|
||||
kernel_args: extra positional args after t_ptr
|
||||
algo_module: optional module providing ``neighbors()`` override
|
||||
pes_per_cube: PEs per cube for multi-cube program_id mapping.
|
||||
0 → single-cube legacy (all ranks in one cube).
|
||||
|
||||
Returns:
|
||||
Per-rank output ndarrays — whatever the kernel wrote via tl.store
|
||||
@@ -457,6 +483,7 @@ def run_kernel_in_mock(
|
||||
rank=r, world_size=world_size,
|
||||
neighbors=topo_fn(r, world_size),
|
||||
input_arr=inputs[r],
|
||||
pes_per_cube=pes_per_cube,
|
||||
)
|
||||
for r in range(world_size)
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user