# kernbench/runtime_api/context.py from __future__ import annotations from dataclasses import dataclass, field from typing import Any from kernbench.common.types import Completion, RequestHandle, SimEngine from .types import DeviceSelector def _world_size_from_spec(spec: dict | None) -> int: """Derive world_size from topology spec: sips × cubes × pes_per_cube.""" spec = spec or {} sips = int(spec.get("system", {}).get("sips", {}).get("count", 1)) cm = spec.get("sip", {}).get("cube_mesh", {}) cubes_per_sip = int(cm.get("w", 1)) * int(cm.get("h", 1)) pl = spec.get("cube", {}).get("pe_layout", {}) corners = pl.get("corners", []) pe_per_corner = int(pl.get("pe_per_corner", 1)) pes_per_cube = pe_per_corner * max(len(corners), 1) return sips * cubes_per_sip * pes_per_cube def _numpy_to_dtype_str(np_dtype) -> str: """Map numpy dtype → kernbench dtype string used by Tensor.""" import numpy as np kind_map = { np.float16: "f16", np.float32: "f32", np.int8: "i8", np.int16: "i16", np.int32: "i32", np.uint8: "u8", np.uint16: "u16", np.uint32: "u32", } for np_type, s in kind_map.items(): if np.dtype(np_dtype) == np.dtype(np_type): return s raise ValueError(f"unsupported numpy dtype: {np_dtype!r}") class _AhbmNamespace: """torch.ahbm — per-greenlet SIP device binding (ADR-0024 D10). Real-PyTorch parity idiom: ``torch.cuda.set_device(rank)``. KernBench's backend is 'ahbm' (not CUDA), so this namespace avoids pretending to be a CUDA runtime. """ def __init__(self) -> None: self._device_by_greenlet: dict = {} def set_device(self, device: int) -> None: from greenlet import getcurrent self._device_by_greenlet[getcurrent()] = int(device) def current_device(self) -> int | None: from greenlet import getcurrent return self._device_by_greenlet.get(getcurrent()) class _AcceleratorNamespace: """torch.accelerator — device-agnostic alias (PyTorch 2.x style). Wraps _AhbmNamespace. Bench code can pick either: torch.ahbm.set_device(rank) # explicit backend torch.accelerator.set_device_index(rank) # portable """ def __init__(self, ahbm: "_AhbmNamespace") -> None: self._ahbm = ahbm def set_device_index(self, device: int) -> None: self._ahbm.set_device(device) def current_device_index(self) -> int | None: return self._ahbm.current_device() @dataclass class RuntimeContext: engine: SimEngine target_device: DeviceSelector correlation_id: str spec: dict | None = None _handles: list[RequestHandle] = field(default_factory=list, init=False) _completed: set[RequestHandle] = field(default_factory=set, init=False) _allocators: dict[int, Any] = field(default_factory=dict, init=False) _va_allocator: Any = field(default=None, init=False) _tensor_counter: int = field(default=0, init=False) _traces: list[dict] = field(default_factory=list, init=False) _tensors: list[Any] = field(default_factory=list, init=False) distributed: Any = field(default=None, init=False) # DistributedContext for CCL benches _ipcq_plan: dict = field(default_factory=dict, init=False) # ADR-0023 install plan def __post_init__(self) -> None: # Eagerly attach a DistributedContext so bench code can do # ``dist = torch.distributed`` + ``dist.init_process_group(...)`` # without needing a separate launcher to install it. from kernbench.runtime_api.distributed import DistributedContext dc = DistributedContext() dc._ctx_ref = self # back-reference for AhbmCCLBackend to reach ctx.launch etc. self.distributed = dc # ADR-0024 D10: torch.ahbm (KernBench-native) + torch.accelerator # (PyTorch 2.x portable) namespaces for per-greenlet device binding. self.ahbm = _AhbmNamespace() self.accelerator = _AcceleratorNamespace(self.ahbm) def install_ipcq( self, algorithm: str | None = None, ccl_yaml: str | None = None, world_size_override: int | None = None, rank_to_pe: list[tuple[int, int, int]] | None = None, ) -> dict: """Install IPCQ neighbor tables on all participating PEs (ADR-0023 D10). Loads ``ccl.yaml`` (or the path provided), resolves the chosen algorithm (or ``defaults.algorithm`` if None), and pushes per-PE IpcqInitMsg into every PE_IPCQ component via the engine. Args: algorithm: name of the algorithm in ccl.yaml (or use defaults). ccl_yaml: optional path to ccl.yaml. world_size_override: if set, replace the algorithm's world_size. Returns the install plan dict (rank → (sip,cube,pe), neighbor table). """ import importlib from kernbench.ccl.install import ( install_ipcq as _install, load_ccl_config, resolve_algorithm_config, ) cfg = load_ccl_config(ccl_yaml) merged = resolve_algorithm_config(cfg, algorithm) if world_size_override is not None: merged["world_size"] = world_size_override elif "world_size" not in merged: # Derive from topology.yaml when neither the algorithm entry # nor ``defaults`` carries ``world_size`` (matches pytorch DDP # where env vars determine ranks, not the ccl config file). merged["world_size"] = _world_size_from_spec(self.spec) algo_module = None try: algo_module = importlib.import_module(merged["module"]) except ModuleNotFoundError: pass plan = _install( self.engine, self.spec, merged, algo_module=algo_module, rank_to_pe=rank_to_pe, ) self._ipcq_plan = plan self._ipcq_config = merged return plan def __enter__(self): return self def __exit__(self, *exc): self.cleanup() return False def submit(self, request: Any) -> RequestHandle: submit_fn = getattr(self.engine, "submit", None) if submit_fn is None: raise AttributeError("Engine does not implement submit(request) -> RequestHandle.") handle: RequestHandle = submit_fn(request) # type: ignore[call-arg] self._handles.append(handle) return handle def is_completed(self, handle: RequestHandle) -> bool: return handle in self._completed def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion: if handle in self._completed: completion, trace = self.engine.get_completion(handle) return completion wait_fn = getattr(self.engine, "wait", None) if wait_fn is not None: wait_fn(handle) # type: ignore[misc] completion, trace = self.engine.get_completion(handle) self._completed.add(handle) if _meta is not None and trace is not None: entry = dict(trace) if isinstance(trace, dict) else {"raw": trace} entry.update(_meta) self._traces.append(entry) return completion def wait_all(self) -> None: for h in self._handles: if h not in self._completed: self.wait(h) def handles(self) -> list[RequestHandle]: return list(self._handles) # ── Tensor lifecycle ───────────────────────────────────────────── def _free_tensor(self, tensor: Any) -> None: """Free a single tensor: unmap MMU, return VA and PA.""" handle = tensor._handle if handle is None: return tensor._handle = None if not handle.va_base: return from kernbench.runtime_api.kernel import MmuUnmapMsg dp_policy = None if tensor._dp_metadata is not None: dp_policy = tensor._dp_metadata.dp_policy is_cube_replicate = ( dp_policy is not None and dp_policy.cube == "replicate" ) # Send MmuUnmapMsg through fabric from collections import defaultdict if is_cube_replicate: cube_groups: dict[tuple[int, int], list] = defaultdict(list) for shard in handle.shards: cube_groups[(shard.sip, shard.cube)].append(shard) for (sip, cube), group_shards in cube_groups.items(): entries = tuple( {"va": handle.va_base + s.offset_bytes, "size": s.nbytes} for s in group_shards ) msg = MmuUnmapMsg( correlation_id=self.correlation_id, request_id=f"unmap_{tensor.name}_s{sip}c{cube}", entries=entries, target_sips=(sip,), target_cubes=(cube,), target_pe="all", ) h = self.submit(msg) self.wait(h) else: entries = tuple( {"va": handle.va_base + s.offset_bytes, "size": s.nbytes} for s in handle.shards ) sip_set = sorted({s.sip for s in handle.shards}) cube_set = sorted({s.cube for s in handle.shards}) msg = MmuUnmapMsg( correlation_id=self.correlation_id, request_id=f"unmap_{tensor.name}", entries=entries, target_sips=tuple(sip_set), target_cubes=tuple(cube_set), target_pe="all", ) h = self.submit(msg) self.wait(h) # Return VA space if self._va_allocator is not None: self._va_allocator.free(handle.va_base, handle.nbytes) # Return PA space if self._allocators: for shard in handle.shards: flat_idx = ( shard.sip * self._num_cubes * self._pes_per_cube + shard.cube * self._pes_per_cube + shard.pe ) alloc = self._allocators.get(flat_idx) if alloc is not None: from kernbench.policy.address.phyaddr import PhysAddr alloc.free_hbm(PhysAddr.decode(shard.pa), shard.nbytes) def cleanup(self) -> None: """Free all tensors created by this context.""" for ref in self._tensors: t = ref() if t is not None and t._handle is not None: self._free_tensor(t) self._tensors.clear() # ── PyTorch-like tensor API ────────────────────────────────────── def _ensure_allocators(self) -> dict: """Lazily create PEMemAllocator instances from spec.""" if self._allocators: return self._allocators if self.spec is None: raise RuntimeError( "RuntimeContext.spec is required for tensor operations. " "Pass spec=graph.spec when creating RuntimeContext." ) from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator system = self.spec.get("system", {}) cube = self.spec.get("cube", {}) mm = cube.get("memory_map", {}) pe_template = cube.get("pe_template", {}) pe_comps = pe_template.get("components", {}) tcm_cfg = pe_comps.get("pe_tcm", {}).get("attrs", {}) total_sip_count = system.get("sips", {}).get("count", 1) cubes_per_sip = system.get("sips", {}).get("cubes_per_sip", 16) pes_per_cube = ( cube.get("pe_layout", {}).get("pe_per_corner", 2) * len(cube.get("pe_layout", {}).get("corners", ["NW", "NE", "SW", "SE"])) ) hbm_gb = mm.get("hbm_total_gb_per_cube", 48) hbm_slices = mm.get("hbm_slices_per_cube", 8) tcm_mb = tcm_cfg.get("size_mb", 16) # Scope to target_device: single SIP or all SIPs from kernbench.runtime_api.types import DeviceSelector, resolve_device td = self.target_device if isinstance(self.target_device, DeviceSelector) else resolve_device(str(self.target_device)) if td.is_all: sip_range = range(total_sip_count) sip_count = total_sip_count else: sip_idx = td.sip_index sip_range = range(sip_idx, sip_idx + 1) sip_count = 1 cfg = AddressConfig( sip_count=sip_count, cubes_per_sip=cubes_per_sip, pes_per_cube=pes_per_cube, hbm_bytes_per_cube=hbm_gb * (1 << 30), hbm_slices_per_cube=hbm_slices, tcm_bytes_per_pe=tcm_mb * (1 << 20), tcm_scheduler_reserved_bytes=4 * (1 << 20), sram_bytes_per_cube=32 * (1 << 20), ) # Create allocators scoped to target SIP(s) only # Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id self._pes_per_cube = pes_per_cube self._num_cubes = cubes_per_sip self._num_sips = sip_count cubes_x_pes = cubes_per_sip * pes_per_cube for sip_id in sip_range: for cube_id in range(cubes_per_sip): for pe_id in range(pes_per_cube): flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id self._allocators[flat_idx] = PEMemAllocator( rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg, ) # Initialize VA allocator (MMU mappings are installed via fabric MmuMapMsg) from kernbench.policy.address.va_allocator import VirtualAllocator pe_mmu_attrs = pe_comps.get("pe_mmu", {}).get("attrs", {}) page_size = int(pe_mmu_attrs.get("page_size", 4096)) self._va_allocator = VirtualAllocator( va_base=0x1_0000_0000, va_size=64 * (1 << 30), # 64 GB VA space page_size=page_size, ) return self._allocators def _next_tensor_name(self) -> str: self._tensor_counter += 1 return f"t{self._tensor_counter}" def zeros( self, shape: tuple[int, ...], dtype: str = "f16", *, dp: Any = None, name: str | None = None, ): """Create a tensor and deploy to HBM with zero-fill (like torch.zeros).""" return self._create_tensor(shape, dtype, name, pattern="zero", dp=dp) def empty( self, shape: tuple[int, ...], dtype: str = "f16", *, dp: Any = None, name: str | None = None, ): """Allocate a tensor in HBM without initialization (like torch.empty).""" return self._create_tensor(shape, dtype, name, pattern=None, dp=dp) def from_numpy(self, arr: Any): """Create a host-side tensor wrapping a numpy array. Mirrors ``torch.from_numpy``. The returned tensor is NOT deployed to any PE — it lives in an in-memory host staging buffer. Use ``target.copy_(host_tensor)`` to scatter its contents into a sharded, deployed tensor. """ import numpy as np from kernbench.runtime_api.tensor import Tensor arr_c = np.ascontiguousarray(arr) dtype_str = _numpy_to_dtype_str(arr_c.dtype) t = Tensor(shape=tuple(arr_c.shape), dtype=dtype_str, name="host") t._host_buffer = arr_c t._memory_store = getattr(self.engine, "_memory_store", None) return t def _create_tensor( self, shape: tuple[int, ...], dtype: str, name: str | None, pattern: str | None, dp: Any = None, ): from kernbench.policy.placement.dp import DPPolicy, resolve_dp_policy from kernbench.runtime_api.kernel import MemoryWriteMsg from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize if not isinstance(dp, DPPolicy): raise ValueError("dp=DPPolicy(...) is required for tensor creation") tensor_name = name or self._next_tensor_name() t = Tensor(shape=shape, dtype=dtype, name=tensor_name) dp_policy = dp allocators = self._ensure_allocators() itemsize = dtype_itemsize(dtype) shape_2d = (shape[0], shape[1]) if len(shape) >= 2 else (1, shape[0]) # DPPolicy overrides take precedence over topology dimensions eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes # ADR-0024 D11: if torch.ahbm.set_device(r) is active AND DPPolicy # leaves the SIP dimension at its default (replicate + no num_sips # override), scope the tensor to SIP r only. # NOTE: this path uses post-hoc pe_index shifting as a temporary # measure; ADR-0026 replaces it with structural (sip, cube, pe) # coords in ShardSpec. current_sip = ( self.ahbm.current_device() if hasattr(self, "ahbm") else None ) scope_to_current_sip = ( current_sip is not None and dp.sip == "replicate" and dp.num_sips is None ) if scope_to_current_sip: eff_num_sips = 1 else: eff_num_sips = ( dp.num_sips if dp.num_sips is not None else self._num_sips ) placement = resolve_dp_policy( dp, shape=shape_2d, itemsize=itemsize, num_pe=eff_num_pe, num_cubes=eff_num_cubes, num_sips=eff_num_sips, ) if scope_to_current_sip: from kernbench.policy.placement.dp import ShardSpec as _SS sip_stride = self._num_cubes * self._pes_per_cube offset = int(current_sip) * sip_stride placement = [ _SS(pe_index=s.pe_index + offset, offset_bytes=s.offset_bytes, nbytes=s.nbytes) for s in placement ] # Infer target_pe from placement using local (within-cube) PE IDs. # This ensures M_CPU only fans out to PEs that own shards, not all PEs. local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement}) if len(local_pe_ids) == 1: target_pe: int | tuple[int, ...] | str = local_pe_ids[0] elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube: target_pe = "all" else: target_pe = tuple(local_pe_ids) t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy) # Allocate PAs via PEMemAllocator + VA via VirtualAllocator allocators = self._ensure_allocators() handle = deploy_tensor( name=tensor_name, shape=shape, dtype=dtype, placement=placement, allocators=allocators, va_allocator=self._va_allocator, ) t._handle = handle import weakref t._ctx_ref = weakref.ref(self) t._memory_store = getattr(self.engine, "_memory_store", None) self._tensors.append(weakref.ref(t)) # Install VA→PA mappings via fabric MmuMapMsg # Strategy: always SIP-scoped (each SIP gets only its own shards). # Within each SIP: cube="replicate" → per-cube, else broadcast within SIP. if handle.va_base: from collections import defaultdict from kernbench.runtime_api.kernel import MmuMapMsg is_cube_replicate = ( dp_policy is not None and dp_policy.cube == "replicate" ) # Group shards by SIP sip_groups: dict[int, list] = defaultdict(list) for shard in handle.shards: sip_groups[shard.sip].append(shard) for sip, sip_shards in sip_groups.items(): if is_cube_replicate: # Cube replicate: per-(sip, cube) local mapping cube_groups: dict[int, list] = defaultdict(list) for s in sip_shards: cube_groups[s.cube].append(s) for cube, group_shards in cube_groups.items(): entries = tuple( {"va": handle.va_base + s.offset_bytes, "pa": s.pa, "size": s.nbytes} for s in group_shards ) msg = MmuMapMsg( correlation_id=self.correlation_id, request_id=f"mmu_{tensor_name}_s{sip}c{cube}", entries=entries, target_sips=(sip,), target_cubes=(cube,), target_pe="all", ) h = self.submit(msg) self.wait(h) else: # Cube sharded: broadcast all cubes within this SIP entries = tuple( {"va": handle.va_base + s.offset_bytes, "pa": s.pa, "size": s.nbytes} for s in sip_shards ) cube_set = sorted({s.cube for s in sip_shards}) msg = MmuMapMsg( correlation_id=self.correlation_id, request_id=f"mmu_{tensor_name}_s{sip}", entries=entries, target_sips=(sip,), target_cubes=tuple(cube_set), target_pe="all", ) h = self.submit(msg) self.wait(h) # Submit MemoryWriteMsg per shard (deploy data to device) if pattern is not None: for shard in handle.shards: h = self.submit(MemoryWriteMsg( correlation_id=self.correlation_id, request_id=f"deploy_{tensor_name}_pe{shard.pe}", dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe, dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern, target_cubes=(shard.cube,), target_pe=shard.pe, )) self.wait(h, _meta={ "phase": "memory_write", "name": tensor_name, "sip": shard.sip, "cube": shard.cube, "pe": shard.pe, "nbytes": shard.nbytes, }) return t def launch( self, kernel_name: str, kernel_fn: Any, *args: Any, _defer_wait: bool = False, **kwargs: Any, ) -> RequestHandle: """Register and launch a kernel (like a fused torch op). Positional args: Tensor objects become TensorArg, int/float become ScalarArg. Keyword args: become ScalarArg (name is discarded, order preserved). Creates per-SIP KernelLaunchMsg with local va_base per tensor (like host driver sending per-rank launch commands). When ``_defer_wait=True`` (ADR-0024 D7), returns the list of ``(handle, sip_id, meta)`` tuples instead of waiting. Caller is responsible for waiting — used by collective ops to yield between submit and wait so all sibling ranks can submit first. """ from collections import defaultdict from kernbench.runtime_api.kernel import ( KernelLaunchMsg, KernelRef, ScalarArg, TensorArg, TensorArgShard, ) from kernbench.runtime_api.tensor import Tensor from kernbench.triton_emu.registry import _kernels, register_kernel # Register kernel (idempotent overwrite — last call wins). # Tests can re-register the same kernel_name with a different # function; the user's most recent launch must use the latest fn. _kernels[kernel_name] = kernel_fn # Collect tensors and scalars tensor_args: list[Tensor] = [] scalar_args: list = [] _pe_set: set[int] = set() _pe_all = False for a in args: if isinstance(a, Tensor): tensor_args.append(a) if a._dp_metadata is not None: dp_target = a._dp_metadata.target_pe if dp_target == "all": _pe_all = True elif isinstance(dp_target, tuple): _pe_set.update(dp_target) elif isinstance(dp_target, int): _pe_set.add(dp_target) elif isinstance(a, (int, float)): dtype_str = "f32" if isinstance(a, float) else "i32" scalar_args.append(ScalarArg(dtype=dtype_str, value=a)) for v in kwargs.values(): if isinstance(v, (int, float)): dtype_str = "f32" if isinstance(v, float) else "i32" scalar_args.append(ScalarArg(dtype=dtype_str, value=v)) # Resolve target_pe from collected PE info if _pe_all: target_pe: int | tuple[int, ...] | str = "all" elif len(_pe_set) == 1: target_pe = next(iter(_pe_set)) elif len(_pe_set) > 1: target_pe = tuple(sorted(_pe_set)) else: target_pe = 0 # Determine all target SIPs from tensor shards sip_set: set[int] = set() for t in tensor_args: if t._handle is not None: for s in t._handle.shards: sip_set.add(s.sip) if not sip_set: sip_set = {0} # Build global→local dimension mapping from tensor DPPolicies. # Scalar args matching a tensor's global dimension get replaced # with the cube-local value (what the kernel actually operates on). def _compute_local_shape(t: Tensor) -> tuple[int, ...]: """Compute cube-local shape from DPPolicy.""" shape = t.shape if len(shape) < 2: shape = (1, shape[0]) M, K = shape[0], shape[1] dp = t._dp_metadata.dp_policy if t._dp_metadata else None if dp is None: return t.shape if dp.sip != "replicate": if dp.sip == "column_wise": K = K // self._num_sips elif dp.sip == "row_wise": M = M // self._num_sips if dp.cube != "replicate": if dp.cube == "column_wise": K = K // self._num_cubes elif dp.cube == "row_wise": M = M // self._num_cubes if len(t.shape) < 2: return (K,) return (M, K) dim_map: dict[int, int] = {} # global_dim → local_dim for t in tensor_args: local = _compute_local_shape(t) for g, l in zip(t.shape if len(t.shape) >= 2 else (1, t.shape[0]), local if len(local) >= 2 else (1, local[0])): if g != l: dim_map[g] = l # Per-SIP kernel launch: each SIP gets TensorArgs with local va_base last_handle = None _pending_handles: list[tuple[Any, int]] = [] for sip_id in sorted(sip_set): sip_kernel_args: list = [] sip_cube_set: set[int] = set() for t in tensor_args: if t._handle is None: continue sip_shards = [s for s in t._handle.shards if s.sip == sip_id] if not sip_shards: sip_shards = list(t._handle.shards) local_va_base = 0 if t._handle.va_base: min_offset = min(s.offset_bytes for s in sip_shards) local_va_base = t._handle.va_base + min_offset sip_kernel_args.append(TensorArg( shards=tuple( TensorArgShard( sip=s.sip, cube=s.cube, pe=s.pe, pa=s.pa, nbytes=s.nbytes, offset_bytes=s.offset_bytes, ) for s in sip_shards ), va_base=local_va_base, )) for s in sip_shards: sip_cube_set.add(s.cube) # Interleave tensor args and scalar args, replacing global dims with local final_args: list = [] t_idx, s_idx = 0, 0 for a in args: if isinstance(a, Tensor): final_args.append(sip_kernel_args[t_idx]) t_idx += 1 elif isinstance(a, (int, float)): sa = scalar_args[s_idx] if isinstance(a, int) and a in dim_map: sa = ScalarArg(dtype=sa.dtype, value=dim_map[a]) final_args.append(sa) s_idx += 1 while s_idx < len(scalar_args): sa = scalar_args[s_idx] if isinstance(sa.value, int) and int(sa.value) in dim_map: sa = ScalarArg(dtype=sa.dtype, value=dim_map[int(sa.value)]) final_args.append(sa) s_idx += 1 target_cubes = tuple(sorted(sip_cube_set)) if sip_cube_set else (0,) h = self.submit(KernelLaunchMsg( correlation_id=self.correlation_id, request_id=f"{kernel_name}_sip{sip_id}", kernel_ref=KernelRef(name=kernel_name, kind="builtin"), args=tuple(final_args), target_cubes=target_cubes, target_pe=target_pe, )) # Defer wait until all SIPs are submitted (multi-SIP CCL needs # all participating PEs to be live concurrently — waiting # per-SIP would deadlock when ranks span SIP boundaries). _pending_handles.append((h, sip_id)) last_handle = h if _defer_wait: # ADR-0024 D7: return the pending-list so the caller can yield # between submit and drain. Used by collective ops that need # all sibling ranks to submit before any rank waits. return [ (h, sip_id, { "phase": "kernel", "name": kernel_name, "sip": sip_id, "target_pe": target_pe, }) for h, sip_id in _pending_handles ] # Drain pending handles now that every SIP has a launch posted. for h, sip_id in _pending_handles: self.wait(h, _meta={ "phase": "kernel", "name": kernel_name, "sip": sip_id, "target_pe": target_pe, }) return last_handle