Add SchedulerV2 (pe_accel), DPPolicy overrides, and new benchmarks

- Add cycle-accurate PE accelerator scheduler (SchedulerV2) with tiled GEMM/Math pipelines (DMA_IN → GEMM → MATH → DMA_WB)
- Add DPPolicy num_pes/num_cubes/num_sips overrides for single-PE testing
- Support tuple target_pe for targeting specific PE subsets
- Add gemm_single_pe and gpt3_qkv benchmarks
- Switch default topology to pe_scheduler_v2

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 23:18:49 -07:00
parent 63669f82cb
commit 114510d4b9
22 changed files with 1822 additions and 15 deletions
@@ -269,15 +269,25 @@ class RuntimeContext:
        allocators = self._ensure_allocators()
        itemsize = dtype_itemsize(dtype)
        shape_2d = (shape[0], shape[1]) if len(shape) >= 2 else (1, shape[0])
+        # DPPolicy overrides take precedence over topology dimensions
+        eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
+        eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
+        eff_num_sips = dp.num_sips if dp.num_sips is not None else self._num_sips
        placement = resolve_dp_policy(
            dp, shape=shape_2d, itemsize=itemsize,
-            num_pe=self._pes_per_cube, num_cubes=self._num_cubes,
-            num_sips=self._num_sips,
+            num_pe=eff_num_pe, num_cubes=eff_num_cubes,
+            num_sips=eff_num_sips,
        )

-        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index
-        pe_indices = {s.pe_index for s in placement}
-        target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))
+        # Infer target_pe from placement using local (within-cube) PE IDs.
+        # This ensures M_CPU only fans out to PEs that own shards, not all PEs.
+        local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement})
+        if len(local_pe_ids) == 1:
+            target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
+        elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
+            target_pe = "all"
+        else:
+            target_pe = tuple(local_pe_ids)
        t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)

        # Allocate PAs via PEMemAllocator + VA via VirtualAllocator
@@ -407,7 +417,8 @@ class RuntimeContext:
        # Collect tensors and scalars
        tensor_args: list[Tensor] = []
        scalar_args: list = []
-        target_pe: int | str = 0
+        _pe_set: set[int] = set()
+        _pe_all = False

        for a in args:
            if isinstance(a, Tensor):
@@ -415,9 +426,11 @@ class RuntimeContext:
                if a._dp_metadata is not None:
                    dp_target = a._dp_metadata.target_pe
                    if dp_target == "all":
-                        target_pe = "all"
-                    elif isinstance(dp_target, int) and target_pe != "all":
-                        target_pe = dp_target
+                        _pe_all = True
+                    elif isinstance(dp_target, tuple):
+                        _pe_set.update(dp_target)
+                    elif isinstance(dp_target, int):
+                        _pe_set.add(dp_target)
            elif isinstance(a, (int, float)):
                dtype_str = "f32" if isinstance(a, float) else "i32"
                scalar_args.append(ScalarArg(dtype=dtype_str, value=a))
@@ -427,6 +440,16 @@ class RuntimeContext:
                dtype_str = "f32" if isinstance(v, float) else "i32"
                scalar_args.append(ScalarArg(dtype=dtype_str, value=v))

+        # Resolve target_pe from collected PE info
+        if _pe_all:
+            target_pe: int | tuple[int, ...] | str = "all"
+        elif len(_pe_set) == 1:
+            target_pe = next(iter(_pe_set))
+        elif len(_pe_set) > 1:
+            target_pe = tuple(sorted(_pe_set))
+        else:
+            target_pe = 0
+
        # Determine all target SIPs from tensor shards
        sip_set: set[int] = set()
        for t in tensor_args: