Add SchedulerV2 (pe_accel), DPPolicy overrides, and new benchmarks

- Add cycle-accurate PE accelerator scheduler (SchedulerV2) with tiled
  GEMM/Math pipelines (DMA_IN → GEMM → MATH → DMA_WB)
- Add DPPolicy num_pes/num_cubes/num_sips overrides for single-PE testing
- Support tuple target_pe for targeting specific PE subsets
- Add gemm_single_pe and gpt3_qkv benchmarks
- Switch default topology to pe_scheduler_v2

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:18:49 -07:00
parent 63669f82cb
commit 114510d4b9
22 changed files with 1822 additions and 15 deletions
+32 -9
View File
@@ -269,15 +269,25 @@ class RuntimeContext:
allocators = self._ensure_allocators()
itemsize = dtype_itemsize(dtype)
shape_2d = (shape[0], shape[1]) if len(shape) >= 2 else (1, shape[0])
# DPPolicy overrides take precedence over topology dimensions
eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
eff_num_sips = dp.num_sips if dp.num_sips is not None else self._num_sips
placement = resolve_dp_policy(
dp, shape=shape_2d, itemsize=itemsize,
num_pe=self._pes_per_cube, num_cubes=self._num_cubes,
num_sips=self._num_sips,
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
num_sips=eff_num_sips,
)
# Infer target_pe from placement: multi-PE → "all", single PE → pe_index
pe_indices = {s.pe_index for s in placement}
target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))
# Infer target_pe from placement using local (within-cube) PE IDs.
# This ensures M_CPU only fans out to PEs that own shards, not all PEs.
local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement})
if len(local_pe_ids) == 1:
target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
target_pe = "all"
else:
target_pe = tuple(local_pe_ids)
t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)
# Allocate PAs via PEMemAllocator + VA via VirtualAllocator
@@ -407,7 +417,8 @@ class RuntimeContext:
# Collect tensors and scalars
tensor_args: list[Tensor] = []
scalar_args: list = []
target_pe: int | str = 0
_pe_set: set[int] = set()
_pe_all = False
for a in args:
if isinstance(a, Tensor):
@@ -415,9 +426,11 @@ class RuntimeContext:
if a._dp_metadata is not None:
dp_target = a._dp_metadata.target_pe
if dp_target == "all":
target_pe = "all"
elif isinstance(dp_target, int) and target_pe != "all":
target_pe = dp_target
_pe_all = True
elif isinstance(dp_target, tuple):
_pe_set.update(dp_target)
elif isinstance(dp_target, int):
_pe_set.add(dp_target)
elif isinstance(a, (int, float)):
dtype_str = "f32" if isinstance(a, float) else "i32"
scalar_args.append(ScalarArg(dtype=dtype_str, value=a))
@@ -427,6 +440,16 @@ class RuntimeContext:
dtype_str = "f32" if isinstance(v, float) else "i32"
scalar_args.append(ScalarArg(dtype=dtype_str, value=v))
# Resolve target_pe from collected PE info
if _pe_all:
target_pe: int | tuple[int, ...] | str = "all"
elif len(_pe_set) == 1:
target_pe = next(iter(_pe_set))
elif len(_pe_set) > 1:
target_pe = tuple(sorted(_pe_set))
else:
target_pe = 0
# Determine all target SIPs from tensor shards
sip_set: set[int] = set()
for t in tensor_args: