Add SchedulerV2 (pe_accel), DPPolicy overrides, and new benchmarks
- Add cycle-accurate PE accelerator scheduler (SchedulerV2) with tiled GEMM/Math pipelines (DMA_IN → GEMM → MATH → DMA_WB)
- Add DPPolicy num_pes/num_cubes/num_sips overrides for single-PE testing
- Support tuple target_pe for targeting specific PE subsets
- Add gemm_single_pe and gpt3_qkv benchmarks
- Switch default topology to pe_scheduler_v2

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -269,15 +269,25 @@ class RuntimeContext:
|
||||
allocators = self._ensure_allocators()
|
||||
itemsize = dtype_itemsize(dtype)
|
||||
shape_2d = (shape[0], shape[1]) if len(shape) >= 2 else (1, shape[0])
|
||||
# DPPolicy overrides take precedence over topology dimensions
|
||||
eff_num_pe = dp.num_pes if dp.num_pes is not None else self._pes_per_cube
|
||||
eff_num_cubes = dp.num_cubes if dp.num_cubes is not None else self._num_cubes
|
||||
eff_num_sips = dp.num_sips if dp.num_sips is not None else self._num_sips
|
||||
placement = resolve_dp_policy(
|
||||
dp, shape=shape_2d, itemsize=itemsize,
|
||||
num_pe=self._pes_per_cube, num_cubes=self._num_cubes,
|
||||
num_sips=self._num_sips,
|
||||
num_pe=eff_num_pe, num_cubes=eff_num_cubes,
|
||||
num_sips=eff_num_sips,
|
||||
)
|
||||
|
||||
# Infer target_pe from placement: multi-PE → "all", single PE → pe_index
|
||||
pe_indices = {s.pe_index for s in placement}
|
||||
target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))
|
||||
# Infer target_pe from placement using local (within-cube) PE IDs.
|
||||
# This ensures M_CPU only fans out to PEs that own shards, not all PEs.
|
||||
local_pe_ids = sorted({s.pe_index % eff_num_pe for s in placement})
|
||||
if len(local_pe_ids) == 1:
|
||||
target_pe: int | tuple[int, ...] | str = local_pe_ids[0]
|
||||
elif len(local_pe_ids) == eff_num_pe and eff_num_pe == self._pes_per_cube:
|
||||
target_pe = "all"
|
||||
else:
|
||||
target_pe = tuple(local_pe_ids)
|
||||
t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)
|
||||
|
||||
# Allocate PAs via PEMemAllocator + VA via VirtualAllocator
|
||||
@@ -407,7 +417,8 @@ class RuntimeContext:
|
||||
# Collect tensors and scalars
|
||||
tensor_args: list[Tensor] = []
|
||||
scalar_args: list = []
|
||||
target_pe: int | str = 0
|
||||
_pe_set: set[int] = set()
|
||||
_pe_all = False
|
||||
|
||||
for a in args:
|
||||
if isinstance(a, Tensor):
|
||||
@@ -415,9 +426,11 @@ class RuntimeContext:
|
||||
if a._dp_metadata is not None:
|
||||
dp_target = a._dp_metadata.target_pe
|
||||
if dp_target == "all":
|
||||
target_pe = "all"
|
||||
elif isinstance(dp_target, int) and target_pe != "all":
|
||||
target_pe = dp_target
|
||||
_pe_all = True
|
||||
elif isinstance(dp_target, tuple):
|
||||
_pe_set.update(dp_target)
|
||||
elif isinstance(dp_target, int):
|
||||
_pe_set.add(dp_target)
|
||||
elif isinstance(a, (int, float)):
|
||||
dtype_str = "f32" if isinstance(a, float) else "i32"
|
||||
scalar_args.append(ScalarArg(dtype=dtype_str, value=a))
|
||||
@@ -427,6 +440,16 @@ class RuntimeContext:
|
||||
dtype_str = "f32" if isinstance(v, float) else "i32"
|
||||
scalar_args.append(ScalarArg(dtype=dtype_str, value=v))
|
||||
|
||||
# Resolve target_pe from collected PE info
|
||||
if _pe_all:
|
||||
target_pe: int | tuple[int, ...] | str = "all"
|
||||
elif len(_pe_set) == 1:
|
||||
target_pe = next(iter(_pe_set))
|
||||
elif len(_pe_set) > 1:
|
||||
target_pe = tuple(sorted(_pe_set))
|
||||
else:
|
||||
target_pe = 0
|
||||
|
||||
# Determine all target SIPs from tensor shards
|
||||
sip_set: set[int] = set()
|
||||
for t in tensor_args:
|
||||
|
||||
Reference in New Issue
Block a user