Add SIP-level tensor parallelism, component registry YAML, VA offset verification

- DPPolicy: 3-level (sip/cube/pe), unified naming (column_wise/row_wise)
- PE_CPU: auto num_programs from cube shard count
- context.launch(): per-SIP KernelLaunchMsg with local va_base + auto local shape
- deploy_tensor: removed mmus param, MMU mapping is context-only responsibility
- ComponentRegistry: YAML-based lazy loading (components.yaml), impls→builtin rename
- VA offset bench + tests: 2D/1D, standard Triton kernel pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -59,10 +59,7 @@ def deploy_tensor(
|
||||
allocators: dict[int, PEMemAllocator],
|
||||
mem_kind: Literal["hbm", "tcm"] = "hbm",
|
||||
va_allocator=None,
|
||||
mmus: dict | None = None,
|
||||
) -> TensorHandle:
|
||||
from kernbench.policy.address.pe_mmu import PeMMU
|
||||
|
||||
isize = dtype_itemsize(dtype)
|
||||
total_nbytes = math.prod(shape) * isize
|
||||
|
||||
@@ -78,22 +75,15 @@ def deploy_tensor(
|
||||
pa = alloc.alloc_hbm(spec.nbytes)
|
||||
else:
|
||||
pa = alloc.alloc_tcm(spec.nbytes)
|
||||
encoded_pa = pa.encode()
|
||||
shards.append(TensorShard(
|
||||
sip=alloc._sip_id,
|
||||
cube=alloc._cube_id,
|
||||
pe=alloc._pe_id,
|
||||
pa=encoded_pa,
|
||||
pa=pa.encode(),
|
||||
nbytes=spec.nbytes,
|
||||
offset_bytes=spec.offset_bytes,
|
||||
))
|
||||
|
||||
# Register VA→PA mapping in all MMUs (broadcast)
|
||||
if va_base and mmus is not None:
|
||||
shard_va = va_base + spec.offset_bytes
|
||||
for mmu in mmus.values():
|
||||
mmu.map(va=shard_va, pa=encoded_pa, size=spec.nbytes)
|
||||
|
||||
return TensorHandle(
|
||||
name=name,
|
||||
shape=shape,
|
||||
|
||||
Reference in New Issue
Block a user