"""Single-PE GEMM benchmark via scheduler_v2 (pe_accel). Full host-to-PE pipeline: Host → PCIE_EP → IO_CPU → M_CPU → PE_CPU → SchedulerV2 → PE_DMA → HBM Single PE: num_sips=1, num_cubes=1, num_pes=1 via DPPolicy override. Both operands use tl.ref (HBM-resident); scheduler_v2 tiles and streams per-tile DMA internally. Run: kernbench run gemm_single_pe """ from kernbench.policy.placement.dp import DPPolicy # GEMM dimensions: (M, K) x (K, N) → (M, N) M, K, N = 32, 128, 32 DTYPE = "f16" def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"): """Single-PE GEMM: out = a @ b. Both operands streamed from HBM by scheduler.""" M, K, N = int(M), int(K), int(N) a = tl.ref(int(a_ptr), shape=(M, K), dtype=DTYPE) b = tl.ref(int(b_ptr), shape=(K, N), dtype=DTYPE) h = tl.composite(op="gemm", a=a, b=b, out_ptr=int(out_ptr)) tl.wait(h) def run(torch): """Run the single-PE GEMM benchmark.""" dp = DPPolicy(cube="replicate", pe="replicate", num_sips=1, num_cubes=1, num_pes=1) a = torch.empty((M, K), dtype=DTYPE, dp=dp, name="a") b = torch.empty((K, N), dtype=DTYPE, dp=dp, name="b") out = torch.empty((M, N), dtype=DTYPE, dp=dp, name="out") torch.launch("gemm_single_pe", _gemm_kernel, a, b, out, M, K, N)