"""Minimal CCL example kernel for the docs/ccl-author-guide.md walkthrough.

Every PE loads its own tile from HBM and sends it in the E direction, then
receives one tile from the W direction and writes that tile over its own HBM
slice. It is meant as the smallest possible demonstration of ``tl.send`` /
``tl.recv``.
"""

from __future__ import annotations


def kernel_args(world_size: int, n_elem: int) -> tuple:
    """Build the positional kernel arguments for the ahbm backend.

    Args:
        world_size: Accepted for signature compatibility; not used here.
        n_elem: Number of elements in each PE's tile.

    Returns:
        A one-element tuple ``(n_elem,)``.
    """
    args = (n_elem,)
    return args


def kernel(t_ptr, n_elem, tl):
    """Send this PE's tile toward E, then overwrite it with the tile from W.

    Args:
        t_ptr: Base address of the tensor; each rank's slice starts at
            ``t_ptr + rank * n_elem * 2``.
        n_elem: Number of elements in one tile.
        tl: Object providing ``program_id``, ``num_programs``, ``load``,
            ``send``, ``recv`` and ``store``.
    """
    # The tile dtype is "f16", so each element is counted as 2 bytes.
    elem_bytes = 2
    tile_shape = (n_elem,)

    # Flatten (cube, PE-within-cube) into a single linear rank.
    pe_in_cube = tl.program_id(axis=0)
    cube = tl.program_id(axis=1)
    cube_width = tl.num_programs(axis=0)
    linear_rank = cube * cube_width + pe_in_cube

    # Byte offset of this rank's slice relative to the base pointer.
    tile_bytes = n_elem * elem_bytes
    slice_addr = t_ptr + linear_rank * tile_bytes

    # Outbound: read our own tile and push it in the E direction.
    outgoing = tl.load(slice_addr, shape=tile_shape, dtype="f16")
    tl.send(dir="E", src=outgoing)

    # Inbound: take the tile arriving from W and overwrite our own slice.
    incoming = tl.recv(dir="W", shape=tile_shape, dtype="f16")
    tl.store(slice_addr, incoming)