Implement ADR-0020: 2-pass data execution with greenlet kernel runner
Step 1 — Foundation: - OpRecord/OpLogger: op log infrastructure with t_start stable ordering - MemoryStore: numpy ndarray tensor-granular storage (reference semantics) - data_op=True flag on DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd, CompositeCmd - numpy/greenlet dependencies added to pyproject.toml Step 2 — ComponentBase hooks: - _on_process_start/end hooks in _forward_txn (fabric messages) - _handle_with_hooks in PeEngineBase (PE-internal commands) - op_logger optional — zero overhead when disabled Step 3 — KernelRunner + greenlet: - KernelRunner: greenlet ↔ SimPy bridge in triton_emu/kernel_runner.py - TLContext: _emit() method routes to greenlet switch or command list - tl.load() returns real numpy data in greenlet mode - Dynamic control flow supported (memory-read based branching) Step 4 — PE_CPU integration: - Greenlet mode when ctx.memory_store is set, legacy fallback otherwise - Refactored into _execute_greenlet/_execute_legacy/_send_response - ComponentContext gains memory_store and op_logger fields Step 5 — DataExecutor: - Phase 2 numpy execution for GEMM/Math ops from op_log - _compute_math: all unary/binary/reduction ops - verify(): compare MemoryStore against expected with dtype tolerance 28 new tests, 366 total passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -52,6 +52,7 @@ class TLContext:
|
||||
pe_id: int = 0,
|
||||
num_programs: int = 1,
|
||||
dispatch_cycles: int = 1,
|
||||
runner: Any = None,
|
||||
) -> None:
|
||||
self._pe_id = pe_id
|
||||
self._num_programs = num_programs
|
||||
@@ -59,6 +60,7 @@ class TLContext:
|
||||
self._commands: list[PeCommand] = []
|
||||
self._handle_counter = 0
|
||||
self._completion_counter = 0
|
||||
self._runner = runner # KernelRunner for greenlet mode (ADR-0020 D3)
|
||||
|
||||
@property
|
||||
def commands(self) -> list[PeCommand]:
|
||||
@@ -83,7 +85,7 @@ class TLContext:
|
||||
|
||||
def _emit_dispatch_overhead(self) -> None:
|
||||
if self._dispatch_cycles > 0:
|
||||
self._commands.append(PeCpuOverheadCmd(cycles=self._dispatch_cycles))
|
||||
self._emit(PeCpuOverheadCmd(cycles=self._dispatch_cycles))
|
||||
|
||||
def _make_handle(
|
||||
self, addr: int, shape: tuple[int, ...], dtype: str,
|
||||
@@ -108,23 +110,38 @@ class TLContext:
|
||||
|
||||
# ── Data Movement (blocking, DMA engine) ──────────────────────
|
||||
|
||||
def _emit(self, cmd: PeCommand) -> Any:
|
||||
"""Emit command: greenlet switch if runner available, else append to list."""
|
||||
if self._runner is not None:
|
||||
return self._runner.switch_to_simpy(cmd)
|
||||
self._commands.append(cmd)
|
||||
return None
|
||||
|
||||
def load(
|
||||
self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
|
||||
) -> TensorHandle:
|
||||
"""Load tensor from HBM to TCM. Returns TensorHandle."""
|
||||
"""Load tensor from HBM to TCM. Returns TensorHandle.
|
||||
|
||||
In greenlet mode: returns TensorHandle with actual numpy data.
|
||||
In command-list mode: returns TensorHandle with data=None.
|
||||
"""
|
||||
self._emit_dispatch_overhead()
|
||||
handle = self._make_handle(addr=ptr, shape=shape, dtype=dtype)
|
||||
self._commands.append(DmaReadCmd(
|
||||
handle=handle, src_addr=ptr, nbytes=handle.nbytes,
|
||||
))
|
||||
cmd = DmaReadCmd(handle=handle, src_addr=ptr, nbytes=handle.nbytes)
|
||||
data = self._emit(cmd)
|
||||
if data is not None:
|
||||
# Greenlet mode: attach real data to handle
|
||||
return TensorHandle(
|
||||
id=handle.id, addr=handle.addr, shape=handle.shape,
|
||||
dtype=handle.dtype, nbytes=handle.nbytes, data=data,
|
||||
)
|
||||
return handle
|
||||
|
||||
def store(self, ptr: int, handle: TensorHandle) -> None:
|
||||
"""Store tensor from TCM to HBM."""
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(DmaWriteCmd(
|
||||
handle=handle, dst_addr=ptr, nbytes=handle.nbytes,
|
||||
))
|
||||
cmd = DmaWriteCmd(handle=handle, dst_addr=ptr, nbytes=handle.nbytes)
|
||||
self._emit(cmd)
|
||||
|
||||
# ── GEMM Engine (blocking) ────────────────────────────────────
|
||||
|
||||
@@ -143,7 +160,7 @@ class TLContext:
|
||||
out_dtype = a.dtype
|
||||
out = self._make_handle(addr=0, shape=out_shape, dtype=out_dtype)
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
|
||||
self._emit(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
|
||||
return out
|
||||
|
||||
# ── MATH Engine: unary (blocking) ─────────────────────────────
|
||||
@@ -151,7 +168,7 @@ class TLContext:
|
||||
def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
|
||||
out = self._make_handle(addr=0, shape=x.shape, dtype=x.dtype)
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(MathCmd(op=op, inputs=(x,), out=out))
|
||||
self._emit(MathCmd(op=op, inputs=(x,), out=out))
|
||||
return out
|
||||
|
||||
def exp(self, x: TensorHandle) -> TensorHandle:
|
||||
@@ -184,7 +201,7 @@ class TLContext:
|
||||
out_shape[axis] = 1
|
||||
out = self._make_handle(addr=0, shape=tuple(out_shape), dtype=x.dtype)
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(MathCmd(op=op, inputs=(x,), out=out, axis=axis))
|
||||
self._emit(MathCmd(op=op, inputs=(x,), out=out, axis=axis))
|
||||
return out
|
||||
|
||||
def sum(self, x: TensorHandle, axis: int) -> TensorHandle:
|
||||
@@ -203,7 +220,7 @@ class TLContext:
|
||||
) -> TensorHandle:
|
||||
out = self._make_handle(addr=0, shape=a.shape, dtype=a.dtype)
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(MathCmd(op=op, inputs=(a, b), out=out))
|
||||
self._emit(MathCmd(op=op, inputs=(a, b), out=out))
|
||||
return out
|
||||
|
||||
def where(
|
||||
@@ -211,7 +228,7 @@ class TLContext:
|
||||
) -> TensorHandle:
|
||||
out = self._make_handle(addr=0, shape=a.shape, dtype=a.dtype)
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(MathCmd(op="where", inputs=(cond, a, b), out=out))
|
||||
self._emit(MathCmd(op="where", inputs=(cond, a, b), out=out))
|
||||
return out
|
||||
|
||||
# ── Index / Scalar (PE_CPU, no engine) ────────────────────────
|
||||
@@ -276,7 +293,7 @@ class TLContext:
|
||||
|
||||
completion = CompletionHandle(id=self._next_completion_id())
|
||||
self._emit_dispatch_overhead()
|
||||
self._commands.append(CompositeCmd(
|
||||
self._emit(CompositeCmd(
|
||||
completion=completion, op=op,
|
||||
a=a, b=b, out_addr=out_ptr, out_nbytes=out_nbytes,
|
||||
math_op=math_op,
|
||||
@@ -285,11 +302,11 @@ class TLContext:
|
||||
|
||||
def wait(self, handle: CompletionHandle | None = None) -> None:
|
||||
"""Wait for a specific composite or all pending composites."""
|
||||
self._commands.append(WaitCmd(handle=handle))
|
||||
self._emit(WaitCmd(handle=handle))
|
||||
|
||||
def cycles(self, n: int) -> None:
|
||||
"""Declare PE_CPU scalar execution overhead (cycles)."""
|
||||
self._commands.append(PeCpuOverheadCmd(cycles=n))
|
||||
self._emit(PeCpuOverheadCmd(cycles=n))
|
||||
|
||||
|
||||
# ── TensorHandle arithmetic operators ─────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user