commit - release 1

2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
@@ -0,0 +1,150 @@
+"""PE-internal command types and handles (ADR-0014).
+
+Generated by triton_emu (TLContext) and consumed by PE component
+implementations (PE_CPU, PE_SCHEDULER, PE_DMA, PE_GEMM, PE_MATH).
+
+Command lifecycle:
+  Triton kernel → TLContext → [PeCommand list] → PE_CPU → PE_SCHEDULER → engines
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Literal
+
+if TYPE_CHECKING:
+    import simpy
+
+
+# ── Handles ───────────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class TensorHandle:
+    """Opaque reference to a tensor residing in PE_TCM.
+
+    Returned by tl.load, tl.dot, tl.exp, etc.  Carries the metadata needed
+    for command generation.
+
+    Equality and hashing are defined by the metadata fields only.  ``data``
+    is reserved for a future validate mode (numpy array) and is excluded
+    from the generated ``__eq__`` / ``__hash__``: an array payload is
+    unhashable and has no single truth value under ``==``, so including it
+    would make this handle -- and every frozen command that embeds one --
+    raise as soon as it is hashed or compared with a payload attached.
+    """
+
+    id: str
+    pa: int                          # physical address in HBM/TCM
+    shape: tuple[int, ...]
+    dtype: str
+    nbytes: int                      # total byte size
+    # Reserved for validate mode; compare=False also removes it from the
+    # generated __hash__ (hash follows compare unless set explicitly).
+    data: object = field(default=None, compare=False)
+
+
+@dataclass(frozen=True)
+class CompletionHandle:
+    """Immutable token standing for one non-blocking composite command.
+
+    tl.composite returns it; tl.wait consumes it.
+    """
+
+    # Identifier string carried by the handle.
+    id: str
+
+
+# ── PE Commands ───────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class DmaReadCmd:
+    """DMA READ command: transfer from HBM into PE_TCM."""
+
+    # Handle of the tensor this transfer belongs to.
+    handle: TensorHandle
+    # Source physical address of the read.
+    src_pa: int
+    # Size of the transfer in bytes.
+    nbytes: int
+
+
+@dataclass(frozen=True)
+class DmaWriteCmd:
+    """DMA WRITE command: transfer from PE_TCM out to HBM."""
+
+    # Handle of the tensor this transfer belongs to.
+    handle: TensorHandle
+    # Destination physical address of the write.
+    dst_pa: int
+    # Size of the transfer in bytes.
+    nbytes: int
+
+
+@dataclass(frozen=True)
+class GemmCmd:
+    """Matrix-multiply command for the GEMM engine.
+
+    Computes ``out = a @ b``; every operand lives in TCM.
+    """
+
+    # Operands and result of ``out = a @ b``.
+    a: TensorHandle
+    b: TensorHandle
+    out: TensorHandle
+    # Problem dimensions (presumably a is m x k and b is k x n -- confirm
+    # against the GEMM engine implementation).
+    m: int
+    k: int
+    n: int
+
+
+@dataclass(frozen=True)
+class MathCmd:
+    """Command for the MATH engine, operating on data held in TCM.
+
+    Covers unary, binary and reduction operations.  Listed ``op`` names:
+
+      * "exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin"
+      * "add", "sub", "mul", "div", "where"
+      * "sum", "max", "min"
+    """
+
+    # Name of the operation to run (see the list in the docstring).
+    op: str
+    # Input operands, in order.
+    inputs: tuple[TensorHandle, ...]
+    # Handle of the result tensor.
+    out: TensorHandle
+    # Used by reductions; None otherwise.
+    axis: int | None = None
+
+
+@dataclass(frozen=True)
+class CompositeCmd:
+    """Tiled DMA_READ + COMPUTE + DMA_WRITE pipeline issued as one command.
+
+    The command is non-blocking: it is submitted to PE_SCHEDULER, which
+    handles tile splitting and the overlap of pipeline stages
+    (ADR-0014 D3.2).
+    """
+
+    # Completion handle associated with this composite.
+    completion: CompletionHandle
+    # Kind of compute stage to run.
+    op: Literal["gemm", "math"]
+    # Input operands; ``b`` may be None.
+    a: TensorHandle
+    b: TensorHandle | None
+    # Output address and byte size (going by the field names).
+    out_pa: int
+    out_nbytes: int
+    # Only relevant for op="math": selects the math operation.
+    math_op: str | None = None
+
+
+@dataclass(frozen=True)
+class WaitCmd:
+    """Wait on one specific composite, or on every pending composite."""
+
+    # Composite to wait for; leaving this as None means "wait for all".
+    handle: CompletionHandle | None = None
+
+
+@dataclass(frozen=True)
+class PeCpuOverheadCmd:
+    """Command representing scalar execution overhead on PE_CPU."""
+
+    # Amount of overhead, expressed in cycles.
+    cycles: int
+
+
+# Alias covering every PE command class: any one of these is a PeCommand.
+PeCommand = (
+    DmaReadCmd
+    | DmaWriteCmd
+    | GemmCmd
+    | MathCmd
+    | CompositeCmd
+    | WaitCmd
+    | PeCpuOverheadCmd
+)
+
+
+@dataclass
+class PeInternalTxn:
+    """Message passed along the PE_CPU → PE_SCHEDULER → engines path.
+
+    Each transaction wraps exactly one PeCommand together with the event
+    used to report its completion.  During the replay phase PE_CPU builds
+    one PeInternalTxn per command and sends it to PE_SCHEDULER; the
+    scheduler routes it to the appropriate engine (PE_DMA, PE_GEMM or
+    PE_MATH), and that engine signals ``done`` when it has finished.
+    """
+
+    # The single command carried by this transaction.
+    command: PeCommand
+    # Succeeds once the engine has completed the command.
+    done: simpy.Event
+    # PE instance path, e.g. "sip0.cube0.pe0"; PE_DMA needs it for path
+    # resolution.
+    pe_prefix: str = ""
+    # Per-transaction dict; every instance starts with its own empty one.
+    result_data: dict[str, Any] = field(default_factory=dict)