# kernbench2/src/kernbench/runtime_api/kernel.py

from __future__ import annotations

from dataclasses import dataclass
from typing import Literal, TypeAlias


@dataclass(frozen=True)
class MemoryWriteMsg:
    """Request to write ``nbytes`` bytes into device memory.

    The destination is addressed by the ``dst_*`` fields. Per the KernelRef
    docstring in this module, this message is also the mechanism used to
    pre-deploy kernel binaries before a launch.

    Instances are immutable (``frozen=True``).
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    # Destination location: SIP / cube / PE indices and the PA to write at.
    dst_sip: int
    dst_cube: int
    dst_pe: int
    dst_pa: int
    # Number of bytes to write.
    nbytes: int
    # Where the written data comes from. NOTE(review): how "host_buffer_ref"
    # locates its buffer is not visible in this message — confirm.
    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
    # Pattern specification; presumably only consulted when
    # src_kind == "pattern" (format not defined here — confirm).
    pattern: str | None = None
    # Routing selectors; "all" is the default for both. Presumably they
    # choose which cubes / PEs receive the message — confirm.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Constant tag identifying this message class.
    msg_type: Literal["memory_write"] = "memory_write"


@dataclass(frozen=True)
class MemoryReadMsg:
    """Request to read ``nbytes`` bytes from device memory.

    The source is addressed by the ``src_*`` fields. Instances are immutable
    (``frozen=True``).
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    # Source location: SIP / cube / PE indices and the PA to read from.
    src_sip: int
    src_cube: int
    src_pe: int
    src_pa: int
    # Number of bytes to read.
    nbytes: int
    # Routing selectors; "all" is the default for both. Presumably they
    # choose which cubes / PEs receive the message — confirm.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Constant tag identifying this message class.
    msg_type: Literal["memory_read"] = "memory_read"


@dataclass(frozen=True)
class KernelRef:
    """Reference to a kernel binary or builtin timing model.

    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
    KernelLaunchMsg references the deployed location by PA — source code or IR
    MUST NOT be embedded in launch messages.

    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
    - "builtin":  simulator built-in timing model, identified by name.

    NOTE(review): nothing in this class enforces that ``deploy_pa`` is set
    when ``kind == "deployed"``; ``KernelRef(name, "deployed")`` is accepted
    with ``deploy_pa=None``. Consumers must check it themselves.
    """

    # Kernel name; for kind == "builtin" this is what identifies the model.
    name: str
    kind: Literal["deployed", "builtin"]
    # PA of the deployed binary; defaults to None.
    deploy_pa: int | None = None
    # SIP / cube / PE indices of the deployed binary, all defaulting to 0.
    # Presumably only meaningful for kind == "deployed" — confirm.
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0
    # Size of the kernel code in bytes; defaults to 0.
    nbytes_code: int = 0


@dataclass(frozen=True)
class TensorArgShard:
    """One shard of a tensor argument; TensorArg holds a tuple of these.

    Instances are immutable (``frozen=True``).
    """

    # Location of the shard: SIP / cube / PE indices and PA.
    sip: int
    cube: int
    pe: int
    pa: int
    # Size of the shard in bytes.
    nbytes: int
    # Byte offset of this shard; presumably relative to the start of the
    # whole tensor — TODO confirm against the producer of these shards.
    offset_bytes: int


@dataclass(frozen=True)
class TensorArg:
    """Tensor kernel argument described as a tuple of shards.

    One of the two variants of the KernelArg union; ``arg_kind`` is the
    literal tag that distinguishes it from ScalarArg.
    """

    # Shards making up the tensor. NOTE(review): ordering and coverage
    # requirements (e.g. sorted by offset, non-overlapping) are not stated
    # here — confirm with the consumer.
    shards: tuple[TensorArgShard, ...]
    # Constant tag for this variant.
    arg_kind: Literal["tensor"] = "tensor"
    va_base: int = 0  # VA base address for the entire tensor


@dataclass(frozen=True)
class ScalarArg:
    """Scalar kernel argument: a numeric value plus a dtype name.

    One of the two variants of the KernelArg union; ``arg_kind`` is the
    literal tag that distinguishes it from TensorArg.
    """

    # Dtype name as a string. NOTE(review): the accepted names are not
    # defined in this module — confirm with the consumer.
    dtype: str
    value: float | int
    # Constant tag for this variant.
    arg_kind: Literal["scalar"] = "scalar"


# Union of the argument variants carried in KernelLaunchMsg.args. Each
# variant has a literal ``arg_kind`` tag ("tensor" / "scalar") that
# distinguishes it from the other.
KernelArg: TypeAlias = TensorArg | ScalarArg


@dataclass(frozen=True)
class KernelLaunchMsg:
    """Request to launch the kernel named by ``kernel_ref`` with ``args``.

    The kernel is passed by reference only (see KernelRef); the message does
    not carry kernel source or IR. Instances are immutable (``frozen=True``).
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    kernel_ref: KernelRef
    # Kernel arguments (TensorArg / ScalarArg). Presumably positional, in
    # kernel-signature order — confirm.
    args: tuple[KernelArg, ...]
    # Routing selectors; "all" is the default for both.
    # NOTE(review): target_pe also accepts a tuple of PE indices here (as in
    # IpcqInitMsg), whereas the memory and MMU messages in this module only
    # accept a single int or "all".
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | tuple[int, ...] | Literal["all"] = "all"
    # ADR-0009 D5: synchronized kernel start. When set, each PE_CPU yields
    # until env.now >= target_start_ns before beginning kernel execution,
    # so every PE in a launch starts at the same simulated time regardless
    # of its M_CPU dispatch path length. Stamped by M_CPU fan-out.
    target_start_ns: float | None = None
    # Constant tag identifying this message class.
    msg_type: Literal["kernel_launch"] = "kernel_launch"


@dataclass(frozen=True)
class ResponseMsg:
    """Device→Host response carrying PE execution result.

    NOTE(review): unlike the request messages in this module, there is no
    ``src_sip`` field; the originating SIP cannot be read from this message
    alone.
    """

    # Same identifier pair as on the request messages; presumably copied
    # from the request being answered — TODO confirm.
    correlation_id: str
    request_id: str
    # Cube / PE indices of the responding PE.
    src_cube: int
    src_pe: int
    # Outcome flag; no error detail is carried beyond this boolean.
    success: bool
    # Constant tag identifying this message class.
    msg_type: Literal["response"] = "response"


@dataclass(frozen=True)
class PeDmaMsg:
    """Direct PE DMA request: host injects a transfer at PE_DMA level.

    Used by the probe utility to measure PE→HBM latency without requiring
    the full PE_CPU → scheduler → DMA pipeline.
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    # SIP / cube / PE indices of the PE whose DMA engine performs the
    # transfer.
    src_sip: int
    src_cube: int
    src_pe: int
    # PA targeted by the transfer. NOTE(review): the field is named dst_pa
    # even when is_write is False; presumably it is then the address being
    # read — confirm.
    dst_pa: int
    # Transfer size in bytes.
    nbytes: int
    # Transfer direction flag; defaults to False (presumably a read).
    is_write: bool = False
    # Constant tag identifying this message class.
    msg_type: Literal["pe_dma"] = "pe_dma"


@dataclass(frozen=True)
class MmuMapMsg:
    """MMU mapping install: broadcast VA→PA entries to target PEs.

    Sent via fabric: Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_MMU.
    target_sips controls which SIPs receive the message.

    NOTE(review): ``entries`` holds dicts, so ``frozen=True`` only prevents
    rebinding the fields; the entry dicts themselves remain mutable, and
    ``hash(msg)`` raises ``TypeError`` whenever ``entries`` is non-empty
    (the generated ``__hash__`` hashes every field).
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    entries: tuple[dict[str, int], ...]  # ({"va": int, "pa": int, "size": int}, ...)
    # Routing selectors; every one defaults to "all". target_cubes and
    # target_pe presumably narrow delivery the same way target_sips does —
    # confirm.
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Constant tag identifying this message class.
    msg_type: Literal["mmu_map"] = "mmu_map"


@dataclass(frozen=True)
class MmuUnmapMsg:
    """MMU mapping removal: broadcast VA ranges to unmap from target PEs.

    With the default selectors (all "all") the message addresses every PE;
    the ``target_*`` fields can narrow that.

    NOTE(review): ``entries`` holds dicts, so ``frozen=True`` only prevents
    rebinding the fields; the entry dicts themselves remain mutable, and
    ``hash(msg)`` raises ``TypeError`` whenever ``entries`` is non-empty
    (the generated ``__hash__`` hashes every field).
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    entries: tuple[dict[str, int], ...]  # ({"va": int, "size": int}, ...)
    # Routing selectors; every one defaults to "all". Presumably they
    # narrow delivery to the listed SIPs / cubes / PE — confirm.
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Constant tag identifying this message class.
    msg_type: Literal["mmu_unmap"] = "mmu_unmap"


@dataclass(frozen=True)
class IpcqInitMsg:
    """IPCQ neighbor table install (sideband fan-out, ADR-0023 D10/D12).

    Backend issues this at ``init_process_group`` time to install per-PE
    IPCQ neighbor tables. Each entry covers one direction (N/S/E/W) and
    carries the peer's IpcqEndpoint plus this PE's own rx_buffer base
    and a pre-wired SimPy Store for credit return fast path (D9).

    Routing is similar to MmuMapMsg.
    """

    # Identifier pair present on every message in this module, including
    # ResponseMsg. Presumably used to pair responses with requests —
    # TODO confirm against the host runtime.
    correlation_id: str
    request_id: str
    # Routing selectors; every one defaults to "all". target_pe accepts a
    # single index, a tuple of indices, or "all".
    target_sips: tuple[int, ...] | Literal["all"] = "all"
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | tuple[int, ...] | Literal["all"] = "all"
    # entries: tuple[IpcqInitEntry, ...] — kept as tuple of plain objects to
    # avoid a runtime import cycle (IpcqInitEntry lives in
    # kernbench.common.ipcq_types).
    entries: tuple = ()
    # The two settings below are closed sets, so they are typed as Literal
    # (annotation only; no runtime validation is performed here).
    backpressure_mode: Literal["poll", "sleep"] = "sleep"
    buffer_kind: Literal["tcm", "hbm", "sram"] = "tcm"
    # Size of a credit in bytes; defaults to 16. NOTE(review): the origin
    # of the value 16 is not documented here — confirm against ADR-0023.
    credit_size_bytes: int = 16
    # Constant tag identifying this message class.
    msg_type: Literal["ipcq_init"] = "ipcq_init"